Friday, June 8, 2018

Python - Start scraping - 4. Connect to an HTTPS website

import http.client
from bs4 import BeautifulSoup
import re
import io
import zipfile
import csv

def find_between(s, first, last):
    """Return the part of ``s`` found between ``first`` and ``last``.

    ``first`` is located from the start of ``s``; ``last`` is then searched
    for only after the end of that ``first`` occurrence.

    Args:
        s: String to search.
        first: Marker that precedes the wanted substring.
        last: Marker that follows the wanted substring.

    Returns:
        The text strictly between the two markers, or ``""`` when either
        marker cannot be found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        # str.index raises ValueError when a marker is missing.
        return ""
    return s[start:end]


# Server and index-page path; the values are placeholders as published.
host = 'XXXX'
path_orig = '/XXXX/XXXX/XXXX'
conn = http.client.HTTPSConnection(host)

# Request the index page that lists the downloadable files.
conn.request('GET', path_orig)
r = conn.getresponse()

# BeautifulSoup is handed the response object itself and parses its body.
# NOTE(review): http.client needs the previous response fully read before
# the next request on the same connection - presumably the parse consumes
# it; confirm.
soup = BeautifulSoup(r, 'html.parser')
links = soup.find_all('a')

path = None
file_name = None
for link in links:
    prod_link = link.get('href')

    # str() guards against anchors without an href (get() returns None).
    if re.search(r'pub/Database/Production', str(prod_link)):
        # No break: the last matching link on the page wins.
        # NOTE(review): presumably the last link is the latest file - confirm.
        path = prod_link
        # The archive holds a CSV of the same base name: keep the text up to
        # the "zip" extension and append "csv" instead.
        file_name = find_between(prod_link, '/pub/Database/', 'zip') + 'csv'

if path is None:
    raise RuntimeError('no pub/Database/Production link found on ' + path_orig)

# Request the archive itself over the same connection.
conn.request('GET', path)
r = conn.getresponse()
 
# Read the downloaded zip archive from memory and extract every member into
# the current working directory.
with zipfile.ZipFile(io.BytesIO(r.read()), "r") as zf:
    zf.extractall()
    
# Source CSV column -> output key. Applied in this order, so the renamed keys
# end up in this order at the end of each row.
FIELD_RENAMES = (
    ('ReportPeriod', 'production_date'),
    ('Operator', 'operator_current'),
    ('API', 'api'),
    ('WellBore', 'wellbore'),
    ('FormationName', 'reservoir'),
    ('WellType', 'well_type'),
    ('DaysProd', 'production_days'),
    ('Oil', 'oil'),
    ('Gas', 'gas'),
    ('Water', 'water'),
)

# Source CSV columns that are discarded from every row.
DROPPED_FIELDS = ('IsAmended', 'Oper_No', 'Old_Oper_No', 'Entity', 'WellStatus')

# newline='' is what the csv module documentation asks for when opening
# files passed to a reader.
with open(file_name, newline='') as f:
    reader = csv.DictReader(f)
    # DictReader takes the first line as the header, so this discards the
    # first row after it. The default keeps a header-only file from raising
    # StopIteration.
    # NOTE(review): confirm that the skipped row is not real data.
    next(reader, None)
    data = list(reader)

# Rename the columns in place and drop the ones that are not needed.
for row in data:
    for old_key, new_key in FIELD_RENAMES:
        row[new_key] = row.pop(old_key)
    # Only the formation name has surrounding whitespace removed.
    row['reservoir'] = row['reservoir'].strip()
    for key in DROPPED_FIELDS:
        row.pop(key)
    
   # print(data)

No comments:

Post a Comment