"""Download a production-database zip over HTTPS and load its CSV rows.

Run as a script, this module:

1. fetches the index page at ``path_orig`` on ``host``;
2. picks the link whose href contains ``pub/Database/Production``;
3. downloads that zip archive and extracts it into the current directory;
4. reads the extracted CSV into a list of dicts with renamed columns.
"""
import csv
import http.client
import io
import re
import zipfile

host = 'XXXX'
path_orig = '/XXXX/XXXX/XXXX'

# Source CSV column -> output key, in the order the output keys are added.
_RENAMED_COLUMNS = {
    'ReportPeriod': 'production_date',
    'Operator': 'operator_current',
    'API': 'api',
    'WellBore': 'wellbore',
    'FormationName': 'reservoir',
    'WellType': 'well_type',
    'DaysProd': 'production_days',
    'Oil': 'oil',
    'Gas': 'gas',
    'Water': 'water',
}

# Source CSV columns that are discarded from every row.
_DROPPED_COLUMNS = ('IsAmended', 'Oper_No', 'Old_Oper_No', 'Entity', 'WellStatus')


def find_between(s, first, last):
    """Return the part of *s* between *first* and the next *last*.

    The search for *last* starts right after the first occurrence of
    *first*. Returns ``""`` when either delimiter is not found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        return ""
    return s[start:end]


def csv_name_for(path):
    """Derive the extracted CSV file name from the archive's URL path.

    Takes the text between ``/pub/Database/`` and ``zip`` (which keeps the
    trailing dot) and appends ``csv``.
    """
    return find_between(path, '/pub/Database/', 'zip') + 'csv'


def latest_production_path(html):
    """Return the href of the last ``<a>`` matching the production archive.

    When several links match, the last one on the page wins.
    NOTE(review): presumably the page lists the newest archive last - confirm.

    Raises:
        LookupError: if no link on the page matches.
    """
    # Imported here so the pure helpers in this module stay importable
    # (and testable) on machines without Beautiful Soup installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')
    path = None
    for link in soup.find_all('a'):
        href = link.get('href')
        # str() keeps anchors without an href (None) from raising in re.search.
        if re.search(r'pub/Database/Production', str(href)):
            path = href
    if path is None:
        raise LookupError(
            "no link containing 'pub/Database/Production' found on the index page"
        )
    return path


def normalize_row(row):
    """Rename and drop columns of one CSV row in place, and return it.

    Columns listed in ``_RENAMED_COLUMNS`` are required and moved to their
    new key; the ``reservoir`` value is stripped of surrounding whitespace.
    Columns listed in ``_DROPPED_COLUMNS`` are removed when present. Any
    other column is left untouched.

    Raises:
        KeyError: if a column that must be renamed is missing.
    """
    for old_key, new_key in _RENAMED_COLUMNS.items():
        row[new_key] = row.pop(old_key)

    # csv.DictReader fills the missing fields of a short row with None,
    # so only strip when there is actually a value.
    reservoir = row['reservoir']
    if reservoir is not None:
        row['reservoir'] = reservoir.strip()

    for key in _DROPPED_COLUMNS:
        row.pop(key, None)
    return row


def parse_rows(lines):
    """Parse CSV *lines* (header first) into a list of normalized dicts.

    *lines* is any iterable of strings, e.g. an open text file.
    """
    reader = csv.DictReader(lines)
    # NOTE(review): DictReader already consumes the header line, so this
    # discards the first *data* record. Kept because the original script did
    # so - confirm the file really carries a second non-data line. The
    # default avoids StopIteration on a file with no data rows.
    next(reader, None)
    return [normalize_row(row) for row in reader]


def load_rows(file_name):
    """Read the CSV file *file_name* and return its normalized rows."""
    # newline='' lets the csv module handle newlines inside quoted fields.
    with open(file_name, newline='') as f:
        return parse_rows(f)


def fetch(conn, path):
    """GET *path* on the open connection *conn* and return the body bytes.

    The body is read in full before returning so the connection can be
    reused for the next request.

    Raises:
        RuntimeError: if the response status is not 200.
    """
    conn.request('GET', path)
    response = conn.getresponse()
    body = response.read()
    if response.status != 200:
        raise RuntimeError(
            f"GET {path} failed: {response.status} {response.reason}"
        )
    return body


def main():
    """Download and extract the archive, then return the parsed rows."""
    conn = http.client.HTTPSConnection(host)
    try:
        # Get the latest file path from the index page, then the archive.
        path = latest_production_path(fetch(conn, path_orig))
        archive = fetch(conn, path)
    finally:
        conn.close()

    # Read the zip file from memory and extract it into the working directory.
    with zipfile.ZipFile(io.BytesIO(archive), "r") as zf:
        zf.extractall()

    return load_rows(csv_name_for(path))


if __name__ == "__main__":
    data = main()
    # print(data)
Friday, June 8, 2018
Python - Start scraping - 4. Connect to HTTPS website
Labels:
Python
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment