Friday, June 8, 2018

Python - Paying attention when you want to copy/clone a list

If you do b = a,

you didn't copy the list referenced by a. You just created a new name and attached it to the same list that a points to, like in the picture below:
a and b reference the same list
If you modify the list through a, you also see the change through b, since both names point to the same list.

Now we want to copy the list referenced by a. We need to create a new list and attach b to it.
a and b reference different lists

# Either line makes a shallow copy: a new list object holding the same
# elements, so mutating the list through b no longer affects a.
b = a[:]
b = a.copy()

Python - Start scraping - 7. Extract data from a DBF file without downloading the file

..........

# Open the zipped shapefile entirely in memory; `body` is presumably the raw
# HTTP response bytes from the omitted code above — TODO confirm.
# NOTE(review): this variable shadows the stdlib `zipfile` module name.
zipfile = ZipFile(io.BytesIO(body))

# Keep only the four shapefile members; sorted() puts them in dbf/prj/shp/shx
# order (alphabetical by extension), matching the unpacking on the next line.
# NOTE(review): assumes the archive holds exactly one file per extension.
filenames = [y for y in sorted(zipfile.namelist()) for ending in ['dbf', 'prj', 'shp', 'shx'] if y.endswith(ending)]
dbf, prj, shp, shx = [io.BytesIO(zipfile.read(filename)) for filename in filenames]

# Read the shapefile from the in-memory buffers (pyshp Reader).
rshp = shapefile.Reader(shp=shp, shx=shx, dbf=dbf)
fields = rshp.fields[1:]  # skip the DBF deletion-flag field
field_names = [field[0] for field in fields]

data_1 = []
data_2 = []
# Pair each record's values with the field names; the loop body continues
# past the end of this excerpt.
for r in rshp.shapeRecords():
    r = dict(zip(field_names, r.record))
............

Python - Start scraping - 6. pandas unpivot data

from ftplib import FTP
import zipfile
import csv
import pandas as pd

# Download the ZIP archive from the FTP server.
filename = 'XXXX.zip'
# FTP as a context manager sends QUIT and closes the connection on exit;
# the nested `with` guarantees the local file handle is closed even if
# the transfer raises (the original leaked both on error).
with FTP('ftp.XXXX.com') as ftp:
    ftp.login()          # anonymous login
    ftp.cwd("/XXXX")
    with open(filename, "wb") as gFile:
        ftp.retrbinary('RETR XXXX.zip', gFile.write)

# Unzip the downloaded archive into the current directory.
# (The original line had this comment fused onto the `with` statement,
# which made the snippet a SyntaxError.)
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

# Read the extracted CSV, rename the vendor's column names to canonical
# field names, then unpivot the twelve month columns into one record per
# production month.
with open("XXXX.csv") as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader already consumes the first line as the
    # header, so this next() discards the first data row — confirm the
    # file really carries a second header line before keeping it.
    next(reader)
    data = list(reader)

    # vendor column -> canonical name; dict.pop removes the old key and
    # re-inserts the value under the new one. Order matters: it fixes the
    # DataFrame column order (and thus the melted row order) below.
    renames = {
        'API_14': 'api',
        'Operator_Number': 'operator_code',
        'Operator_Name': 'operator',
        'Well_Name': 'well_name',
        'Well_Number': 'well_number',
        'API_County': 'county_code',
        'Twp': 'township',
        'Rng': 'range',
        'Sec': 'section',
        'PM': 'meridian',
        'Lat_Y': 'surface_latitude',
        'Long_X': 'surface_longitude',
        'OTC_County': 'otc_county',
        'OTC_PUN': 'otc_pun',
        'Formation_Code': 'reservoir_code',
        'Formation_Name': 'reservoir',
        'Reporting_Year': 'production_year',
        'January_Manuf': '01-01',
        'February_Manuf': '02-01',
        'March_Manuf': '03-01',
        'April_Manuf': '04-01',
        'May_Manuf': '05-01',
        'June_Manuf': '06-01',
        'July_Manuf': '07-01',
        'August_Manuf': '08-01',
        'September_Manuf': '09-01',
        'October_Manuf': '10-01',
        'November_Manuf': '11-01',
        'December_Manuf': '12-01',
    }
    # columns we do not carry forward
    drops = ('API_Base', 'Reporter_Number', 'Reporter_Name',
             'Q4', 'Q3', 'Q2', 'Q1')

    for row in data:
        for old, new in renames.items():
            row[new] = row.pop(old)
        for key in drops:
            row.pop(key)

    # identifier columns kept fixed while the month columns are melted
    id_cols = ['api', 'operator_code', 'operator', 'well_name',
               'well_number', 'county_code', 'township', 'range',
               'section', 'meridian', 'surface_latitude',
               'surface_longitude', 'otc_county', 'otc_pun',
               'reservoir_code', 'reservoir', 'production_year']

    for row in data:
        # one-row frame so melt can turn the 12 month columns into 12 rows
        df = pd.DataFrame(dict(row), index=[0])
        df_unpivot = pd.melt(df, id_vars=id_cols,
                             var_name='production_month', value_name='gas')
        # records for this well only; printed per iteration as in the original
        data_prod = df_unpivot.to_dict(orient='records')

        # enrich each record and collapse year + month into a date string
        for rec in data_prod:
            rec['gas_unit'] = 'Mcf'
            rec['production_date'] = rec['production_year'] + '-' + rec['production_month']
            rec['surface_datum'] = 'NAD 83'
            rec['comment'] = 'assumed surface datum'
            rec.pop('production_year')
            rec.pop('production_month')
        print(data_prod)

Python - Start scraping - 5. connect to an FTP site

from ftplib import FTP
import zipfile
import csv

# Download the ZIP archive from the FTP server.
# NOTE(review): the local name is 'XXX.ZIP' but the RETR asks for
# 'XXX.zip' — confirm which case the server actually uses.
filename = 'XXX.ZIP'
# FTP as a context manager sends QUIT and closes the connection on exit;
# the nested `with` guarantees the local file handle is closed even if
# the transfer raises (the original leaked both on error).
with FTP('ftp.XXXX.com') as ftp:
    ftp.login()          # anonymous login
    ftp.cwd("/XXX")
    with open(filename, "wb") as gFile:
        ftp.retrbinary('RETR XXX.zip', gFile.write)

# Unzip the downloaded archive into the current directory.
# (The original `with` statement was swallowed into the comment line,
# leaving the indented body as an IndentationError.)
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()
    
# Read the well-header CSV and rename vendor columns to canonical names.
with open("XXX.csv") as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader already consumed the header line, so this
    # next() drops the first data row — confirm an extra header exists.
    next(reader)
    data = list(reader)

    # vendor column -> canonical name; dict.pop removes the old key and
    # re-inserts the value under the new one, in this order.
    renames = {
        'API_Number': 'api',
        'Well_Name': 'well_name',
        'Well_Number': 'well_number',
        'Operator': 'operator_code',
        'Operator_Name': 'operator',
        'Well_Status': 'well_status',
        'Well_Type': 'well_type',
        'County': 'county_code',
        'Loc_Type': 'location_datum',
        'Twp': 'township',
        'Range': 'range',
        'Sec': 'section',
        'PM': 'principle_meridian',
        'Lat_Y': 'latitude',
        'Long_X': 'longitude',
        'Measured_TotalDepth': 'measured_depth',
        'TrueVertical_Depth': 'vertical_depth',
        'Spud': 'spud_date',
    }
    for row in data:
        for old, new in renames.items():
            row[new] = row.pop(old)

Python - Start scraping - 4. connect to an HTTPS website

import http.client
from bs4 import BeautifulSoup
import re
import io
import zipfile
import csv

def find_between(s, first, last):
    """Return the substring of *s* between the first occurrence of
    *first* and the next occurrence of *last* after it.

    Returns "" when either delimiter is missing.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        # one of the delimiters was not found in s
        return ""

# target host and the directory-listing path to scrape
host = 'XXXX'
path_orig = '/XXXX/XXXX/XXXX'
conn = http.client.HTTPSConnection(host)

# Fetch the directory-listing page to find the latest production archive.
conn.request('GET', path_orig)
r = conn.getresponse()

# BeautifulSoup reads the file-like HTTPResponse directly.
soup = BeautifulSoup(r, 'html.parser')
links = soup.findAll('a')

for link in links:
    prod_link = link.get('href')

    # keep the last link whose href mentions the production database
    if re.search(r'pub/Database/Production', str(prod_link)):
        path = prod_link
        # derive the CSV name inside the ZIP from the link path
        file_name = find_between(prod_link, '/pub/Database/', 'zip') + 'csv'

# Download the matched archive.
# NOTE(review): `path` is unbound if no link matched — this raises
# NameError in that case; confirm whether a clearer error is wanted.
conn.request('GET', path)
r = conn.getresponse()
 
# Extract the downloaded ZIP (held fully in memory) into the current dir.
with zipfile.ZipFile(io.BytesIO(r.read()), "r") as zf:
    zf.extractall()

# Read the production CSV and rename vendor columns to canonical names.
with open(file_name) as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader already consumed the header row; this
    # next() discards the first data record — confirm the file really
    # has a second header line.
    next(reader)
    data = list(reader)

    for row in data:
        row['production_date'] = row.pop('ReportPeriod')
        row['operator_current'] = row.pop('Operator')
        row['api'] = row.pop('API')
        row['wellbore'] = row.pop('WellBore')
        # FormationName arrives padded with whitespace
        row['reservoir'] = row.pop('FormationName').strip()
        row['well_type'] = row.pop('WellType')
        row['production_days'] = row.pop('DaysProd')
        row['oil'] = row.pop('Oil')
        row['gas'] = row.pop('Gas')
        row['water'] = row.pop('Water')
        # columns we do not carry forward
        for key in ('IsAmended', 'Oper_No', 'Old_Oper_No', 'Entity', 'WellStatus'):
            row.pop(key)