Friday, June 8, 2018

Python -Start scraping - 6. pandas unpivot data

from ftplib import FTP
import zipfile
import csv
import pandas as pd

filename = 'XXXX.zip'
ftp = FTP('ftp.XXXX.com')
ftp.login()
ftp.cwd("/XXXX")
gFile = open(filename, "wb")
ftp.retrbinary('RETR XXXX.zip', gFile.write)
gFile.close()
ftp.quit()

# Unzip filewith zipfile.ZipFile(filename,"r") as zip_ref:
    zip_ref.extractall()

# read csv filewith open("XXXX.csv") as f:
    reader = csv.DictReader(f)
    next(reader)
    data = []
    for r in reader:
        data.append(r)
    for i in range(0,len(data)):
        data[i]['api'] = data[i].pop('API_14')
        data[i]['operator_code'] = data[i].pop('Operator_Number')
        data[i]['operator'] = data[i].pop('Operator_Name')
        data[i]['well_name'] = data[i].pop('Well_Name')
        data[i]['well_number'] = data[i].pop('Well_Number')
        data[i]['county_code'] = data[i].pop('API_County')
        data[i]['township'] = data[i].pop('Twp')
        data[i]['range'] = data[i].pop('Rng')
        data[i]['section'] = data[i].pop('Sec')
        data[i]['meridian'] = data[i].pop('PM')
        data[i]['surface_latitude'] = data[i].pop('Lat_Y')
        data[i]['surface_longitude'] = data[i].pop('Long_X')
        data[i]['otc_county'] = data[i].pop('OTC_County')
        data[i]['otc_pun'] = data[i].pop('OTC_PUN')
        data[i]['reservoir_code'] = data[i].pop('Formation_Code')
        data[i]['reservoir'] = data[i].pop('Formation_Name')
        data[i]['production_year'] = data[i].pop('Reporting_Year')
        data[i]['01-01'] = data[i].pop('January_Manuf')
        data[i]['02-01'] =  data[i].pop('February_Manuf')
        data[i]['03-01'] = data[i].pop('March_Manuf')
        data[i]['04-01'] = data[i].pop('April_Manuf')
        data[i]['05-01'] = data[i].pop('May_Manuf')
        data[i]['06-01'] = data[i].pop('June_Manuf')
        data[i]['07-01'] = data[i].pop('July_Manuf')
        data[i]['08-01'] = data[i].pop('August_Manuf')
        data[i]['09-01'] =  data[i].pop('September_Manuf')
        data[i]['10-01'] = data[i].pop('October_Manuf')
        data[i]['11-01'] = data[i].pop('November_Manuf')
        data[i]['12-01'] = data[i].pop('December_Manuf')
        data[i].pop('API_Base')
        data[i].pop('Reporter_Number')
        data[i].pop('Reporter_Name')
        data[i].pop('Q4')
        data[i].pop('Q3')
        data[i].pop('Q2')
        data[i].pop('Q1')

    data_prod = []

    for i in range(0,len(data)):
        # create dataframe        df = pd.DataFrame(dict(data[i]),index = [0])

        df_unpivot = pd.melt(df, id_vars=['api', 'operator_code','operator', 'well_name',
                             'well_number','county_code', 'township', 'range',
                             'section', 'meridian','surface_latitude',
                             'surface_longitude','otc_county', 'otc_pun','reservoir_code',
                             'reservoir', 'production_year'],
                             var_name='production_month', value_name='gas')
        # upivot dataframe to list        data_prod = df_unpivot.to_dict(orient='records')

        # insert key and value to list element(dict)        for i in range(0,len(data_prod)):
            data_prod[i]['gas_unit'] = 'Mcf'            data_prod[i]['production_date'] = data_prod[i]['production_year']+'-' + data_prod[i]['production_month']
            data_prod[i]['surface_datum'] = 'NAD 83'            data_prod[i]['comment'] = 'assumed surface datum'

            data_prod[i].pop('production_year')
            data_prod[i].pop('production_month')
        print(data_prod)

No comments:

Post a Comment