Monday, July 16, 2018

Python - sorted(list) vs. list.sort()

sorted() returns a new sorted list, leaving the original list unaffected. list.sort() sorts the list in place, mutating the original list, and returns None (as all in-place operations conventionally do).
sorted() works on any iterable, not just lists. Strings, tuples, dictionaries (you'll get the keys), generators, etc., returning a list containing all elements, sorted.
  • Use list.sort() when you want to mutate the list, sorted() when you want a new sorted object back. Use sorted() when you want to sort something that is an iterable, not a list yet.
  • For lists, list.sort() is faster than sorted() because it doesn't have to create a copy. For any other iterable, you have no choice.
  • You cannot retrieve the original positions. Once you've called list.sort(), the original order is gone.

Python - some useful small functions

# Return the substring of s lying between the first occurrence of `first`
# and the next occurrence of `last`; return "" if either marker is absent.
def find_between(s, first, last):
    begin = s.find(first)
    if begin == -1:
        return ""
    begin += len(first)
    close = s.find(last, begin)
    if close == -1:
        return ""
    return s[begin:close]

# Yield the index of every occurrence of character ch in str.
def find(str, ch):
    # NOTE(review): parameter name shadows the builtin `str`; kept as-is so
    # the call signature stays unchanged.
    yield from (pos for pos, letter in enumerate(str) if letter == ch)


# reformat epoch_time to YYYY-mm-dd
def format_date(data):
    """Extract an epoch timestamp from a string like ``/Date(1234567890)/``
    and format it as ``YYYY-mm-dd``.

    The number between the parentheses is treated as *seconds* when it falls
    in a plausible range (1930-01-01 .. 2038-01-01) and as *milliseconds*
    otherwise — same decision the original four-way branch made.
    Returns '' for None / 'null' / no parenthesised value.
    """
    # `is None` instead of `== None`; 'null' is the literal string some
    # feeds emit for missing dates.
    if data is None or data == 'null' or find_between(data, "(", ")") == '':
        return ''
    epoch_time = int(find_between(data, "(", ")"))
    # -1262304000 = 1930-01-01, 2145916800 = 2038-01-01 (both in seconds).
    if -1262304000 <= epoch_time <= 2145916800:
        # Plausible seconds value.  Equivalent to utcfromtimestamp() but
        # also valid for negative (pre-1970) timestamps on all platforms.
        stamp = datetime(1970, 1, 1) + timedelta(seconds=epoch_time)
    else:
        # Out of range for seconds: assume the feed gave milliseconds.
        stamp = datetime(1970, 1, 1) + timedelta(milliseconds=epoch_time)
    return f"{stamp:%Y-%m-%d}"


Friday, June 8, 2018

Python - Paying attention when you want to copy/clone a list

if you do b = a,

you didn’t copy the list referenced by a. You just created a new tag and attached it to the list pointed by a. Like in the picture below:
a and b reference the same list
If you modify a, you also modify b, since they point to the same list.

Now we want to copy the list referenced by a. We need to create a new list to attach b to it.
a and b reference different lists

b = a[:]
b = a.copy()

Python - Start scraping - 7. extract data from a DBF file without downloading the file

..........

# Open the downloaded zip archive straight from the in-memory response body
# (no temp file on disk).
# NOTE(review): this local name shadows the `zipfile` module; `body`,
# `ZipFile` and `shapefile` come from outside this excerpt.
zipfile = ZipFile(io.BytesIO(body))

# Pick out the four shapefile components.  sorted() puts same-named members
# in alphabetical extension order (dbf < prj < shp < shx), which is exactly
# the unpacking order on the next line — presumably why it is sorted; verify
# if the archive ever holds more than one shapefile.
filenames = [y for y in sorted(zipfile.namelist()) for ending in ['dbf', 'prj', 'shp', 'shx'] if y.endswith(ending)]
dbf, prj, shp, shx = [io.BytesIO(zipfile.read(filename)) for filename in filenames]

# Read the shapefile entirely from the in-memory buffers.
rshp = shapefile.Reader(shp=shp, shx=shx, dbf=dbf)
fields = rshp.fields[1:]  # drop the first pseudo-field (DeletionFlag)
field_names = [field[0] for field in fields]

data_1 = []
data_2 = []
for r in rshp.shapeRecords():
    # rebind r to a {field name: value} dict; loop body continues past this excerpt
    r = dict(zip(field_names, r.record))
............

Python - Start scraping - 6. pandas unpivot data

from ftplib import FTP
import zipfile
import csv
import pandas as pd

# Download the zipped production file from the FTP server.
filename = 'XXXX.zip'
ftp = FTP('ftp.XXXX.com')
ftp.login()
ftp.cwd("/XXXX")
# `with` guarantees the local file is closed even if the transfer fails.
with open(filename, "wb") as gFile:
    ftp.retrbinary('RETR XXXX.zip', gFile.write)
ftp.quit()

# Unzip file
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

# Raw CSV header -> canonical column name.  Order matters: it becomes the
# column order of the per-row DataFrame built below, so the twelve month
# columns stay in calendar order for pd.melt.
RENAME = {
    'API_14': 'api',
    'Operator_Number': 'operator_code',
    'Operator_Name': 'operator',
    'Well_Name': 'well_name',
    'Well_Number': 'well_number',
    'API_County': 'county_code',
    'Twp': 'township',
    'Rng': 'range',
    'Sec': 'section',
    'PM': 'meridian',
    'Lat_Y': 'surface_latitude',
    'Long_X': 'surface_longitude',
    'OTC_County': 'otc_county',
    'OTC_PUN': 'otc_pun',
    'Formation_Code': 'reservoir_code',
    'Formation_Name': 'reservoir',
    'Reporting_Year': 'production_year',
    'January_Manuf': '01-01',
    'February_Manuf': '02-01',
    'March_Manuf': '03-01',
    'April_Manuf': '04-01',
    'May_Manuf': '05-01',
    'June_Manuf': '06-01',
    'July_Manuf': '07-01',
    'August_Manuf': '08-01',
    'September_Manuf': '09-01',
    'October_Manuf': '10-01',
    'November_Manuf': '11-01',
    'December_Manuf': '12-01',
}
# Columns not carried forward.
DROP = ('API_Base', 'Reporter_Number', 'Reporter_Name', 'Q4', 'Q3', 'Q2', 'Q1')

ID_VARS = ['api', 'operator_code', 'operator', 'well_name',
           'well_number', 'county_code', 'township', 'range',
           'section', 'meridian', 'surface_latitude',
           'surface_longitude', 'otc_county', 'otc_pun', 'reservoir_code',
           'reservoir', 'production_year']

# read csv file
with open("XXXX.csv") as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader has already consumed the header, so this
    # skips the first *data* row — confirm that is intentional.
    next(reader)
    data = list(reader)

    for row in data:
        for old, new in RENAME.items():
            row[new] = row.pop(old)
        for old in DROP:
            row.pop(old)

    data_prod = []

    for row in data:
        # create a one-row dataframe so melt can unpivot the month columns
        df = pd.DataFrame(dict(row), index=[0])

        df_unpivot = pd.melt(df, id_vars=ID_VARS,
                             var_name='production_month', value_name='gas')
        # unpivot dataframe to list
        # NOTE(review): data_prod is overwritten on every pass, so only the
        # current well's records are live here — extend() may have been meant.
        data_prod = df_unpivot.to_dict(orient='records')

        # insert key and value into each list element (dict)
        for rec in data_prod:
            rec['gas_unit'] = 'Mcf'
            rec['production_date'] = rec['production_year'] + '-' + rec['production_month']
            rec['surface_datum'] = 'NAD 83'
            rec['comment'] = 'assumed surface datum'

            rec.pop('production_year')
            rec.pop('production_month')
        print(data_prod)

Python - Start scraping - 5. connect to an FTP website

from ftplib import FTP
import zipfile
import csv

# Download the zipped well-header file from the FTP server.
filename = 'XXX.ZIP'
ftp = FTP('ftp.XXXX.com')
ftp.login()
ftp.cwd("/XXX")
# `with` guarantees the local file is closed even if the transfer fails.
with open(filename, "wb") as gFile:
    ftp.retrbinary('RETR XXX.zip', gFile.write)
ftp.quit()

# Unzip file
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

# Raw CSV header -> canonical column name (order preserved from the
# original rename sequence).
WELL_RENAME = {
    'API_Number': 'api',
    'Well_Name': 'well_name',
    'Well_Number': 'well_number',
    'Operator': 'operator_code',
    'Operator_Name': 'operator',
    'Well_Status': 'well_status',
    'Well_Type': 'well_type',
    'County': 'county_code',
    'Loc_Type': 'location_datum',
    'Twp': 'township',
    'Range': 'range',
    'Sec': 'section',
    'PM': 'principle_meridian',
    'Lat_Y': 'latitude',
    'Long_X': 'longitude',
    'Measured_TotalDepth': 'measured_depth',
    'TrueVertical_Depth': 'vertical_depth',
    'Spud': 'spud_date',
}

# read csv file
with open("XXX.csv") as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader has already consumed the header, so this
    # skips the first *data* row — confirm that is intentional.
    next(reader)
    data = list(reader)
    for row in data:
        for old, new in WELL_RENAME.items():
            row[new] = row.pop(old)

Python - Start scraping - 4. connect to an HTTPS website

import http.client
from bs4 import BeautifulSoup
import re
import io
import zipfile
import csv

# Find substring within a string: return the text between the first
# occurrence of `first` and the following occurrence of `last` ('' if absent).
def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""

host = 'XXXX'
path_orig = '/XXXX/XXXX/XXXX'
conn = http.client.HTTPSConnection(host)

# get the latest file path from the directory listing page
conn.request('GET', path_orig)
r = conn.getresponse()

soup = BeautifulSoup(r, 'html.parser')
links = soup.findAll('a')

# Keep the LAST link that points into pub/Database/Production.
for link in links:
    prod_link = link.get('href')

    match = re.search(r'pub/Database/Production', str(prod_link))
    if match:
        path = prod_link
        # the archive contains a CSV with the same base name as the zip
        file_name = find_between(prod_link, '/pub/Database/', 'zip') + 'csv'

# GET the zip archive itself.
# NOTE(review): if no link matched above, `path`/`file_name` are unbound here.
conn.request('GET', path)
r = conn.getresponse()

# read zip file straight from memory (no temp file on disk)
with zipfile.ZipFile(io.BytesIO(r.read()), "r") as zf:
    zf.extractall()

# Raw CSV header -> canonical column name (order preserved from the
# original rename sequence).
PROD_RENAME = {
    'ReportPeriod': 'production_date',
    'Operator': 'operator_current',
    'API': 'api',
    'WellBore': 'wellbore',
    'FormationName': 'reservoir',
    'WellType': 'well_type',
    'DaysProd': 'production_days',
    'Oil': 'oil',
    'Gas': 'gas',
    'Water': 'water',
}
# Columns not carried forward.
PROD_DROP = ('IsAmended', 'Oper_No', 'Old_Oper_No', 'Entity', 'WellStatus')

with open(file_name) as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader has already consumed the header, so this
    # skips the first *data* row — confirm that is intentional.
    next(reader)
    data = list(reader)

    for row in data:
        for old, new in PROD_RENAME.items():
            row[new] = row.pop(old)
        # formation names arrive padded with whitespace
        row['reservoir'] = row['reservoir'].strip()
        for old in PROD_DROP:
            row.pop(old)