Monday, July 16, 2018

Python - sorted(list) vs. list.sort()

sorted() returns a new sorted list, leaving the original list unaffected. list.sort() sorts the list in place, mutating the original list, and returns None (as all in-place operations conventionally do).
sorted() works on any iterable, not just lists. Strings, tuples, dictionaries (you'll get the keys), generators, etc., returning a list containing all elements, sorted.
  • Use list.sort() when you want to mutate the list, sorted() when you want a new sorted object back. Use sorted() when you want to sort something that is an iterable, not a list yet.
  • For lists, list.sort() is faster than sorted() because it doesn't have to create a copy. For any other iterable, you have no choice.
  • You cannot retrieve the original positions. Once you've called list.sort(), the original order is gone.

Python - some useful small functions

# Return the substring of s lying between the first occurrence of `first`
# and the next occurrence of `last`; return "" if either marker is absent.
def find_between(s, first, last):
    begin = s.find(first)
    if begin == -1:
        return ""
    begin += len(first)
    close = s.find(last, begin)
    if close == -1:
        return ""
    return s[begin:close]

# Yield the index of every occurrence of character ch in str.
def find(str, ch):
    # NOTE(review): parameter name shadows the builtin `str`; kept as-is so
    # the call signature stays unchanged.
    yield from (pos for pos, letter in enumerate(str) if letter == ch)


# reformat epoch_time to YYYY-mm-dd
def format_date(data):
    """Extract an epoch timestamp from a string like ``/Date(1234567890)/``
    and format it as ``YYYY-mm-dd``.

    The number between the parentheses is treated as *seconds* when it falls
    in a plausible range (1930-01-01 .. 2038-01-01) and as *milliseconds*
    otherwise — same decision the original four-way branch made.
    Returns '' for None / 'null' / no parenthesised value.
    """
    # `is None` instead of `== None`; 'null' is the literal string some
    # feeds emit for missing dates.
    if data is None or data == 'null' or find_between(data, "(", ")") == '':
        return ''
    epoch_time = int(find_between(data, "(", ")"))
    # -1262304000 = 1930-01-01, 2145916800 = 2038-01-01 (both in seconds).
    if -1262304000 <= epoch_time <= 2145916800:
        # Plausible seconds value.  Equivalent to utcfromtimestamp() but
        # also valid for negative (pre-1970) timestamps on all platforms.
        stamp = datetime(1970, 1, 1) + timedelta(seconds=epoch_time)
    else:
        # Out of range for seconds: assume the feed gave milliseconds.
        stamp = datetime(1970, 1, 1) + timedelta(milliseconds=epoch_time)
    return f"{stamp:%Y-%m-%d}"


Friday, June 8, 2018

Python - Paying attention when you want to copy/clone a list

if you do b = a,

you didn’t copy the list referenced by a. You just created a new tag and attached it to the list pointed by a. Like in the picture below:
a and b reference the same list
If you modify a, you also modify b, since they point to the same list.

Now we want to copy the list referenced by a. We need to create a new list to attach b to it.
a and b reference different lists

b = a[:]
b = a.copy()

Python - Start scraping - 7. extract data from a DBF file without downloading the file

..........

# Open the downloaded zip archive straight from the in-memory response body
# (no temp file on disk).
# NOTE(review): this local name shadows the `zipfile` module; `body`,
# `ZipFile` and `shapefile` come from outside this excerpt.
zipfile = ZipFile(io.BytesIO(body))

# Pick out the four shapefile components.  sorted() puts same-named members
# in alphabetical extension order (dbf < prj < shp < shx), which is exactly
# the unpacking order on the next line — presumably why it is sorted; verify
# if the archive ever holds more than one shapefile.
filenames = [y for y in sorted(zipfile.namelist()) for ending in ['dbf', 'prj', 'shp', 'shx'] if y.endswith(ending)]
dbf, prj, shp, shx = [io.BytesIO(zipfile.read(filename)) for filename in filenames]

# Read the shapefile entirely from the in-memory buffers.
rshp = shapefile.Reader(shp=shp, shx=shx, dbf=dbf)
fields = rshp.fields[1:]  # drop the first pseudo-field (DeletionFlag)
field_names = [field[0] for field in fields]

data_1 = []
data_2 = []
for r in rshp.shapeRecords():
    # rebind r to a {field name: value} dict; loop body continues past this excerpt
    r = dict(zip(field_names, r.record))
............

Python - Start scraping - 6. pandas unpivot data

from ftplib import FTP
import zipfile
import csv
import pandas as pd

# Download the zipped production file from the FTP server.
filename = 'XXXX.zip'
ftp = FTP('ftp.XXXX.com')
ftp.login()
ftp.cwd("/XXXX")
# `with` guarantees the local file is closed even if the transfer fails.
with open(filename, "wb") as gFile:
    ftp.retrbinary('RETR XXXX.zip', gFile.write)
ftp.quit()

# Unzip file
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

# Raw CSV header -> canonical column name.  Order matters: it becomes the
# column order of the per-row DataFrame built below, so the twelve month
# columns stay in calendar order for pd.melt.
RENAME = {
    'API_14': 'api',
    'Operator_Number': 'operator_code',
    'Operator_Name': 'operator',
    'Well_Name': 'well_name',
    'Well_Number': 'well_number',
    'API_County': 'county_code',
    'Twp': 'township',
    'Rng': 'range',
    'Sec': 'section',
    'PM': 'meridian',
    'Lat_Y': 'surface_latitude',
    'Long_X': 'surface_longitude',
    'OTC_County': 'otc_county',
    'OTC_PUN': 'otc_pun',
    'Formation_Code': 'reservoir_code',
    'Formation_Name': 'reservoir',
    'Reporting_Year': 'production_year',
    'January_Manuf': '01-01',
    'February_Manuf': '02-01',
    'March_Manuf': '03-01',
    'April_Manuf': '04-01',
    'May_Manuf': '05-01',
    'June_Manuf': '06-01',
    'July_Manuf': '07-01',
    'August_Manuf': '08-01',
    'September_Manuf': '09-01',
    'October_Manuf': '10-01',
    'November_Manuf': '11-01',
    'December_Manuf': '12-01',
}
# Columns not carried forward.
DROP = ('API_Base', 'Reporter_Number', 'Reporter_Name', 'Q4', 'Q3', 'Q2', 'Q1')

ID_VARS = ['api', 'operator_code', 'operator', 'well_name',
           'well_number', 'county_code', 'township', 'range',
           'section', 'meridian', 'surface_latitude',
           'surface_longitude', 'otc_county', 'otc_pun', 'reservoir_code',
           'reservoir', 'production_year']

# read csv file
with open("XXXX.csv") as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader has already consumed the header, so this
    # skips the first *data* row — confirm that is intentional.
    next(reader)
    data = list(reader)

    for row in data:
        for old, new in RENAME.items():
            row[new] = row.pop(old)
        for old in DROP:
            row.pop(old)

    data_prod = []

    for row in data:
        # create a one-row dataframe so melt can unpivot the month columns
        df = pd.DataFrame(dict(row), index=[0])

        df_unpivot = pd.melt(df, id_vars=ID_VARS,
                             var_name='production_month', value_name='gas')
        # unpivot dataframe to list
        # NOTE(review): data_prod is overwritten on every pass, so only the
        # current well's records are live here — extend() may have been meant.
        data_prod = df_unpivot.to_dict(orient='records')

        # insert key and value into each list element (dict)
        for rec in data_prod:
            rec['gas_unit'] = 'Mcf'
            rec['production_date'] = rec['production_year'] + '-' + rec['production_month']
            rec['surface_datum'] = 'NAD 83'
            rec['comment'] = 'assumed surface datum'

            rec.pop('production_year')
            rec.pop('production_month')
        print(data_prod)

Python - Start scraping - 5. connect to an FTP website

from ftplib import FTP
import zipfile
import csv

# Download the zipped well-header file from the FTP server.
filename = 'XXX.ZIP'
ftp = FTP('ftp.XXXX.com')
ftp.login()
ftp.cwd("/XXX")
# `with` guarantees the local file is closed even if the transfer fails.
with open(filename, "wb") as gFile:
    ftp.retrbinary('RETR XXX.zip', gFile.write)
ftp.quit()

# Unzip file
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

# Raw CSV header -> canonical column name (order preserved from the
# original rename sequence).
WELL_RENAME = {
    'API_Number': 'api',
    'Well_Name': 'well_name',
    'Well_Number': 'well_number',
    'Operator': 'operator_code',
    'Operator_Name': 'operator',
    'Well_Status': 'well_status',
    'Well_Type': 'well_type',
    'County': 'county_code',
    'Loc_Type': 'location_datum',
    'Twp': 'township',
    'Range': 'range',
    'Sec': 'section',
    'PM': 'principle_meridian',
    'Lat_Y': 'latitude',
    'Long_X': 'longitude',
    'Measured_TotalDepth': 'measured_depth',
    'TrueVertical_Depth': 'vertical_depth',
    'Spud': 'spud_date',
}

# read csv file
with open("XXX.csv") as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader has already consumed the header, so this
    # skips the first *data* row — confirm that is intentional.
    next(reader)
    data = list(reader)
    for row in data:
        for old, new in WELL_RENAME.items():
            row[new] = row.pop(old)

Python - Start scraping - 4. connect to an HTTPS website

import http.client
from bs4 import BeautifulSoup
import re
import io
import zipfile
import csv

# Find substring within a string: return the text between the first
# occurrence of `first` and the following occurrence of `last` ('' if absent).
def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""

host = 'XXXX'
path_orig = '/XXXX/XXXX/XXXX'
conn = http.client.HTTPSConnection(host)

# get the latest file path from the directory listing page
conn.request('GET', path_orig)
r = conn.getresponse()

soup = BeautifulSoup(r, 'html.parser')
links = soup.findAll('a')

# Keep the LAST link that points into pub/Database/Production.
for link in links:
    prod_link = link.get('href')

    match = re.search(r'pub/Database/Production', str(prod_link))
    if match:
        path = prod_link
        # the archive contains a CSV with the same base name as the zip
        file_name = find_between(prod_link, '/pub/Database/', 'zip') + 'csv'

# GET the zip archive itself.
# NOTE(review): if no link matched above, `path`/`file_name` are unbound here.
conn.request('GET', path)
r = conn.getresponse()

# read zip file straight from memory (no temp file on disk)
with zipfile.ZipFile(io.BytesIO(r.read()), "r") as zf:
    zf.extractall()

# Raw CSV header -> canonical column name (order preserved from the
# original rename sequence).
PROD_RENAME = {
    'ReportPeriod': 'production_date',
    'Operator': 'operator_current',
    'API': 'api',
    'WellBore': 'wellbore',
    'FormationName': 'reservoir',
    'WellType': 'well_type',
    'DaysProd': 'production_days',
    'Oil': 'oil',
    'Gas': 'gas',
    'Water': 'water',
}
# Columns not carried forward.
PROD_DROP = ('IsAmended', 'Oper_No', 'Old_Oper_No', 'Entity', 'WellStatus')

with open(file_name) as f:
    reader = csv.DictReader(f)
    # NOTE(review): DictReader has already consumed the header, so this
    # skips the first *data* row — confirm that is intentional.
    next(reader)
    data = list(reader)

    for row in data:
        for old, new in PROD_RENAME.items():
            row[new] = row.pop(old)
        # formation names arrive padded with whitespace
        row['reservoir'] = row['reservoir'].strip()
        for old in PROD_DROP:
            row.pop(old)