Friday, April 20, 2018

Python — Start Scraping — 2. Website data scrape using BeautifulSoup (bs4) and a POST request

# -*- coding: utf-8 -*-
"""
Created on Mon Apr  9 11:51:03 2018

"""
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Headers; both required!
# Headers; both required by the target server!
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}
#-----------------------------------------------------------------------------------------------------------
# Temp parameters: These will be passed in later
#-----------------------------------------------------------------------------------------------------------
# Set up request criteria.
# BUG FIX: urlencode() expects a mapping (or a sequence of key/value pairs);
# the original passed a one-element *set* containing a docstring, which raises
# at runtime.  Fill in the POST form fields below.
data = urlencode({
    # 'field_name': 'value',   # put parameters here
})

# Start session and "set query context" by first calling builder page.
# This call will acquire the session cookie.
session = requests.Session()

try:
    # Post query that includes request criteria by calling submit page.
    text = session.post('http://www.xxxxxxxx', headers=headers, data=data).text

    # Parse the returned HTML.
    soup = BeautifulSoup(text, 'html.parser')

    # Result rows alternate between these two CSS classes.
    class_type = ['Item', 'Alternating']

    # Loop through each record.
    for c_type in class_type:
        data_list_items = soup.find_all(class_=c_type)

        for data_item in data_list_items:
            # BUG FIX: original referenced undefined name `permit_item`;
            # the loop variable is `data_item`.
            columns = data_item.find_all("td")

            # '\xa0' is a non-breaking space (&nbsp;) — normalize to plain space.
            data1 = columns[0].contents[0].replace('\xa0', ' ')
            data2 = columns[1].contents[0].replace('\xa0', ' ')
            data3 = columns[2].contents[0].replace('\xa0', ' ')

            # Reformat permit date to YYYY-mm-dd.
            data1 = datetime.strptime(data1, '%m/%d/%Y').strftime('%Y-%m-%d')

            # Build document.
            # BUG FIX: key 'data2' previously mapped to data1, silently
            # duplicating the date and dropping the second column.
            document = {
                'data1': data1,
                'data2': data2,
                'data3': data3,
            }

            print(document)

# Re-raise with context so the real cause isn't hidden
# (the original bare `except:` swallowed the underlying traceback).
except Exception as e:
    raise Exception('No data found.') from e

Python — Start Scraping — 1. Zip file download

"""
Created on Tue Mar 20 09:23:10 2018
"""

import  requests
import zipfile

# Connection parameters (placeholders; fill in before running).
url = 'https://www.xxxxxxx.zip'
username = 'xxxx'
password = 'xxxx'
file_name = 'xxxxxx.zip'


# Download the zip file using HTTP basic auth.
print("downloading with requests")
r = requests.get(url, auth=(username, password))
# BUG FIX: fail loudly on HTTP errors (401/404/5xx).  Without this check,
# an error page would be silently written into the .zip file and the
# script would only fail later, confusingly, inside ZipFile().
r.raise_for_status()
with open(file_name, "wb") as code:
    code.write(r.content)

# Unzip files into the current working directory.
# NOTE(review): extractall() on an untrusted archive is vulnerable to
# path traversal ("zip slip"); acceptable only if the source is trusted.
with zipfile.ZipFile(file_name, "r") as zip_ref:
    zip_ref.extractall()