Friday, April 20, 2018

Python — Start Scraping — 2. Website data scrape using BeautifulSoup (bs4) and a POST request

# -*- coding: utf-8 -*-
"""
Created on Mon Apr  9 11:51:03 2018

"""
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Headers; both required!
# Headers; both required by the target server!
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}
#-----------------------------------------------------------------------------------------------------------
# Temp parameters: These will be passed in later
#-----------------------------------------------------------------------------------------------------------
# Set up request criteria.
# BUG FIX: urlencode() expects a mapping (or a sequence of key/value pairs);
# the original passed a one-element *set* containing a docstring, which raises
# at runtime.  Fill in the POST form fields below.
data = urlencode({
    # 'field_name': 'value',   # put parameters here
})

# Start session and "set query context" by first calling builder page.
# This call will acquire the session cookie.
session = requests.Session()

try:
    # Post query that includes request criteria by calling submit page.
    text = session.post('http://www.xxxxxxxx', headers=headers, data=data).text

    # Parse the returned HTML.
    soup = BeautifulSoup(text, 'html.parser')

    # Result rows alternate between these two CSS classes.
    class_type = ['Item', 'Alternating']

    # Loop through each record.
    for c_type in class_type:
        data_list_items = soup.find_all(class_=c_type)

        for data_item in data_list_items:
            # BUG FIX: original referenced undefined name `permit_item`;
            # the loop variable is `data_item`.
            columns = data_item.find_all("td")

            # '\xa0' is a non-breaking space (&nbsp;) — normalize to plain space.
            data1 = columns[0].contents[0].replace('\xa0', ' ')
            data2 = columns[1].contents[0].replace('\xa0', ' ')
            data3 = columns[2].contents[0].replace('\xa0', ' ')

            # Reformat permit date to YYYY-mm-dd.
            data1 = datetime.strptime(data1, '%m/%d/%Y').strftime('%Y-%m-%d')

            # Build document.
            # BUG FIX: key 'data2' previously mapped to data1, silently
            # duplicating the date and dropping the second column.
            document = {
                'data1': data1,
                'data2': data2,
                'data3': data3,
            }

            print(document)

# Re-raise with context so the real cause isn't hidden
# (the original bare `except:` swallowed the underlying traceback).
except Exception as e:
    raise Exception('No data found.') from e

Python — Start Scraping — 1. Zip file download

"""
Created on Tue Mar 20 09:23:10 2018
"""

import  requests
import zipfile

# Connection parameters (placeholders; fill in before running).
url = 'https://www.xxxxxxx.zip'
username = 'xxxx'
password = 'xxxx'
file_name = 'xxxxxx.zip'


# Download the zip file using HTTP basic auth.
print("downloading with requests")
r = requests.get(url, auth=(username, password))
# BUG FIX: fail loudly on HTTP errors (401/404/5xx).  Without this check,
# an error page would be silently written into the .zip file and the
# script would only fail later, confusingly, inside ZipFile().
r.raise_for_status()
with open(file_name, "wb") as code:
    code.write(r.content)

# Unzip files into the current working directory.
# NOTE(review): extractall() on an untrusted archive is vulnerable to
# path traversal ("zip slip"); acceptable only if the source is trusted.
with zipfile.ZipFile(file_name, "r") as zip_ref:
    zip_ref.extractall()