Tuesday, March 31, 2020

Python - usage of "__init__.py"



Create an __init__.py file like the one below to expose all module files in the folder.

from os.path import dirname, basename, isfile, join
import glob

# Discover every sibling .py file so `from package import *` picks them up.
modules = glob.glob(join(dirname(__file__), "*.py"))
__all__ = []
for _path in modules:
    # Skip directories and the package initializer itself.
    if isfile(_path) and not _path.endswith('__init__.py'):
        __all__.append(basename(_path)[:-3])  # drop the ".py" suffix

Python - scraping website with redirect links

This website (https://portalweb.cammesa.com/memnet1/Pages/descargas.aspx) uses redirect links, and the POST information is hidden in the response. You have to use that information to make the POST request; then you can get to the next step.

e.g.
from bs4 import BeautifulSoup
import requests
import zipfile


def find_between(s, first, last):
    """Return the substring of *s* strictly between *first* and *last*.

    The search for *last* starts after the end of *first*. If either
    marker is missing, return the empty string.
    """
    try:
        lo = s.index(first) + len(first)
        hi = s.index(last, lo)
    except ValueError:
        # One of the markers was not found.
        return ""
    return s[lo:hi]

# Headers sent with every request (placeholder values must be filled in).
headers = {
    'Content-Type': 'xxxxxxxxxxxx',    
    'User-Agent': 'xxxxxxxxxxx'}

# One session so the login cookies survive across the redirect chain.
session = requests.Session()

# BUG FIX: the original fused two statements onto one line
# (`start_link = ''text = session.get(...)`), which is a syntax error.
start_link = ''
text = session.get(start_link, headers=headers).text
soup = BeautifulSoup(text, 'html.parser')

links = soup.find_all('a')

for link in links:
    # Only follow anchors whose markup mentions the monthly report.
    if 'informe mensual' in str(link).lower():
        c_link = link.get('href')
        monthly_report_link_redirect = 'xxxxxxxx' + str(c_link)

        resp_informe_mensual = session.get(monthly_report_link_redirect, headers=headers)
        soup2_informe_mensual = BeautifulSoup(resp_informe_mensual.text, 'html.parser')

        # redirect to login page
        login_link = 'xxxxxxxxxxxxx'        
        data = {
            'Username': 'xxxxxx',            
            'Password': 'xxxxxx',            
            'RedirectTo': 'xxxxxxx',            
            'Remote_Addr': 'xxxxxxx'        
        }
        resp_open_frame_set = session.post(login_link, headers=headers, data=data)
        soup_open_frame_set = BeautifulSoup(resp_open_frame_set.text, 'html.parser')

        # final data page
        # BUG FIX: this assignment had been swallowed into the comment above,
        # leaving `data_link` undefined on the next line.
        data_link = 'xxxxxxxxxxxxx'
        resp_open_page = session.get(data_link, headers=headers)
        soup_open_page = BeautifulSoup(resp_open_page.text, 'html.parser')

        # The third anchor on the page is assumed to point at the zip file
        # — TODO confirm this index against the live page layout.
        zip_link = soup_open_page.find_all('a')[2].get('href')
        full_link = 'xxxxxxx' + find_between(str(zip_link), '=/', 'zip') + 'zip'        
        file_name = find_between(str(zip_link), '$File/', 'zip') + 'zip'        
        print(full_link)
        print(file_name)

        # Download the zip file.
        # NOTE(review): this deliberately uses requests.get with basic auth
        # rather than `session` — confirm the download host does not need
        # the session cookies.
        r = requests.get(full_link,  auth=('xxxxxx', 'xxxxx'))
        with open(file_name, "wb") as code:
            code.write(r.content)

        # Unzip files        
        with zipfile.ZipFile(file_name, "r") as zip_ref:
            zip_ref.extractall()