Tuesday, March 31, 2020

Python - scraping website with redirect links

This website (https://portalweb.cammesa.com/memnet1/Pages/descargas.aspx) uses redirect links, and the information needed for the POST request is hidden in the response. You have to use that information to make the POST; only then can you get to the next step.

e.g.
from bs4 import BeautifulSoup
import requests
import zipfile


def find_between(s, first, last):
    """Return the text of *s* located between *first* and *last*.

    The search for *last* begins right after the first occurrence of
    *first*. An empty string is returned when either marker is missing.
    """
    head = s.find(first)
    if head < 0:
        return ""

    begin = head + len(first)
    stop = s.find(last, begin)
    if stop < 0:
        return ""

    return s[begin:stop]

# Request headers sent with every call below (values are placeholders).
headers = {
    'Content-Type': 'xxxxxxxxxxxx',
    'User-Agent': 'xxxxxxxxxxx'}

# A single session is reused for all requests in this script.
session = requests.Session()

# Placeholder: set this to the URL of the download page before running.
start_link = ''
text = session.get(start_link, headers=headers).text
soup = BeautifulSoup(text, 'html.parser')

# Every anchor on the start page; filtered for the monthly report below.
links = soup.find_all('a')

# Follow every anchor whose markup mentions "informe mensual", then log in,
# open the data page, download the linked zip archive and extract it.
for link in links:
    if 'informe mensual' in str(link).lower():
        c_link = link.get('href')
        monthly_report_link_redirect = 'xxxxxxxx' + str(c_link)

        # The response is parsed so the hidden POST information it carries
        # can be inspected.
        resp_informe_mensual = session.get(monthly_report_link_redirect, headers=headers)
        soup2_informe_mensual = BeautifulSoup(resp_informe_mensual.text, 'html.parser')

        # redirect to login page
        login_link = 'xxxxxxxxxxxxx'
        data = {
            'Username': 'xxxxxx',
            'Password': 'xxxxxx',
            'RedirectTo': 'xxxxxxx',
            'Remote_Addr': 'xxxxxxx'
        }
        resp_open_frame_set = session.post(login_link, headers=headers, data=data)
        soup_open_frame_set = BeautifulSoup(resp_open_frame_set.text, 'html.parser')

        # final data page
        data_link = 'xxxxxxxxxxxxx'
        resp_open_page = session.get(data_link, headers=headers)
        soup_open_page = BeautifulSoup(resp_open_page.text, 'html.parser')

        # The third anchor on the data page is taken as the zip link; the
        # URL path and the file name are cut out of its href.
        zip_link = soup_open_page.find_all('a')[2].get('href')
        full_link = 'xxxxxxx' + find_between(str(zip_link), '=/', 'zip') + 'zip'
        file_name = find_between(str(zip_link), '$File/', 'zip') + 'zip'
        print(full_link)
        print(file_name)

        # Download XXXX.zip file
        r = requests.get(full_link,  auth=('xxxxxx', 'xxxxx'))
        with open(file_name, "wb") as code:
            code.write(r.content)

        # Unzip files
        with zipfile.ZipFile(file_name, "r") as zip_ref:
            zip_ref.extractall()

No comments:

Post a Comment