This website (https://portalweb.cammesa.com/memnet1/Pages/descargas.aspx) uses a redirect link, and the information needed for the POST request is hidden in the response. You have to use that information to perform the POST; only then can you get to the next step.
For example:
from bs4 import BeautifulSoup
import requests
import zipfile
def find_between(s: str, first: str, last: str) -> str:
    """Return the part of *s* between the markers *first* and *last*.

    The first occurrence of *first* is located, and *last* is then searched
    for only after the end of that occurrence, so an earlier occurrence of
    *last* in *s* is ignored.

    Args:
        s: The string to search.
        first: Marker that precedes the wanted text.
        last: Marker that follows the wanted text.

    Returns:
        The text strictly between the two markers, or an empty string when
        either marker cannot be found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        # str.index raises ValueError when a marker is missing.
        return ""
# Request headers sent with every session request. The 'x' values are
# placeholders and must be replaced with real header values.
headers = {
    'Content-Type': 'xxxxxxxxxxxx',
    'User-Agent': 'xxxxxxxxxxx'}

# A single requests.Session is reused so that cookies received along the way
# (e.g. from the login POST) are sent with the later requests.
session = requests.Session()

# NOTE(review): the original literal was empty; the downloads page named in
# the introduction is assumed to be the intended starting point - confirm.
start_link = 'https://portalweb.cammesa.com/memnet1/Pages/descargas.aspx'
text = session.get(start_link, headers=headers).text
soup = BeautifulSoup(text, 'html.parser')

# Follow every anchor whose markup mentions "informe mensual".
links = soup.find_all('a')
for link in links:
    if 'informe mensual' in str(link).lower():
        c_link = link.get('href')
        # NOTE(review): the follow-up request is kept inside the match so
        # that c_link is always bound when used - confirm intended nesting.
        monthly_report_link_redirect = 'xxxxxxxx' + str(c_link)
        resp_informe_mensual = session.get(monthly_report_link_redirect, headers=headers)
        soup2_informe_mensual = BeautifulSoup(resp_informe_mensual.text, 'html.parser')

# The report link redirects to a login page: post the login form fields.
# All values are placeholders to be replaced with real ones.
login_link = 'xxxxxxxxxxxxx'
data = {
    'Username': 'xxxxxx',
    'Password': 'xxxxxx',
    'RedirectTo': 'xxxxxxx',
    'Remote_Addr': 'xxxxxxx'
}
resp_open_frame_set = session.post(login_link, headers=headers, data=data)
soup_open_frame_set = BeautifulSoup(resp_open_frame_set.text, 'html.parser')

# Final data page (placeholder URL).
data_link = 'xxxxxxxxxxxxx'
resp_open_page = session.get(data_link, headers=headers)
soup_open_page = BeautifulSoup(resp_open_page.text, 'html.parser')

# The third anchor on the page is taken as the zip link; this is
# position-dependent and should be checked against the actual page.
zip_link = soup_open_page.find_all('a')[2].get('href')
# find_between stops just before 'zip', so the extension is appended again.
full_link = 'xxxxxxx' + find_between(str(zip_link), '=/', 'zip') + 'zip'
file_name = find_between(str(zip_link), '$File/', 'zip') + 'zip'
print(full_link)
print(file_name)

# Download XXXX.zip file
r = requests.get(full_link, auth=('xxxxxx', 'xxxxx'))
# Fail early on an HTTP error instead of saving an error page as a .zip.
r.raise_for_status()
with open(file_name, "wb") as code:
    code.write(r.content)

# Unzip files into the current working directory.
with zipfile.ZipFile(file_name, "r") as zip_ref:
    zip_ref.extractall()