266 lines
11 KiB
Python
Executable file
266 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Download sent FlickrMail messages for backup."""
|
|
|
|
import time
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from flickr_mail.database import init_db, get_session
|
|
from flickr_mail.models import SentMessage
|
|
from flickr_mail.url_utils import (
|
|
creator_profile_from_flickr_url,
|
|
extract_urls_from_message,
|
|
normalize_flickr_url,
|
|
)
|
|
|
|
# Flickr endpoints.  The doubled braces survive the f-string so that
# {page} / {message_id} remain placeholders filled in later via str.format.
BASE_URL = "https://www.flickr.com"
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything

# Browser-imitating request headers so Flickr serves the regular HTML pages
# rather than treating us as an unknown client.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
}

# NOTE(review): hard-coded logged-in session cookies.  These are credentials:
# they will expire and should not live in source control — consider loading
# them from an environment variable or a git-ignored config file instead.
# Also note the triple-quoted literal spans two source lines, so the string
# contains an embedded newline.
COOKIES_STR = """ccc=%7B%22needsConsent%22%3Atrue%2C%22managed%22%3A0%2C%22changed%22%3A0%2C%22info%22%3A%7B%22cookieBlock%22%3A%7B%22level%22%3A2%2C%22blockRan%22%3A1%7D%7D%7D; _sp_ses.df80=*; _sp_id.df80=968931de-089d-4576-b729-6662c2c13a65.1770187027.1.1770187129..adf2374b-b85c-4899-afb7-63c2203d0c44..9422de57-9cdf-49c9-ac54-183eaa1ec457.1770187027101.24; TAsessionID=7f373c97-e9f8-46cb-bc1a-cb4f164ce46b|NEW; notice_behavior=expressed,eu; usprivacy=1---; acstring=3~550.1942.3126.3005.3077.1329.196.1725.1092; euconsent-v2=CQfGXgAQfGXgAAvACDENCQFsAP_gAEPgAAAALktB9G5cSSFBYCJVYbtEYAQDwFhg4oAhAgABEwAATBoAoIwGBGAoIAiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgAAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAAABIAAAAAAAAAAAAAAAAAAABAAAAICBAAAAQAAAAAAAAAAAAAAAAAAAAgqY0H0blxJIUFgIFVhu0QgBBPAWADigCEAAAEDAABMGgCgjAIUYCAgSIAgIAAAAAAgBEAgAQAIAABAAAAAgAEAAAwAQAAgAAAAAAAAAAECAAAAQAgACCBYAAAAQEQAVACBAABAEgAIUAAAAAEBQAEAAAAAAIABAAAAICBAAAAAAAAAAACQCEAAAAAAAAAAEAwBAAEgAAAAAAAAAAAAAAAAAAAEABAAgIEAAABAA.YAAAAAAAAAAA.ILktB9G5cSSFBYCJVYbtEYAQTwFhg4oAhAgABEwAATBoAoIwGFGAoIEiAICACAAAAIARAIAEECAAAQAAAIIABAAAMAEAAIAACIAAACAABAgAACEAIAAggWAAAAEBEAFQAgQAAQBIACFAAAgABAUABAAAAAACAAQAAACAgQAAAAAAAAAAAkAhAAAAAAAAAABAMAQABIAAAAAAAAAAAAAAAAAAABAAQAICBAAAAQAAAAAAAAAAAAAAAAAAAAgA; notice_preferences=2:; notice_gdpr_prefs=0,1,2:; cmapi_gtm_bl=; cmapi_cookie_privacy=permit 1,2,3; AMCV_48E815355BFE96970A495CD0%40AdobeOrg=281789898%7CMCMID%7C44859851125632937290373504988866174366%7CMCOPTOUT-1770194232s%7CNONE%7CvVersion%7C4.1.0; AMCVS_48E815355BFE96970A495CD0%40AdobeOrg=1; xb=646693; localization=en-us%3Buk%3Bgb; flrbp=1770187037-cfbf3914859af9ef68992c8389162e65e81c86c4; flrbgrp=1770187037-8e700fa7d73b4f2d43550f40513e7c6f507fd20f; flrbgdrp=1770187037-9af21cc74000b5f3f0943243608b4284d5f60ffd; flrbgmrp=1770187037-53f7bfff110731954be6bdfb2f587d59a8305670; flrbrst=1770187037-440e42fcee9b4e8e81ba8bc3eb3d0fc8b62e7083; 
flrtags=1770187037-7b50035cb956b9216a2f3372f498f7008d8e26a8; flrbrp=1770187037-c0195dc99caa020d4e32b39556131add862f26a0; flrb=34; session_id=2693fb01-87a0-42b1-a426-74642807b534; cookie_session=834645%3A29f2a9722d8bac88553ea1baf7ea11b4; cookie_accid=834645; cookie_epass=29f2a9722d8bac88553ea1baf7ea11b4; sa=1775371036%3A79962317%40N00%3A8fb60f4760b4840f37af3ebc90a8cb57; vp=2075%2C1177%2C1%2C0; flrbfd=1770187037-88a4e436729c9c5551794483fbd9c80e9dac2354; flrbpap=1770187037-18adaacf3a389df4a7bdc05cd471e492c54ef841; liqpw=2075; liqph=672"""
|
|
|
|
|
|
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a browser-style cookie header string into a dictionary.

    Splits on "; " and then on the first "=" of each pair.  Each item is
    stripped of surrounding whitespace first, so cookie strings that span
    multiple lines (such as the module-level COOKIES_STR, whose triple-quoted
    literal contains an embedded newline) do not yield keys with leading
    whitespace like "\\nflrtags".

    Args:
        cookie_str: Raw cookie string, e.g. "a=1; b=2".

    Returns:
        Mapping of cookie name to value.  Items without an "=" are skipped.
    """
    cookies: dict[str, str] = {}
    for item in cookie_str.split("; "):
        # Tolerate embedded newlines / stray whitespace around each pair.
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key] = value
    return cookies
|
|
|
|
|
|
def create_session() -> requests.Session:
    """Create an authenticated HTTP session.

    The session carries the browser-imitating HEADERS and the cookies parsed
    from COOKIES_STR, so subsequent requests act as the logged-in user.
    """
    http = requests.Session()
    http.headers.update(HEADERS)
    http.cookies.update(parse_cookies(COOKIES_STR))
    return http
|
|
|
|
|
|
def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """GET *url* through *session* and return the parsed HTML document.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    resp = session.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
|
|
|
|
|
|
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from one sent-mail list page.

    Each sent message is rendered as
    ``<tr class="message_row sent" id="message_row_XXXX">``.  Rows whose id
    does not yield a message id are dropped.

    Returns:
        One dict per message containing (when present in the markup) the
        keys "message_id", "subject", "url", "recipient" and "date".
    """
    row_id_prefix = "message_row_"
    messages: list[dict] = []

    for row in soup.select("tr.message_row"):
        info: dict = {}

        # The numeric message id lives in the row's id attribute.
        row_id = row.get("id", "")
        if row_id.startswith(row_id_prefix):
            info["message_id"] = row_id.replace(row_id_prefix, "")

        # Subject and per-message URL come from the link in td.subj.
        subject_cell = row.select_one("td.subj")
        link = subject_cell.find("a") if subject_cell else None
        if link:
            info["subject"] = link.get_text(strip=True)
            info["url"] = BASE_URL + link["href"]

        # Recipient name is the text of td.fromto.
        recipient_cell = row.select_one("td.fromto")
        if recipient_cell:
            info["recipient"] = recipient_cell.get_text(strip=True)

        # Human-readable send date is the text of td.date.
        date_cell = row.select_one("td.date")
        if date_cell:
            info["date"] = date_cell.get_text(strip=True)

        # Only keep rows we could actually identify.
        if "message_id" in info:
            messages.append(info)

    return messages
|
|
|
|
|
|
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract recipient, subject and body from a single message page.

    The message is laid out as a table inside the ``.ThinCase`` div, with
    header/value cell pairs per row: "To:", "Subject:", and finally a row
    with an empty header cell that holds the message body.

    Returns:
        Dict with any of "recipient", "subject", "body" and "body_html"
        that could be found; an empty dict when the expected markup is
        missing entirely.
    """
    result: dict = {}

    # Bail out early if the page doesn't have the expected structure.
    container = soup.select_one(".ThinCase")
    if not container:
        return result
    table = container.find("table")
    if not table:
        return result

    for row in table.find_all("tr", recursive=False):
        cells = row.find_all("td", recursive=False)
        if len(cells) < 2:
            continue

        label = cells[0].get_text(strip=True).lower()
        value_cell = cells[1]

        if label == "to:":
            # Recipient username is tagged with the .username class.
            username = value_cell.select_one(".username")
            if username:
                result["recipient"] = username.get_text(strip=True)
        elif label == "subject:":
            # Subject is rendered inside an <h3>.
            heading = value_cell.find("h3")
            if heading:
                result["subject"] = heading.get_text(strip=True)
        elif label == "":
            # Body row (empty header cell).  Drop the delete form so it
            # doesn't pollute the captured text/HTML.
            delete_form = value_cell.find("form")
            if delete_form:
                delete_form.decompose()

            result["body"] = value_cell.get_text(separator="\n", strip=True)
            result["body_html"] = str(value_cell)
            break  # Body found, nothing further to collect.

    return result
|
|
|
|
|
|
def main() -> None:
    """Main entry point.

    Incrementally backs up sent FlickrMail messages into the local database:
    walks the sent-mail listing page by page until it hits a message id that
    is already stored (assumes the listing is ordered newest-first — TODO
    confirm against the live site), then downloads and persists each new
    message individually, committing one row at a time so a mid-run failure
    loses at most the current message.
    """
    init_db()
    db_session = get_session()

    try:
        # IDs already stored — used to detect where the previous run stopped.
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")

        http_session = create_session()

        new_messages: list[dict] = []
        stop_fetching = False

        # Phase 1: collect metadata for messages not yet in the database.
        print("Fetching message list until we reach existing messages...")
        for page in range(1, MAX_SENT_MAIL_PAGES + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f" Fetching page {page}...")

            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)

                # An empty page means we've run past the last listing page.
                if not page_messages:
                    print(" No messages found on this page, stopping")
                    break

                page_new_messages = 0
                for msg in page_messages:
                    msg_id = msg.get("message_id")
                    if not msg_id:
                        continue
                    # First already-known id ends the whole pagination walk.
                    if msg_id in existing_ids:
                        stop_fetching = True
                        break

                    new_messages.append(msg)
                    page_new_messages += 1

                if stop_fetching:
                    print(" Reached messages already in the database, stopping pagination")
                    break

                if page_new_messages == 0:
                    print(" No new messages on this page, stopping pagination")
                    break

                time.sleep(1) # Be polite to the server

            except Exception as e:
                # Best-effort: a single bad page shouldn't abort the backup.
                print(f" Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")

        # Phase 2: download individual messages and persist them one by one.
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            # Fall back to the canonical URL if the listing link was missing.
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)

            print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")

            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)

                # Merge with metadata (page content wins over listing data).
                full_msg = {**msg, **content}

                # Derive the Flickr / Wikipedia links referenced in the body.
                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""

                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                # Commit per message so a later failure can't roll back
                # already-downloaded rows.
                db_session.commit()

                time.sleep(1) # Be polite

            except Exception as e:
                # Undo the partial add for this message only, then move on.
                db_session.rollback()
                print(f" Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")

    except Exception:
        # Leave the session clean before propagating the failure.
        db_session.rollback()
        raise
    finally:
        db_session.close()
|
|
|
|
|
|
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|