flickr-mail/download_sent_mail.py

286 lines
9.2 KiB
Python
Executable file

#!/usr/bin/env python3
"""Download sent FlickrMail messages for backup."""
import json
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from flickr_mail.database import init_db, get_session
from flickr_mail.models import SentMessage
from flickr_mail.url_utils import (
creator_profile_from_flickr_url,
extract_urls_from_message,
normalize_flickr_url,
)
# Flickr endpoints. The page/message templates deliberately keep "{page}" and
# "{message_id}" placeholders (escaped as {{...}} inside the f-strings) so they
# can be filled in later with str.format().
BASE_URL = "https://www.flickr.com"
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything

# Local config file (next to this script) holding the session cookie string;
# the example file name is used in error messages as the template to copy.
CONFIG_FILE = Path(__file__).with_name("download_sent_mail.local.json")
EXAMPLE_CONFIG_FILE = Path(__file__).with_name("download_sent_mail.example.json")

# Browser-like request headers so Flickr serves the regular HTML mail pages.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
}
def load_cookie_string() -> str:
    """Read and return the 'cookies_str' value from the local JSON config.

    Raises:
        RuntimeError: if the config file is missing, is not valid JSON, or
            has an empty/absent 'cookies_str' entry.
    """
    if not CONFIG_FILE.exists():
        raise RuntimeError(
            f"Missing config file: {CONFIG_FILE}. "
            f"Copy {EXAMPLE_CONFIG_FILE.name} to {CONFIG_FILE.name} and set cookies_str."
        )
    raw_text = CONFIG_FILE.read_text()
    try:
        config = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"Invalid JSON in {CONFIG_FILE}: {exc}") from exc
    cookies = config.get("cookies_str", "").strip()
    if cookies:
        return cookies
    raise RuntimeError(f"{CONFIG_FILE} must contain a non-empty 'cookies_str' value")
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a browser-style cookie header string into a name→value dict.

    Accepts both the canonical "; " separator and a bare ";" (common when
    cookies are copied from devtools without spaces); surrounding whitespace
    on each pair is stripped. Items without "=" are ignored, and a value may
    itself contain "=" (only the first one splits name from value).
    """
    cookies: dict[str, str] = {}
    for item in cookie_str.split(";"):
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key] = value
    return cookies
def create_session() -> requests.Session:
    """Build a requests session authenticated via the configured cookies.

    Applies the browser-like HEADERS and the cookie jar parsed from the
    local config file.
    """
    sess = requests.Session()
    sess.headers.update(HEADERS)
    cookie_jar = parse_cookies(load_cookie_string())
    sess.cookies.update(cookie_jar)
    return sess
def fetch_page(session: requests.Session, url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Fetch a page and return parsed HTML.

    Args:
        session: authenticated requests session.
        url: absolute URL to fetch.
        timeout: seconds to wait for the server; without an explicit timeout,
            requests waits forever and a stalled connection would hang the
            whole backfill run.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
        requests.Timeout: if the server does not respond within *timeout*.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from a sent mail list page.

    Each row of the form <tr class="message_row sent" id="message_row_XXXX">
    yields a dict containing "message_id" plus, when the cells are present,
    "subject", "url", "recipient" and "date". Rows whose id attribute does
    not carry a message id are skipped entirely.
    """
    results: list[dict] = []
    for row in soup.select("tr.message_row"):
        # The numeric message id is embedded in the row's id attribute.
        row_id = row.get("id", "")
        if not row_id.startswith("message_row_"):
            continue
        entry: dict = {"message_id": row_id.replace("message_row_", "")}
        # Subject cell holds the link to the full message.
        subject_cell = row.select_one("td.subj")
        link = subject_cell.find("a") if subject_cell else None
        if link:
            entry["subject"] = link.get_text(strip=True)
            entry["url"] = BASE_URL + link["href"]
        # Recipient and date live in their own cells.
        recipient_cell = row.select_one("td.fromto")
        if recipient_cell:
            entry["recipient"] = recipient_cell.get_text(strip=True)
        date_cell = row.select_one("td.date")
        if date_cell:
            entry["date"] = date_cell.get_text(strip=True)
        results.append(entry)
    return results
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract full message content from a message page.

    Walks the header/value rows of the message table inside the .ThinCase
    container: a "To:" row fills "recipient", a "Subject:" row fills
    "subject", and the first row with an empty header cell is treated as the
    body (its delete <form> is removed before capturing text/html). Returns
    an empty dict when the expected markup is absent.
    """
    thin_case = soup.select_one(".ThinCase")
    table = thin_case.find("table") if thin_case else None
    if table is None:
        return {}
    content: dict = {}
    for row in table.find_all("tr", recursive=False):
        cells = row.find_all("td", recursive=False)
        if len(cells) < 2:
            continue
        label = cells[0].get_text(strip=True).lower()
        value_cell = cells[1]
        if label == "to:":
            # Recipient username is inside a .username element.
            username = value_cell.select_one(".username")
            if username:
                content["recipient"] = username.get_text(strip=True)
        elif label == "subject:":
            heading = value_cell.find("h3")
            if heading:
                content["subject"] = heading.get_text(strip=True)
        elif label == "":
            # Body row (empty header cell): drop the delete form so it does
            # not pollute the captured text, then stop scanning.
            delete_form = value_cell.find("form")
            if delete_form:
                delete_form.decompose()
            content["body"] = value_cell.get_text(separator="\n", strip=True)
            content["body_html"] = str(value_cell)
            break
    return content
def _collect_new_messages(http_session: requests.Session, existing_ids: set) -> list[dict]:
    """Paginate the sent-mail list (newest first) collecting unseen messages.

    Returns metadata dicts for messages whose ids are not in *existing_ids*.
    Stops at the first already-known id (everything older is stored), at an
    empty page, or at a page contributing no new messages; page-level fetch
    errors are logged and that page is skipped.
    """
    new_messages: list[dict] = []
    print("Fetching message list until we reach existing messages...")
    for page in range(1, MAX_SENT_MAIL_PAGES + 1):
        url = SENT_MAIL_URL.format(page=page)
        print(f" Fetching page {page}...")
        try:
            soup = fetch_page(http_session, url)
            page_messages = extract_messages_from_list_page(soup)
            if not page_messages:
                print(" No messages found on this page, stopping")
                break
            page_new_messages = 0
            reached_existing = False
            for msg in page_messages:
                msg_id = msg.get("message_id")
                if not msg_id:
                    continue
                if msg_id in existing_ids:
                    reached_existing = True
                    break
                new_messages.append(msg)
                page_new_messages += 1
            if reached_existing:
                print(" Reached messages already in the database, stopping pagination")
                break
            if page_new_messages == 0:
                print(" No new messages on this page, stopping pagination")
                break
            time.sleep(1)  # Be polite to the server
        except Exception as e:
            print(f" Error fetching page {page}: {e}")
            continue
    return new_messages


def _download_messages(db_session, http_session: requests.Session, new_messages: list[dict]) -> None:
    """Fetch each message page, parse it, and commit a SentMessage row.

    A failure on any single message rolls back the pending row, logs the
    error, and moves on to the next message (best-effort backfill).
    """
    for i, msg in enumerate(new_messages, 1):
        msg_id = msg["message_id"]
        # Prefer the URL scraped from the list page; fall back to the template.
        url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)
        print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")
        try:
            soup = fetch_page(http_session, url)
            # Page content wins over list-page metadata on key collisions.
            full_msg = {**msg, **extract_message_content(soup)}
            body = full_msg.get("body", "")
            flickr_url, wikipedia_url = extract_urls_from_message(body)
            normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
            creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
            db_session.add(SentMessage(
                message_id=msg_id,
                subject=full_msg.get("subject", ""),
                url=full_msg.get("url", ""),
                recipient=full_msg.get("recipient", ""),
                date=full_msg.get("date", ""),
                body=body,
                body_html=full_msg.get("body_html", ""),
                flickr_url=flickr_url,
                normalized_flickr_url=normalized,
                wikipedia_url=wikipedia_url,
                creator_profile_url=creator_profile,
            ))
            db_session.commit()
            time.sleep(1)  # Be polite
        except Exception as e:
            db_session.rollback()
            print(f" Error downloading message {msg_id}: {e}")
            continue


def main() -> None:
    """Main entry point: sync not-yet-stored sent messages into the database."""
    init_db()
    db_session = get_session()
    try:
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")
        http_session = create_session()
        new_messages = _collect_new_messages(http_session, existing_ids)
        print(f"Found {len(new_messages)} new messages to download")
        _download_messages(db_session, http_session, new_messages)
        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")
    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()
# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()