286 lines
9.2 KiB
Python
Executable file
286 lines
9.2 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Download sent FlickrMail messages for backup."""
|
|
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from flickr_mail.database import init_db, get_session
|
|
from flickr_mail.models import SentMessage
|
|
from flickr_mail.url_utils import (
|
|
creator_profile_from_flickr_url,
|
|
extract_urls_from_message,
|
|
normalize_flickr_url,
|
|
)
|
|
|
|
BASE_URL = "https://www.flickr.com"
# The f-string doubles the braces so "{page}" / "{message_id}" survive as
# str.format() placeholders that are filled in later.
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything
# Local (gitignored) config holding the session cookies, and the example
# template users copy from.
CONFIG_FILE = Path(__file__).with_name("download_sent_mail.local.json")
EXAMPLE_CONFIG_FILE = Path(__file__).with_name("download_sent_mail.example.json")

# Browser-like request headers so Flickr serves the regular HTML pages a
# logged-in Firefox user would see.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
}
|
|
|
|
def load_cookie_string() -> str:
    """Load the Flickr cookies string from the local JSON config.

    Returns:
        The non-empty ``cookies_str`` value from ``CONFIG_FILE``.

    Raises:
        RuntimeError: If the config file is missing, contains invalid JSON,
            or lacks a non-empty ``cookies_str`` entry.
    """
    if not CONFIG_FILE.exists():
        raise RuntimeError(
            f"Missing config file: {CONFIG_FILE}. "
            f"Copy {EXAMPLE_CONFIG_FILE.name} to {CONFIG_FILE.name} and set cookies_str."
        )

    try:
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        data = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"Invalid JSON in {CONFIG_FILE}: {exc}") from exc

    cookie_str = data.get("cookies_str", "").strip()
    if not cookie_str:
        raise RuntimeError(f"{CONFIG_FILE} must contain a non-empty 'cookies_str' value")
    return cookie_str
|
|
|
|
|
|
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Turn a browser-style cookie header string into a name-to-value dict.

    Entries are separated by ``"; "``; only the first ``=`` in each entry
    splits name from value, and entries without an ``=`` are ignored.
    """
    triples = (entry.partition("=") for entry in cookie_str.split("; "))
    # ``sep`` is empty exactly when the entry had no "=" separator.
    return {name: value for name, sep, value in triples if sep}
|
|
|
|
|
|
def create_session() -> requests.Session:
    """Build an authenticated HTTP session.

    Applies the browser-like default headers and the cookies parsed from the
    local config file so later requests behave like a logged-in browser.
    """
    http = requests.Session()
    http.headers.update(HEADERS)
    cookie_jar = parse_cookies(load_cookie_string())
    http.cookies.update(cookie_jar)
    return http
|
|
|
|
|
|
def fetch_page(session: requests.Session, url: str) -> BeautifulSoup:
    """GET *url* through *session* and return the parsed HTML document.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    resp = session.get(url)
    resp.raise_for_status()
    html = resp.text
    return BeautifulSoup(html, "html.parser")
|
|
|
|
|
|
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from a sent mail list page.

    Each row looks like ``<tr class="message_row sent" id="message_row_XXXX">``
    with ``td.subj`` (subject link), ``td.fromto`` (recipient) and ``td.date``
    cells.

    Returns:
        One dict per row with a parseable id, always containing
        ``message_id`` and, when present in the row, ``subject``, ``url``,
        ``recipient`` and ``date``.
    """
    messages = []

    for row in soup.select("tr.message_row"):
        msg: dict = {}

        # The numeric message id is embedded in the row's id attribute.
        row_id = row.get("id", "")
        if row_id.startswith("message_row_"):
            # removeprefix (not replace) so only the leading marker is
            # stripped, never an interior occurrence.
            msg["message_id"] = row_id.removeprefix("message_row_")

        # Subject + message URL live in the link inside the subject cell.
        subj_cell = row.select_one("td.subj")
        if subj_cell:
            link = subj_cell.find("a")
            if link:
                msg["subject"] = link.get_text(strip=True)
                href = link.get("href")
                # Guard against anchors without an href instead of raising
                # KeyError on link["href"].
                if href:
                    msg["url"] = BASE_URL + href

        # Recipient is in td.fromto
        fromto_cell = row.select_one("td.fromto")
        if fromto_cell:
            msg["recipient"] = fromto_cell.get_text(strip=True)

        # Date is in td.date
        date_cell = row.select_one("td.date")
        if date_cell:
            msg["date"] = date_cell.get_text(strip=True)

        # Rows without a recognisable id are not real messages; drop them.
        if "message_id" in msg:
            messages.append(msg)

    return messages
|
|
|
|
|
|
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract full message content from a message page.

    Walks the rows of the message table inside the ``.ThinCase`` container
    and returns a dict with whichever of ``recipient``, ``subject``,
    ``body`` and ``body_html`` could be parsed. Returns an empty dict when
    the expected page structure is missing.
    """
    content = {}

    # Find the ThinCase div containing the message
    thin_case = soup.select_one(".ThinCase")
    if not thin_case:
        return content

    # Find the table with message content
    table = thin_case.find("table")
    if not table:
        return content

    # Only direct child rows: nested tables inside the body must not be
    # mistaken for header rows.
    rows = table.find_all("tr", recursive=False)

    # Row 0: To: <recipient>
    # Row 1: Subject: <subject>
    # Row 2: <empty> <body>
    for row in rows:
        cells = row.find_all("td", recursive=False)
        if len(cells) >= 2:
            header_cell = cells[0]
            value_cell = cells[1]

            # Rows are dispatched on the label text in the first cell.
            header = header_cell.get_text(strip=True).lower()

            if header == "to:":
                # Get recipient username
                username = value_cell.select_one(".username")
                if username:
                    content["recipient"] = username.get_text(strip=True)

            elif header == "subject:":
                # Get subject from h3
                h3 = value_cell.find("h3")
                if h3:
                    content["subject"] = h3.get_text(strip=True)

            elif header == "":
                # This is the message body row (empty header cell)
                # Get the content but exclude the delete form
                # NOTE: decompose() mutates the parsed tree in place, so the
                # form is removed before both text and HTML extraction below.
                form = value_cell.find("form")
                if form:
                    form.decompose()

                content["body"] = value_cell.get_text(separator="\n", strip=True)
                content["body_html"] = str(value_cell)
                break  # Body found, stop processing

    return content
|
|
|
|
|
|
def main() -> None:
    """Main entry point.

    Phase 1 pages through the sent-mail list (newest first) collecting
    metadata until it reaches a message already stored in the database.
    Phase 2 downloads each new message's full page and persists it, one
    commit per message so earlier work survives a later failure.
    """
    init_db()
    db_session = get_session()

    try:
        # Ids already backed up; used to stop pagination early.
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")

        http_session = create_session()

        new_messages: list[dict] = []
        stop_fetching = False

        print("Fetching message list until we reach existing messages...")
        for page in range(1, MAX_SENT_MAIL_PAGES + 1):
            url = SENT_MAIL_URL.format(page=page)
            print(f" Fetching page {page}...")

            try:
                soup = fetch_page(http_session, url)
                page_messages = extract_messages_from_list_page(soup)

                if not page_messages:
                    print(" No messages found on this page, stopping")
                    break

                page_new_messages = 0
                for msg in page_messages:
                    msg_id = msg.get("message_id")
                    if not msg_id:
                        continue
                    if msg_id in existing_ids:
                        # Everything after this point is older and already
                        # stored, so pagination can stop.
                        stop_fetching = True
                        break

                    new_messages.append(msg)
                    page_new_messages += 1

                if stop_fetching:
                    print(" Reached messages already in the database, stopping pagination")
                    break

                if page_new_messages == 0:
                    print(" No new messages on this page, stopping pagination")
                    break

                time.sleep(1)  # Be polite to the server

            except Exception as e:
                # Best effort: one bad page should not abort the whole run.
                print(f" Error fetching page {page}: {e}")
                continue

        print(f"Found {len(new_messages)} new messages to download")

        # Download individual messages
        for i, msg in enumerate(new_messages, 1):
            msg_id = msg["message_id"]
            # Prefer the URL scraped from the list page; fall back to the
            # canonical per-message URL template.
            url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)

            print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")

            try:
                soup = fetch_page(http_session, url)
                content = extract_message_content(soup)

                # Merge with metadata; fields parsed from the full message
                # page override the list-page values.
                full_msg = {**msg, **content}

                body = full_msg.get("body", "")
                flickr_url, wikipedia_url = extract_urls_from_message(body)
                normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
                creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""

                db_session.add(SentMessage(
                    message_id=msg_id,
                    subject=full_msg.get("subject", ""),
                    url=full_msg.get("url", ""),
                    recipient=full_msg.get("recipient", ""),
                    date=full_msg.get("date", ""),
                    body=body,
                    body_html=full_msg.get("body_html", ""),
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    wikipedia_url=wikipedia_url,
                    creator_profile_url=creator_profile,
                ))
                # Commit per message so a later failure keeps earlier work.
                db_session.commit()

                time.sleep(1)  # Be polite

            except Exception as e:
                db_session.rollback()
                print(f" Error downloading message {msg_id}: {e}")
                continue

        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")

    except Exception:
        # Roll back any half-added row before propagating.
        db_session.rollback()
        raise
    finally:
        db_session.close()
|
|
|
|
|
|
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
|