flickr-mail/download_sent_mail.py

286 lines
9.2 KiB
Python
Executable file

#!/usr/bin/env python3
"""Download sent FlickrMail messages for backup."""
import json
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from flickr_mail.database import init_db, get_session
from flickr_mail.models import SentMessage
from flickr_mail.url_utils import (
creator_profile_from_flickr_url,
extract_urls_from_message,
normalize_flickr_url,
)
# Flickr endpoints. The page/message templates deliberately keep "{page}" and
# "{message_id}" placeholders (escaped as {{...}} inside the f-strings) so they
# can be filled in later with str.format().
BASE_URL = "https://www.flickr.com"
SENT_MAIL_URL = f"{BASE_URL}/mail/sent/page{{page}}"
MESSAGE_URL = f"{BASE_URL}/mail/sent/{{message_id}}"
MAX_SENT_MAIL_PAGES = 29  # Fallback upper bound if we need to backfill everything

# Local config file (next to this script) holding the session cookie string;
# the example file name is used in error messages as the template to copy.
CONFIG_FILE = Path(__file__).with_name("download_sent_mail.local.json")
EXAMPLE_CONFIG_FILE = Path(__file__).with_name("download_sent_mail.example.json")

# Browser-like request headers so Flickr serves the regular HTML mail pages.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
}
def load_cookie_string() -> str:
    """Read and return the 'cookies_str' value from the local JSON config.

    Raises:
        RuntimeError: if the config file is missing, is not valid JSON, or
            has an empty/absent 'cookies_str' entry.
    """
    if not CONFIG_FILE.exists():
        raise RuntimeError(
            f"Missing config file: {CONFIG_FILE}. "
            f"Copy {EXAMPLE_CONFIG_FILE.name} to {CONFIG_FILE.name} and set cookies_str."
        )
    raw_text = CONFIG_FILE.read_text()
    try:
        config = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"Invalid JSON in {CONFIG_FILE}: {exc}") from exc
    cookies = config.get("cookies_str", "").strip()
    if cookies:
        return cookies
    raise RuntimeError(f"{CONFIG_FILE} must contain a non-empty 'cookies_str' value")
def parse_cookies(cookie_str: str) -> dict[str, str]:
    """Parse a browser-style cookie header string into a name→value dict.

    Accepts both the canonical "; " separator and a bare ";" (common when
    cookies are copied from devtools without spaces); surrounding whitespace
    on each pair is stripped. Items without "=" are ignored, and a value may
    itself contain "=" (only the first one splits name from value).
    """
    cookies: dict[str, str] = {}
    for item in cookie_str.split(";"):
        item = item.strip()
        if "=" in item:
            key, value = item.split("=", 1)
            cookies[key] = value
    return cookies
def create_session() -> requests.Session:
    """Build a requests session authenticated via the configured cookies.

    Applies the browser-like HEADERS and the cookie jar parsed from the
    local config file.
    """
    sess = requests.Session()
    sess.headers.update(HEADERS)
    cookie_jar = parse_cookies(load_cookie_string())
    sess.cookies.update(cookie_jar)
    return sess
def fetch_page(session: requests.Session, url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Fetch a page and return parsed HTML.

    Args:
        session: authenticated requests session.
        url: absolute URL to fetch.
        timeout: seconds to wait for the server; without an explicit timeout,
            requests waits forever and a stalled connection would hang the
            whole backfill run.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
        requests.Timeout: if the server does not respond within *timeout*.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
def extract_messages_from_list_page(soup: BeautifulSoup) -> list[dict]:
    """Extract message metadata from a sent mail list page.

    Each row of the form <tr class="message_row sent" id="message_row_XXXX">
    yields a dict containing "message_id" plus, when the cells are present,
    "subject", "url", "recipient" and "date". Rows whose id attribute does
    not carry a message id are skipped entirely.
    """
    results: list[dict] = []
    for row in soup.select("tr.message_row"):
        # The numeric message id is embedded in the row's id attribute.
        row_id = row.get("id", "")
        if not row_id.startswith("message_row_"):
            continue
        entry: dict = {"message_id": row_id.replace("message_row_", "")}
        # Subject cell holds the link to the full message.
        subject_cell = row.select_one("td.subj")
        link = subject_cell.find("a") if subject_cell else None
        if link:
            entry["subject"] = link.get_text(strip=True)
            entry["url"] = BASE_URL + link["href"]
        # Recipient and date live in their own cells.
        recipient_cell = row.select_one("td.fromto")
        if recipient_cell:
            entry["recipient"] = recipient_cell.get_text(strip=True)
        date_cell = row.select_one("td.date")
        if date_cell:
            entry["date"] = date_cell.get_text(strip=True)
        results.append(entry)
    return results
def extract_message_content(soup: BeautifulSoup) -> dict:
    """Extract full message content from a message page.

    Walks the header/value rows of the message table inside the .ThinCase
    container: a "To:" row fills "recipient", a "Subject:" row fills
    "subject", and the first row with an empty header cell is treated as the
    body (its delete <form> is removed before capturing text/html). Returns
    an empty dict when the expected markup is absent.
    """
    thin_case = soup.select_one(".ThinCase")
    table = thin_case.find("table") if thin_case else None
    if table is None:
        return {}
    content: dict = {}
    for row in table.find_all("tr", recursive=False):
        cells = row.find_all("td", recursive=False)
        if len(cells) < 2:
            continue
        label = cells[0].get_text(strip=True).lower()
        value_cell = cells[1]
        if label == "to:":
            # Recipient username is inside a .username element.
            username = value_cell.select_one(".username")
            if username:
                content["recipient"] = username.get_text(strip=True)
        elif label == "subject:":
            heading = value_cell.find("h3")
            if heading:
                content["subject"] = heading.get_text(strip=True)
        elif label == "":
            # Body row (empty header cell): drop the delete form so it does
            # not pollute the captured text, then stop scanning.
            delete_form = value_cell.find("form")
            if delete_form:
                delete_form.decompose()
            content["body"] = value_cell.get_text(separator="\n", strip=True)
            content["body_html"] = str(value_cell)
            break
    return content
def _collect_new_messages(http_session: requests.Session, existing_ids: set) -> list[dict]:
    """Paginate the sent-mail list (newest first) collecting unseen messages.

    Returns metadata dicts for messages whose ids are not in *existing_ids*.
    Stops at the first already-known id (everything older is stored), at an
    empty page, or at a page contributing no new messages; page-level fetch
    errors are logged and that page is skipped.
    """
    new_messages: list[dict] = []
    print("Fetching message list until we reach existing messages...")
    for page in range(1, MAX_SENT_MAIL_PAGES + 1):
        url = SENT_MAIL_URL.format(page=page)
        print(f" Fetching page {page}...")
        try:
            soup = fetch_page(http_session, url)
            page_messages = extract_messages_from_list_page(soup)
            if not page_messages:
                print(" No messages found on this page, stopping")
                break
            page_new_messages = 0
            reached_existing = False
            for msg in page_messages:
                msg_id = msg.get("message_id")
                if not msg_id:
                    continue
                if msg_id in existing_ids:
                    reached_existing = True
                    break
                new_messages.append(msg)
                page_new_messages += 1
            if reached_existing:
                print(" Reached messages already in the database, stopping pagination")
                break
            if page_new_messages == 0:
                print(" No new messages on this page, stopping pagination")
                break
            time.sleep(1)  # Be polite to the server
        except Exception as e:
            print(f" Error fetching page {page}: {e}")
            continue
    return new_messages


def _download_messages(db_session, http_session: requests.Session, new_messages: list[dict]) -> None:
    """Fetch each message page, parse it, and commit a SentMessage row.

    A failure on any single message rolls back the pending row, logs the
    error, and moves on to the next message (best-effort backfill).
    """
    for i, msg in enumerate(new_messages, 1):
        msg_id = msg["message_id"]
        # Prefer the URL scraped from the list page; fall back to the template.
        url = msg.get("url") or MESSAGE_URL.format(message_id=msg_id)
        print(f" [{i}/{len(new_messages)}] Downloading message {msg_id}...")
        try:
            soup = fetch_page(http_session, url)
            # Page content wins over list-page metadata on key collisions.
            full_msg = {**msg, **extract_message_content(soup)}
            body = full_msg.get("body", "")
            flickr_url, wikipedia_url = extract_urls_from_message(body)
            normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
            creator_profile = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
            db_session.add(SentMessage(
                message_id=msg_id,
                subject=full_msg.get("subject", ""),
                url=full_msg.get("url", ""),
                recipient=full_msg.get("recipient", ""),
                date=full_msg.get("date", ""),
                body=body,
                body_html=full_msg.get("body_html", ""),
                flickr_url=flickr_url,
                normalized_flickr_url=normalized,
                wikipedia_url=wikipedia_url,
                creator_profile_url=creator_profile,
            ))
            db_session.commit()
            time.sleep(1)  # Be polite
        except Exception as e:
            db_session.rollback()
            print(f" Error downloading message {msg_id}: {e}")
            continue


def main() -> None:
    """Main entry point: sync not-yet-stored sent messages into the database."""
    init_db()
    db_session = get_session()
    try:
        existing_ids = {
            r[0] for r in db_session.query(SentMessage.message_id).all()
        }
        print(f"Database has {len(existing_ids)} messages")
        http_session = create_session()
        new_messages = _collect_new_messages(http_session, existing_ids)
        print(f"Found {len(new_messages)} new messages to download")
        _download_messages(db_session, http_session, new_messages)
        total = db_session.query(SentMessage).count()
        print(f"Done! {total} messages in database")
    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()
# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()