flickr-mail/update_flickr_uploads.py
Edward Betts a2d29d7937 Show recent Commons uploads obtained via Flickr mail
Display recent Wikimedia Commons uploads on the home page, filtered to
only show images that were obtained by contacting creators via Flickr
mail. Each upload shows:
- Thumbnail linking to Commons
- Creator name linking to their Flickr profile
- Link to the illustrated Wikipedia article (or Wikidata item)

Features:
- Parse sent mail messages to extract Flickr and Wikipedia URLs
- Match Commons uploads with sent mail by normalized Flickr URL
- Cache Commons API thumbnail responses and sent mail index
- Handle Wikidata item URLs (Q-numbers) with correct links
- Add update_flickr_uploads.py script to find uploads from UploadWizard
  contributions by checking Commons API metadata

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-06 10:43:45 +00:00

174 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
For contributions with comment 'User created page with UploadWizard', queries the
Commons API to check if the image source is Flickr (by checking the Credit field).
"""
import json
import re
import time
from pathlib import Path
import requests
# Cached list of Commons contributions that main() reads.
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
# Accumulated list of Flickr-sourced uploads that main() reads and rewrites.
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
# MediaWiki API endpoint for Wikimedia Commons.
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
# Identifying User-Agent sent with every API request (Wikimedia API etiquette).
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
def extract_flickr_url_from_credit(credit: str) -> str | None:
"""Extract Flickr URL from the Credit field HTML."""
pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/"\s<>]+/\d+'
match = re.search(pattern, credit)
return match.group(0) if match else None
def get_image_metadata(titles: list[str]) -> dict[str, dict]:
    """Query the Commons API for extended metadata on the given File: pages.

    Returns a mapping from page title to a dict with "credit" and "artist"
    HTML strings (empty strings when a field is absent).  On any HTTP or
    JSON error the error is printed and an empty dict is returned.
    """
    if not titles:
        return {}

    # The Commons API accepts at most 50 titles per request; callers are
    # expected to batch accordingly.
    query = {
        "action": "query",
        "titles": "|".join(titles),
        "prop": "imageinfo",
        "iiprop": "extmetadata",
        "format": "json",
    }
    try:
        reply = requests.get(
            COMMONS_API,
            params=query,
            headers={"User-Agent": USER_AGENT},
            timeout=30,
        )
        reply.raise_for_status()
        payload = reply.json()
    except (requests.RequestException, json.JSONDecodeError) as e:
        print(f"API error: {e}")
        return {}

    # NOTE(review): the API may normalize requested titles (e.g. "_" -> " ");
    # results are keyed by the titles as returned — assumes callers already
    # pass normalized titles. TODO confirm against the contributions dump.
    metadata: dict[str, dict] = {}
    for page in payload.get("query", {}).get("pages", {}).values():
        info = page.get("imageinfo", [])
        if not info:
            continue
        ext = info[0].get("extmetadata", {})
        metadata[page.get("title", "")] = {
            "credit": ext.get("Credit", {}).get("value", ""),
            "artist": ext.get("Artist", {}).get("value", ""),
        }
    return metadata
def clean_artist_name(artist_html: str) -> str:
    """Return the plain-text artist name from an Artist field HTML snippet.

    Strips all HTML tags, then collapses runs of whitespace to single
    spaces and trims the ends.
    """
    without_tags = re.sub(r"<[^>]+>", "", artist_html)
    return " ".join(without_tags.split())
def _normalize_flickr_url(url: str) -> str:
    """Canonical form of a Flickr URL for duplicate detection.

    Strips the scheme, a leading "www.", and any trailing slash so that
    http/https and www/non-www variants of the same photo compare equal.
    """
    # removeprefix (rather than str.replace, which the previous version
    # used) avoids mangling URLs that contain "www." or a scheme string
    # somewhere other than the start.
    url = url.removeprefix("https://").removeprefix("http://")
    return url.removeprefix("www.").rstrip("/")


def main() -> None:
    """Scan Commons contributions for UploadWizard uploads sourced from Flickr.

    Reads CONTRIBUTIONS_FILE, checks each "User created page with
    UploadWizard" File: page against the Commons API, and appends every
    upload whose Credit field contains a not-yet-recorded Flickr photo URL
    to FLICKR_UPLOADS_FILE, keeping the file sorted newest first.
    """
    # Load the cached contribution list.
    print("Loading contributions...")
    with open(CONTRIBUTIONS_FILE) as f:
        data = json.load(f)
    contributions = data.get("contributions", [])

    # Load previously recorded uploads; track both raw and normalized URLs
    # so either form is recognised as a duplicate.
    existing_flickr_urls: set[str] = set()
    existing_uploads: list[dict] = []
    if FLICKR_UPLOADS_FILE.exists():
        with open(FLICKR_UPLOADS_FILE) as f:
            existing_uploads = json.load(f)
        existing_flickr_urls = {u.get("flickr_url", "") for u in existing_uploads}
        existing_flickr_urls.update(
            _normalize_flickr_url(u.get("flickr_url", "")) for u in existing_uploads
        )
    print(f"Existing uploads: {len(existing_uploads)}")
    print(f"Existing flickr URLs: {len(existing_flickr_urls)}")

    # Only page creations made via UploadWizard on File: pages are candidates.
    upload_wizard_contributions = [
        c
        for c in contributions
        if c.get("comment", "") == "User created page with UploadWizard"
        and c.get("title", "").startswith("File:")
    ]
    print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")

    # Query the Commons API in batches of 50 (its per-request title limit).
    new_uploads = []
    batch_size = 50
    total_batches = (len(upload_wizard_contributions) + batch_size - 1) // batch_size
    for i in range(0, len(upload_wizard_contributions), batch_size):
        batch = upload_wizard_contributions[i : i + batch_size]
        titles = [c["title"] for c in batch]
        print(f"Processing batch {i // batch_size + 1}/{total_batches}...")
        metadata = get_image_metadata(titles)
        for c in batch:
            title = c["title"]
            meta = metadata.get(title, {})
            flickr_url = extract_flickr_url_from_credit(meta.get("credit", ""))
            if not flickr_url:
                continue
            # Skip uploads already recorded (in either URL form).
            normalized = _normalize_flickr_url(flickr_url)
            if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
                continue
            artist = meta.get("artist", "")
            new_uploads.append(
                {
                    "pageid": c["pageid"],
                    "revid": c["revid"],
                    "title": title,
                    "timestamp": c["timestamp"],
                    "flickr_url": flickr_url,
                    "creator": clean_artist_name(artist) if artist else None,
                }
            )
            existing_flickr_urls.add(normalized)
            print(f" Found: {title[:50]} -> {flickr_url}")
        # Be polite to the API: pause between batches (but not after the last).
        if i + batch_size < len(upload_wizard_contributions):
            time.sleep(0.5)

    print(f"\nFound {len(new_uploads)} new Flickr uploads")
    if new_uploads:
        # Merge and keep the saved list sorted by timestamp, newest first.
        all_uploads = existing_uploads + new_uploads
        all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
        with open(FLICKR_UPLOADS_FILE, "w") as f:
            json.dump(all_uploads, f, indent=2)
        print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")


if __name__ == "__main__":
    main()