diff --git a/.gitignore b/.gitignore index 0c4323f..abfca28 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .mypy_cache __pycache__ +commons_contributions/thumbnail_cache.json +commons_contributions/sent_mail_index.json diff --git a/AGENTS.md b/AGENTS.md index ccd930b..6fe3cca 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -66,6 +66,34 @@ static image servers: Converts a Flickr username/path alias to the NSID (internal user ID) needed for the Flickr mail URL. Scrapes the user's profile page for embedded params. +### Commons Uploads Display + +Shows recent Wikimedia Commons uploads on the home page, filtered to only +those obtained via Flickr mail requests. + +**Data files** (in `commons_contributions/`): +- `flickr_uploads.json`: List of Commons uploads from Flickr with metadata +- `thumbnail_cache.json`: Cached Commons API thumbnail URLs (7-day TTL) +- `sent_mail_index.json`: Index of sent mail messages (flickr_url → wikipedia_url) + +**Key functions**: +- `build_sent_mail_index()`: Parses sent mail JSON files, extracts Flickr and + Wikipedia URLs from message bodies, caches the index +- `get_recent_commons_uploads()`: Loads uploads, filters by sent mail match, + fetches thumbnails from Commons API +- `normalize_flickr_url()`: Normalizes URLs for matching (removes protocol, www, trailing slash) + +**CommonsUpload dataclass**: +- `title`, `thumb_url`, `commons_url`, `flickr_url`, `creator`, `timestamp` +- `wikipedia_url`, `creator_profile_url`: Extracted from sent mail +- `is_wikidata_item` property: Detects Q-number URLs +- `wiki_link_url`, `wiki_link_label`: Handles Wikidata vs Wikipedia links + +**Maintenance script** (`update_flickr_uploads.py`): +Run to find Flickr uploads from UploadWizard contributions that don't have +the Flickr URL in the edit comment. Queries Commons API for image metadata +and checks the Credit field for Flickr URLs. + ## Request Flow 1. 
User enters Wikipedia article title/URL → `start()` extracts article name diff --git a/README.md b/README.md index 85f1cde..1df6a66 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,9 @@ photographers on Flickr whose photos can be used to enhance Wikipedia articles. - **One-click message composition**: Click any photo to compose a permission request message with the photo displayed alongside. - **Pagination**: Browse through thousands of search results with page navigation. +- **Recent uploads showcase**: The home page displays recent Wikimedia Commons + uploads that were obtained via Flickr mail requests, with links to the + Wikipedia article and photographer's Flickr profile. - Generate messages to request permission to use photos on Wikipedia. - Handle exceptions gracefully and provide detailed error information. diff --git a/main.py b/main.py index d8a8496..5d223cf 100755 --- a/main.py +++ b/main.py @@ -6,8 +6,10 @@ import dataclasses import inspect import json import sys +import time import traceback import typing +from pathlib import Path from urllib.parse import quote, unquote import flask @@ -16,11 +18,33 @@ import werkzeug from werkzeug.debug.tbtools import DebugTraceback +import re + + app = flask.Flask(__name__) app.debug = False enwiki = "en.wikipedia.org/wiki/" +# Path to Commons contributions data and sent mail +COMMONS_UPLOADS_FILE = ( + Path(__file__).parent / "commons_contributions" / "flickr_uploads.json" +) +COMMONS_CACHE_FILE = ( + Path(__file__).parent / "commons_contributions" / "thumbnail_cache.json" +) +SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages" +SENT_MAIL_INDEX_CACHE = ( + Path(__file__).parent / "commons_contributions" / "sent_mail_index.json" +) +COMMONS_CACHE_MAX_AGE = 86400 * 7 # Cache for 7 days +RECENT_UPLOADS_COUNT = 24 + +# User agent for Commons API requests +COMMONS_USER_AGENT = ( + "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)" +) + # Browser-like headers for Flickr requests 
@dataclasses.dataclass
class CommonsUpload:
    """A recent Wikimedia Commons upload shown on the home page.

    ``wikipedia_url`` and ``creator_profile_url`` come from the sent-mail
    index and may be empty when no matching sent message was found.
    """

    # Display title: "File:" prefix and file extension already stripped.
    title: str
    # 150px-wide thumbnail URL from the Commons API.
    thumb_url: str
    # Full URL of the file description page on Commons.
    commons_url: str
    # Original Flickr photo URL.
    flickr_url: str
    # Photographer name ("Unknown" when the source JSON has none).
    creator: str
    # Upload date, truncated to YYYY-MM-DD.
    timestamp: str
    wikipedia_url: str = ""
    creator_profile_url: str = ""

    @property
    def is_wikidata_item(self) -> bool:
        """True when wikipedia_url actually points at a Wikidata Q item."""
        if not self.wikipedia_url:
            return False
        # A Q-number at the end of the URL (e.g. .../wiki/Q42) marks a
        # Wikidata item rather than an article title.
        return bool(re.search(r"/Q\d+$", self.wikipedia_url))

    @property
    def wiki_link_url(self) -> str:
        """URL to link to: Wikidata for Q items, otherwise the article URL."""
        if not self.wikipedia_url:
            return ""
        if self.is_wikidata_item:
            # Extract the Q-id and build the canonical Wikidata URL.
            match = re.search(r"(Q\d+)$", self.wikipedia_url)
            if match:
                return f"https://www.wikidata.org/wiki/{match.group(1)}"
        return self.wikipedia_url

    @property
    def wiki_link_label(self) -> str:
        """Human-readable label matching wiki_link_url."""
        return "Wikidata item" if self.is_wikidata_item else "Wikipedia article"


def normalize_flickr_url(url: str) -> str:
    """Normalize a Flickr photo URL for comparison.

    Strips the scheme, a leading "www." and any trailing slash.  Returns ""
    for URLs that are not on flickr.com.
    """
    # removeprefix only touches the start of the string; a str.replace-based
    # version would also mangle "www." or "http://" occurring later in the
    # path (e.g. flickr.com/photos/www.foo/123).
    url = url.removeprefix("https://").removeprefix("http://")
    url = url.removeprefix("www.")
    url = url.rstrip("/")
    return url if url.startswith("flickr.com") else ""


def extract_urls_from_message(body: str) -> tuple[str, str]:
    """Extract the first Flickr photo URL and Wikipedia URL from a message body.

    Either element of the returned (flickr_url, wikipedia_url) tuple is ""
    when no match is found; matches without a scheme get "https://" prepended.
    """
    flickr_url = ""
    wikipedia_url = ""

    # Flickr photo page URLs: flickr.com/photos/<user>/<photo-id>
    flickr_pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/[^/\s]+/\d+"
    flickr_matches = re.findall(flickr_pattern, body)
    if flickr_matches:
        flickr_url = flickr_matches[0]
        if not flickr_url.startswith("http"):
            flickr_url = "https://" + flickr_url

    # English Wikipedia article URLs; stop at whitespace or closing markup.
    wiki_pattern = r"(?:https?://)?(?:www\.)?en\.wikipedia\.org/wiki/[^\s<\])]+"
    wiki_matches = re.findall(wiki_pattern, body)
    if wiki_matches:
        wikipedia_url = wiki_matches[0]
        if not wikipedia_url.startswith("http"):
            wikipedia_url = "https://" + wikipedia_url

    return flickr_url, wikipedia_url


def _creator_profile_from_photo_url(flickr_url: str) -> str:
    """Derive the photographer's profile URL from a photo URL.

    flickr.com/photos/<username>/<id> -> https://www.flickr.com/photos/<username>
    Returns "" when no username segment can be found.
    """
    parts = flickr_url.split("/")
    for i, part in enumerate(parts):
        if part == "photos" and i + 1 < len(parts):
            return f"https://www.flickr.com/photos/{parts[i + 1]}"
    return ""


def build_sent_mail_index() -> dict[str, dict[str, str]]:
    """Build an index: normalized_flickr_url -> {wikipedia_url, creator_profile_url, recipient}.

    Parses the sent-mail JSON files, skipping replies (subjects starting
    "Re:") so only original requests are indexed.  The result is cached in
    SENT_MAIL_INDEX_CACHE; the cache is considered valid while the number of
    message files is unchanged.
    """
    if not SENT_MAIL_DIR.exists():
        return {}

    json_files = list(SENT_MAIL_DIR.glob("*.json"))

    # Reuse the cached index when the message-file count matches; a new
    # message changes the count and forces a rebuild.
    if SENT_MAIL_INDEX_CACHE.exists():
        try:
            with open(SENT_MAIL_INDEX_CACHE) as f:
                cache = json.load(f)
            if cache.get("file_count") == len(json_files):
                return cache.get("index", {})
        except (json.JSONDecodeError, OSError):
            pass  # unreadable cache: rebuild below

    index: dict[str, dict[str, str]] = {}
    for json_file in json_files:
        try:
            with open(json_file) as f:
                message = json.load(f)
        except (json.JSONDecodeError, OSError):
            continue

        # Skip replies - we want original requests
        if message.get("subject", "").startswith("Re:"):
            continue

        flickr_url, wikipedia_url = extract_urls_from_message(message.get("body", ""))
        if not flickr_url:
            continue

        normalized = normalize_flickr_url(flickr_url)
        if not normalized:
            continue

        index[normalized] = {
            "wikipedia_url": wikipedia_url,
            "creator_profile_url": _creator_profile_from_photo_url(flickr_url),
            "recipient": message.get("recipient", ""),
        }

    # Best-effort cache write; failing to cache is not an error.
    try:
        with open(SENT_MAIL_INDEX_CACHE, "w") as f:
            json.dump({"file_count": len(json_files), "index": index}, f)
    except OSError:
        pass

    return index


def load_commons_thumbnail_cache() -> dict[str, typing.Any]:
    """Load the thumbnail cache from disk.

    Returns an empty cache ({"timestamp": 0, "thumbnails": {}}) when the
    file is missing or unreadable, so callers never have to special-case it.
    """
    if not COMMONS_CACHE_FILE.exists():
        return {"timestamp": 0, "thumbnails": {}}
    try:
        with open(COMMONS_CACHE_FILE) as f:
            return typing.cast(dict[str, typing.Any], json.load(f))
    except (json.JSONDecodeError, OSError):
        return {"timestamp": 0, "thumbnails": {}}


def save_commons_thumbnail_cache(cache: dict[str, typing.Any]) -> None:
    """Save the thumbnail cache to disk (best effort; write errors ignored)."""
    try:
        with open(COMMONS_CACHE_FILE, "w") as f:
            json.dump(cache, f)
    except OSError:
        pass  # Ignore cache write errors


def fetch_commons_thumbnails(titles: list[str]) -> dict[str, str]:
    """Fetch 150px thumbnail URLs from the Commons API for the given file titles.

    Returns a mapping of page title -> thumbnail URL.  Titles are requested
    in batches of 50 — the Commons API maximum per request — and a failed
    batch is skipped rather than aborting the whole fetch (best effort).
    """
    thumbnails: dict[str, str] = {}
    headers = {"User-Agent": COMMONS_USER_AGENT}

    for start in range(0, len(titles), 50):
        batch = titles[start : start + 50]
        params = {
            "action": "query",
            "titles": "|".join(batch),
            "prop": "imageinfo",
            "iiprop": "url",
            "iiurlwidth": 150,
            "format": "json",
        }

        try:
            response = requests.get(
                "https://commons.wikimedia.org/w/api.php",
                params=params,
                headers=headers,
                timeout=10,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, json.JSONDecodeError):
            continue

        for page in data.get("query", {}).get("pages", {}).values():
            imageinfo = page.get("imageinfo", [])
            if imageinfo:
                thumb_url = imageinfo[0].get("thumburl", "")
                if thumb_url:
                    thumbnails[page.get("title", "")] = thumb_url

    return thumbnails


def get_recent_commons_uploads() -> list[CommonsUpload]:
    """Get recent Commons uploads with thumbnails, filtered to those contacted via Flickr mail."""
    if not COMMONS_UPLOADS_FILE.exists():
        return []

    try:
        with open(COMMONS_UPLOADS_FILE) as f:
            all_uploads = json.load(f)
    except (json.JSONDecodeError, OSError):
        return []

    sent_mail_index = build_sent_mail_index()

    # Keep only uploads whose Flickr URL matches a sent-mail request, up to
    # RECENT_UPLOADS_COUNT of them (the uploads file is newest-first).
    uploads_with_mail: list[dict[str, typing.Any]] = []
    for upload in all_uploads:
        normalized = normalize_flickr_url(upload.get("flickr_url", ""))
        if not normalized or normalized not in sent_mail_index:
            continue
        upload["_mail_info"] = sent_mail_index[normalized]
        uploads_with_mail.append(upload)
        if len(uploads_with_mail) >= RECENT_UPLOADS_COUNT:
            break

    if not uploads_with_mail:
        return []

    cache = load_commons_thumbnail_cache()
    cache_age = time.time() - cache.get("timestamp", 0)
    cached_thumbs = cache.get("thumbnails", {})

    titles = [u["title"] for u in uploads_with_mail]
    titles_to_fetch = [t for t in titles if t not in cached_thumbs]

    # Refetch everything when the cache is past its TTL, otherwise only the
    # titles missing from it.
    if titles_to_fetch or cache_age > COMMONS_CACHE_MAX_AGE:
        stale = cache_age > COMMONS_CACHE_MAX_AGE
        cached_thumbs.update(fetch_commons_thumbnails(titles if stale else titles_to_fetch))
        save_commons_thumbnail_cache(
            {"timestamp": time.time(), "thumbnails": cached_thumbs}
        )

    result: list[CommonsUpload] = []
    for upload in uploads_with_mail:
        title = upload["title"]
        thumb_url = cached_thumbs.get(title, "")
        if not thumb_url:
            # No thumbnail available (API failure or deleted file): skip.
            continue

        mail_info = upload.get("_mail_info", {})

        result.append(
            CommonsUpload(
                # removeprefix (unlike str.replace) cannot corrupt titles
                # containing "File:" later in the name; also drop the extension.
                title=title.removeprefix("File:").rsplit(".", 1)[0],
                thumb_url=thumb_url,
                commons_url=f"https://commons.wikimedia.org/wiki/{title.replace(' ', '_')}",
                flickr_url=upload.get("flickr_url", ""),
                creator=upload.get("creator") or "Unknown",
                # "or ''" guards against an explicit null timestamp in the JSON.
                timestamp=(upload.get("timestamp") or "")[:10],
                wikipedia_url=mail_info.get("wikipedia_url", ""),
                creator_profile_url=mail_info.get("creator_profile_url", ""),
            )
        )

    return result
"title" diff --git a/templates/combined.html b/templates/combined.html index 5461c8d..c7f026a 100644 --- a/templates/combined.html +++ b/templates/combined.html @@ -15,6 +15,47 @@ + {% if recent_uploads is defined and recent_uploads and not name %} +
+
Recent uploads to Wikimedia Commons
+

Photos obtained via Flickr mail requests

+
+ {% for upload in recent_uploads %} +
+
+
+
+ + {{ upload.title }} + +
+
+
+

+ {{ upload.title }} +

+

+ {% if upload.creator_profile_url %} + {{ upload.creator }} + {% else %} + {{ upload.creator }} + {% endif %} +

+ {% if upload.wikipedia_url %} +

+ {{ upload.wiki_link_label }} +

+ {% endif %} +
+
+
+
+
+ {% endfor %} +
+
+ {% endif %} + {% if name and search_result is defined and search_result.photos %}

Wikipedia article: {{ name }}

diff --git a/update_flickr_uploads.py b/update_flickr_uploads.py new file mode 100644 index 0000000..d140b3c --- /dev/null +++ b/update_flickr_uploads.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json. + +For contributions with comment 'User created page with UploadWizard', queries the +Commons API to check if the image source is Flickr (by checking the Credit field). +""" + +import json +import re +import time +from pathlib import Path + +import requests + +CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json") +FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json") +COMMONS_API = "https://commons.wikimedia.org/w/api.php" +USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)" + + +def extract_flickr_url_from_credit(credit: str) -> str | None: + """Extract Flickr URL from the Credit field HTML.""" + pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/"\s<>]+/\d+' + match = re.search(pattern, credit) + return match.group(0) if match else None + + +def get_image_metadata(titles: list[str]) -> dict[str, dict]: + """Fetch image metadata from Commons API for multiple titles.""" + if not titles: + return {} + + # Commons API allows up to 50 titles per request + params = { + "action": "query", + "titles": "|".join(titles), + "prop": "imageinfo", + "iiprop": "extmetadata", + "format": "json", + } + + headers = {"User-Agent": USER_AGENT} + + try: + response = requests.get(COMMONS_API, params=params, headers=headers, timeout=30) + response.raise_for_status() + data = response.json() + except (requests.RequestException, json.JSONDecodeError) as e: + print(f"API error: {e}") + return {} + + results = {} + pages = data.get("query", {}).get("pages", {}) + for page in pages.values(): + title = page.get("title", "") + imageinfo = page.get("imageinfo", []) + if imageinfo: + extmeta = imageinfo[0].get("extmetadata", {}) + 
results[title] = { + "credit": extmeta.get("Credit", {}).get("value", ""), + "artist": extmeta.get("Artist", {}).get("value", ""), + } + + return results + + +def clean_artist_name(artist_html: str) -> str: + """Extract plain text artist name from HTML.""" + # Remove HTML tags + text = re.sub(r"<[^>]+>", "", artist_html) + # Clean up whitespace + text = " ".join(text.split()) + return text + + +def main(): + # Load contributions + print("Loading contributions...") + with open(CONTRIBUTIONS_FILE) as f: + data = json.load(f) + + contributions = data.get("contributions", []) + + # Load existing flickr uploads + existing_flickr_urls = set() + existing_uploads = [] + if FLICKR_UPLOADS_FILE.exists(): + with open(FLICKR_UPLOADS_FILE) as f: + existing_uploads = json.load(f) + existing_flickr_urls = {u.get("flickr_url", "") for u in existing_uploads} + # Also normalize existing URLs for comparison + for u in existing_uploads: + url = u.get("flickr_url", "") + normalized = url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/") + existing_flickr_urls.add(normalized) + + print(f"Existing uploads: {len(existing_uploads)}") + print(f"Existing flickr URLs: {len(existing_flickr_urls)}") + + # Find UploadWizard contributions (page creations only) + upload_wizard_contributions = [] + for c in contributions: + comment = c.get("comment", "") + if comment == "User created page with UploadWizard": + # Only include if it's a File: page + title = c.get("title", "") + if title.startswith("File:"): + upload_wizard_contributions.append(c) + + print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}") + + # Process in batches of 50 + new_uploads = [] + batch_size = 50 + + for i in range(0, len(upload_wizard_contributions), batch_size): + batch = upload_wizard_contributions[i : i + batch_size] + titles = [c["title"] for c in batch] + + print(f"Processing batch {i // batch_size + 1}/{(len(upload_wizard_contributions) + batch_size - 1) // 
batch_size}...") + + metadata = get_image_metadata(titles) + + for c in batch: + title = c["title"] + meta = metadata.get(title, {}) + credit = meta.get("credit", "") + artist = meta.get("artist", "") + + flickr_url = extract_flickr_url_from_credit(credit) + if not flickr_url: + continue + + # Check if we already have this URL + normalized = flickr_url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/") + if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls: + continue + + creator = clean_artist_name(artist) if artist else None + + new_upload = { + "pageid": c["pageid"], + "revid": c["revid"], + "title": title, + "timestamp": c["timestamp"], + "flickr_url": flickr_url, + "creator": creator, + } + + new_uploads.append(new_upload) + existing_flickr_urls.add(normalized) + print(f" Found: {title[:50]} -> {flickr_url}") + + # Rate limiting + if i + batch_size < len(upload_wizard_contributions): + time.sleep(0.5) + + print(f"\nFound {len(new_uploads)} new Flickr uploads") + + if new_uploads: + # Merge and sort by timestamp (newest first) + all_uploads = existing_uploads + new_uploads + all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True) + + # Save + with open(FLICKR_UPLOADS_FILE, "w") as f: + json.dump(all_uploads, f, indent=2) + + print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}") + + +if __name__ == "__main__": + main()