flickr-mail/update_flickr_uploads.py
Edward Betts a2d29d7937 Show recent Commons uploads obtained via Flickr mail
Display recent Wikimedia Commons uploads on the home page, filtered to
only show images that were obtained by contacting creators via Flickr
mail. Each upload shows:
- Thumbnail linking to Commons
- Creator name linking to their Flickr profile
- Link to the illustrated Wikipedia article (or Wikidata item)

Features:
- Parse sent mail messages to extract Flickr and Wikipedia URLs
- Match Commons uploads with sent mail by normalized Flickr URL
- Cache Commons API thumbnail responses and sent mail index
- Handle Wikidata item URLs (Q-numbers) with correct links
- Add update_flickr_uploads.py script to find uploads from UploadWizard
  contributions by checking Commons API metadata

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-06 10:43:45 +00:00

174 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
For contributions with comment 'User created page with UploadWizard', queries the
Commons API to check if the image source is Flickr (by checking the Credit field).
"""
import json
import re
import time
from pathlib import Path
import requests
# Cached list of Commons contributions that main() reads.
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
# Accumulated list of Flickr-sourced uploads that main() reads and rewrites.
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
# MediaWiki API endpoint for Wikimedia Commons.
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
# Identifying User-Agent sent with every API request (Wikimedia API etiquette).
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
def extract_flickr_url_from_credit(credit: str) -> str | None:
"""Extract Flickr URL from the Credit field HTML."""
pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/"\s<>]+/\d+'
match = re.search(pattern, credit)
return match.group(0) if match else None
def get_image_metadata(titles: list[str]) -> dict[str, dict]:
    """Query the Commons API for extended metadata on the given File: pages.

    Returns a mapping from page title to a dict with "credit" and "artist"
    HTML strings (empty strings when a field is absent).  On any HTTP or
    JSON error the error is printed and an empty dict is returned.
    """
    if not titles:
        return {}

    # The Commons API accepts at most 50 titles per request; callers are
    # expected to batch accordingly.
    query = {
        "action": "query",
        "titles": "|".join(titles),
        "prop": "imageinfo",
        "iiprop": "extmetadata",
        "format": "json",
    }
    try:
        reply = requests.get(
            COMMONS_API,
            params=query,
            headers={"User-Agent": USER_AGENT},
            timeout=30,
        )
        reply.raise_for_status()
        payload = reply.json()
    except (requests.RequestException, json.JSONDecodeError) as e:
        print(f"API error: {e}")
        return {}

    # NOTE(review): the API may normalize requested titles (e.g. "_" -> " ");
    # results are keyed by the titles as returned — assumes callers already
    # pass normalized titles. TODO confirm against the contributions dump.
    metadata: dict[str, dict] = {}
    for page in payload.get("query", {}).get("pages", {}).values():
        info = page.get("imageinfo", [])
        if not info:
            continue
        ext = info[0].get("extmetadata", {})
        metadata[page.get("title", "")] = {
            "credit": ext.get("Credit", {}).get("value", ""),
            "artist": ext.get("Artist", {}).get("value", ""),
        }
    return metadata
def clean_artist_name(artist_html: str) -> str:
    """Return the plain-text artist name from an Artist field HTML snippet.

    Strips all HTML tags, then collapses runs of whitespace to single
    spaces and trims the ends.
    """
    without_tags = re.sub(r"<[^>]+>", "", artist_html)
    return " ".join(without_tags.split())
def _normalize_flickr_url(url: str) -> str:
    """Canonical form of a Flickr URL for duplicate detection.

    Strips the scheme, a leading "www.", and any trailing slash so that
    http/https and www/non-www variants of the same photo compare equal.
    """
    # removeprefix (rather than str.replace, which the previous version
    # used) avoids mangling URLs that contain "www." or a scheme string
    # somewhere other than the start.
    url = url.removeprefix("https://").removeprefix("http://")
    return url.removeprefix("www.").rstrip("/")


def main() -> None:
    """Scan Commons contributions for UploadWizard uploads sourced from Flickr.

    Reads CONTRIBUTIONS_FILE, checks each "User created page with
    UploadWizard" File: page against the Commons API, and appends every
    upload whose Credit field contains a not-yet-recorded Flickr photo URL
    to FLICKR_UPLOADS_FILE, keeping the file sorted newest first.
    """
    # Load the cached contribution list.
    print("Loading contributions...")
    with open(CONTRIBUTIONS_FILE) as f:
        data = json.load(f)
    contributions = data.get("contributions", [])

    # Load previously recorded uploads; track both raw and normalized URLs
    # so either form is recognised as a duplicate.
    existing_flickr_urls: set[str] = set()
    existing_uploads: list[dict] = []
    if FLICKR_UPLOADS_FILE.exists():
        with open(FLICKR_UPLOADS_FILE) as f:
            existing_uploads = json.load(f)
        existing_flickr_urls = {u.get("flickr_url", "") for u in existing_uploads}
        existing_flickr_urls.update(
            _normalize_flickr_url(u.get("flickr_url", "")) for u in existing_uploads
        )
    print(f"Existing uploads: {len(existing_uploads)}")
    print(f"Existing flickr URLs: {len(existing_flickr_urls)}")

    # Only page creations made via UploadWizard on File: pages are candidates.
    upload_wizard_contributions = [
        c
        for c in contributions
        if c.get("comment", "") == "User created page with UploadWizard"
        and c.get("title", "").startswith("File:")
    ]
    print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")

    # Query the Commons API in batches of 50 (its per-request title limit).
    new_uploads = []
    batch_size = 50
    total_batches = (len(upload_wizard_contributions) + batch_size - 1) // batch_size
    for i in range(0, len(upload_wizard_contributions), batch_size):
        batch = upload_wizard_contributions[i : i + batch_size]
        titles = [c["title"] for c in batch]
        print(f"Processing batch {i // batch_size + 1}/{total_batches}...")
        metadata = get_image_metadata(titles)
        for c in batch:
            title = c["title"]
            meta = metadata.get(title, {})
            flickr_url = extract_flickr_url_from_credit(meta.get("credit", ""))
            if not flickr_url:
                continue
            # Skip uploads already recorded (in either URL form).
            normalized = _normalize_flickr_url(flickr_url)
            if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
                continue
            artist = meta.get("artist", "")
            new_uploads.append(
                {
                    "pageid": c["pageid"],
                    "revid": c["revid"],
                    "title": title,
                    "timestamp": c["timestamp"],
                    "flickr_url": flickr_url,
                    "creator": clean_artist_name(artist) if artist else None,
                }
            )
            existing_flickr_urls.add(normalized)
            print(f" Found: {title[:50]} -> {flickr_url}")
        # Be polite to the API: pause between batches (but not after the last).
        if i + batch_size < len(upload_wizard_contributions):
            time.sleep(0.5)

    print(f"\nFound {len(new_uploads)} new Flickr uploads")
    if new_uploads:
        # Merge and keep the saved list sorted by timestamp, newest first.
        all_uploads = existing_uploads + new_uploads
        all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
        with open(FLICKR_UPLOADS_FILE, "w") as f:
            json.dump(all_uploads, f, indent=2)
        print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")


if __name__ == "__main__":
    main()