Display recent Wikimedia Commons uploads on the home page, filtered to only show images that were obtained by contacting creators via Flickr mail. Each upload shows: - Thumbnail linking to Commons - Creator name linking to their Flickr profile - Link to the illustrated Wikipedia article (or Wikidata item) Features: - Parse sent mail messages to extract Flickr and Wikipedia URLs - Match Commons uploads with sent mail by normalized Flickr URL - Cache Commons API thumbnail responses and sent mail index - Handle Wikidata item URLs (Q-numbers) with correct links - Add update_flickr_uploads.py script to find uploads from UploadWizard contributions by checking Commons API metadata Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
174 lines
5.8 KiB
Python
174 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
|
|
|
|
For contributions with comment 'User created page with UploadWizard', queries the
|
|
Commons API to check if the image source is Flickr (by checking the Credit field).
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
# Input: full list of the user's Commons contributions (JSON, produced upstream).
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
# Output: uploads confirmed to originate from Flickr; this script appends to it.
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
# MediaWiki API endpoint for Wikimedia Commons.
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
# Identifies this tool to the Commons API, per the Wikimedia User-Agent policy.
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
|
|
|
|
|
def extract_flickr_url_from_credit(credit: str) -> str | None:
|
|
"""Extract Flickr URL from the Credit field HTML."""
|
|
pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/"\s<>]+/\d+'
|
|
match = re.search(pattern, credit)
|
|
return match.group(0) if match else None
|
|
|
|
|
|
def get_image_metadata(titles: list[str]) -> dict[str, dict]:
|
|
"""Fetch image metadata from Commons API for multiple titles."""
|
|
if not titles:
|
|
return {}
|
|
|
|
# Commons API allows up to 50 titles per request
|
|
params = {
|
|
"action": "query",
|
|
"titles": "|".join(titles),
|
|
"prop": "imageinfo",
|
|
"iiprop": "extmetadata",
|
|
"format": "json",
|
|
}
|
|
|
|
headers = {"User-Agent": USER_AGENT}
|
|
|
|
try:
|
|
response = requests.get(COMMONS_API, params=params, headers=headers, timeout=30)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
except (requests.RequestException, json.JSONDecodeError) as e:
|
|
print(f"API error: {e}")
|
|
return {}
|
|
|
|
results = {}
|
|
pages = data.get("query", {}).get("pages", {})
|
|
for page in pages.values():
|
|
title = page.get("title", "")
|
|
imageinfo = page.get("imageinfo", [])
|
|
if imageinfo:
|
|
extmeta = imageinfo[0].get("extmetadata", {})
|
|
results[title] = {
|
|
"credit": extmeta.get("Credit", {}).get("value", ""),
|
|
"artist": extmeta.get("Artist", {}).get("value", ""),
|
|
}
|
|
|
|
return results
|
|
|
|
|
|
def clean_artist_name(artist_html: str) -> str:
    """Return the artist name as a single-line plain-text string.

    The Commons ``Artist`` field is an HTML fragment (often a link to the
    uploader's profile page); strip the markup and collapse the whitespace
    runs the removed tags leave behind.
    """
    without_tags = re.sub(r"<[^>]+>", "", artist_html)
    return " ".join(without_tags.split())
|
|
|
|
|
|
def _normalize_flickr_url(url: str) -> str:
    """Normalize a Flickr URL for duplicate detection.

    Drops the scheme, a leading "www." and any trailing slash so that
    otherwise-identical URLs compare equal.  Uses prefix stripping rather
    than a blanket ``replace()`` so a "www." occurring later in the URL
    (e.g. inside a username) is left intact.
    """
    url = url.removeprefix("https://").removeprefix("http://")
    return url.removeprefix("www.").rstrip("/")


def _load_existing_uploads() -> tuple[list[dict], set[str]]:
    """Load flickr_uploads.json, returning (uploads, known URL forms).

    The set contains both the raw and the normalized form of every stored
    flickr_url, so lookups succeed regardless of how a URL was recorded.
    Returns an empty list/set when the file does not exist yet.
    """
    if not FLICKR_UPLOADS_FILE.exists():
        return [], set()
    with open(FLICKR_UPLOADS_FILE) as f:
        uploads = json.load(f)
    known: set[str] = set()
    for upload in uploads:
        url = upload.get("flickr_url", "")
        known.add(url)
        known.add(_normalize_flickr_url(url))
    return uploads, known


def _uploadwizard_file_pages(contributions: list[dict]) -> list[dict]:
    """Return contributions that created a File: page via UploadWizard."""
    return [
        c
        for c in contributions
        if c.get("comment", "") == "User created page with UploadWizard"
        and c.get("title", "").startswith("File:")
    ]


def main():
    """Find Flickr-sourced UploadWizard uploads and add them to flickr_uploads.json."""
    print("Loading contributions...")
    with open(CONTRIBUTIONS_FILE) as f:
        data = json.load(f)
    contributions = data.get("contributions", [])

    existing_uploads, existing_flickr_urls = _load_existing_uploads()
    print(f"Existing uploads: {len(existing_uploads)}")
    print(f"Existing flickr URLs: {len(existing_flickr_urls)}")

    upload_wizard_contributions = _uploadwizard_file_pages(contributions)
    print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")

    # Query the Commons API in batches of 50 (its per-request title limit).
    new_uploads = []
    batch_size = 50
    total_batches = (len(upload_wizard_contributions) + batch_size - 1) // batch_size

    for i in range(0, len(upload_wizard_contributions), batch_size):
        batch = upload_wizard_contributions[i : i + batch_size]
        titles = [c["title"] for c in batch]

        print(f"Processing batch {i // batch_size + 1}/{total_batches}...")

        metadata = get_image_metadata(titles)

        for c in batch:
            title = c["title"]
            meta = metadata.get(title, {})

            flickr_url = extract_flickr_url_from_credit(meta.get("credit", ""))
            if not flickr_url:
                continue

            # Skip URLs we have already recorded, in either stored form.
            normalized = _normalize_flickr_url(flickr_url)
            if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
                continue

            artist = meta.get("artist", "")
            new_uploads.append(
                {
                    "pageid": c["pageid"],
                    "revid": c["revid"],
                    "title": title,
                    "timestamp": c["timestamp"],
                    "flickr_url": flickr_url,
                    "creator": clean_artist_name(artist) if artist else None,
                }
            )
            existing_flickr_urls.add(normalized)
            print(f" Found: {title[:50]} -> {flickr_url}")

        # Rate limiting: pause between batches, but not after the last one.
        if i + batch_size < len(upload_wizard_contributions):
            time.sleep(0.5)

    print(f"\nFound {len(new_uploads)} new Flickr uploads")

    if new_uploads:
        # Merge with existing records, sort newest first, and write back.
        all_uploads = existing_uploads + new_uploads
        all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)

        with open(FLICKR_UPLOADS_FILE, "w") as f:
            json.dump(all_uploads, f, indent=2)

        print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")
|
|
|
|
|
|
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
|