Show recent Commons uploads obtained via Flickr mail
Display recent Wikimedia Commons uploads on the home page, filtered to only show images that were obtained by contacting creators via Flickr mail. Each upload shows: - Thumbnail linking to Commons - Creator name linking to their Flickr profile - Link to the illustrated Wikipedia article (or Wikidata item) Features: - Parse sent mail messages to extract Flickr and Wikipedia URLs - Match Commons uploads with sent mail by normalized Flickr URL - Cache Commons API thumbnail responses and sent mail index - Handle Wikidata item URLs (Q-numbers) with correct links - Add update_flickr_uploads.py script to find uploads from UploadWizard contributions by checking Commons API metadata Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
0062de8ede
commit
a2d29d7937
6 changed files with 562 additions and 3 deletions
174
update_flickr_uploads.py
Normal file
174
update_flickr_uploads.py
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
|
||||
|
||||
For contributions with comment 'User created page with UploadWizard', queries the
|
||||
Commons API to check if the image source is Flickr (by checking the Credit field).
|
||||
"""
|
||||
|
||||
import html
import json
import re
import time
from pathlib import Path

import requests
|
||||
|
||||
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
|
||||
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
|
||||
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
||||
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
||||
|
||||
|
||||
def extract_flickr_url_from_credit(credit: str) -> str | None:
|
||||
"""Extract Flickr URL from the Credit field HTML."""
|
||||
pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/"\s<>]+/\d+'
|
||||
match = re.search(pattern, credit)
|
||||
return match.group(0) if match else None
|
||||
|
||||
|
||||
def get_image_metadata(titles: list[str]) -> dict[str, dict]:
    """Fetch extended image metadata from the Commons API.

    Queries the ``imageinfo`` / ``extmetadata`` properties for each title and
    returns a mapping of page title -> ``{"credit": ..., "artist": ...}``
    (both raw HTML strings; either may be empty).  Pages without image info
    are omitted.  On a network or JSON error the error is printed and the
    results gathered so far are returned.
    """
    if not titles:
        return {}

    headers = {"User-Agent": USER_AGENT}
    results: dict[str, dict] = {}

    # The Commons API allows at most 50 titles per request; chunk here so the
    # limit holds regardless of how large a list the caller passes.
    for start in range(0, len(titles), 50):
        chunk = titles[start : start + 50]
        params = {
            "action": "query",
            "titles": "|".join(chunk),
            "prop": "imageinfo",
            "iiprop": "extmetadata",
            "format": "json",
        }

        try:
            response = requests.get(
                COMMONS_API, params=params, headers=headers, timeout=30
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, json.JSONDecodeError) as e:
            print(f"API error: {e}")
            return results

        for page in data.get("query", {}).get("pages", {}).values():
            imageinfo = page.get("imageinfo", [])
            if imageinfo:
                extmeta = imageinfo[0].get("extmetadata", {})
                results[page.get("title", "")] = {
                    "credit": extmeta.get("Credit", {}).get("value", ""),
                    "artist": extmeta.get("Artist", {}).get("value", ""),
                }

    return results
|
||||
|
||||
|
||||
def clean_artist_name(artist_html: str) -> str:
    """Extract a plain-text artist name from a fragment of HTML.

    Strips tags, decodes HTML entities (e.g. ``&amp;`` -> ``&``), and
    collapses runs of whitespace into single spaces.
    """
    # Drop the markup first so entity decoding only sees visible text.
    text = re.sub(r"<[^>]+>", "", artist_html)
    # Commons metadata frequently contains entities like &amp; — decode them
    # so the stored creator name matches what readers see.
    text = html.unescape(text)
    # Collapse all whitespace (including newlines) to single spaces.
    return " ".join(text.split())
|
||||
|
||||
|
||||
def _normalize_flickr_url(url: str) -> str:
    """Normalize a Flickr URL for duplicate detection.

    Strips the scheme, a leading "www." and any trailing slash so that
    http/https and www/non-www spellings of the same photo compare equal.
    Uses prefix-only stripping: a blanket ``replace("www.", "")`` would also
    corrupt usernames or paths that happen to contain "www.".
    """
    bare = url.removeprefix("https://").removeprefix("http://")
    return bare.removeprefix("www.").rstrip("/")


def _load_existing_uploads() -> tuple[list[dict], set[str]]:
    """Load previously saved uploads and the set of known Flickr URLs.

    The returned set contains both the raw and the normalized form of every
    known URL, so later duplicate checks match either spelling.
    """
    uploads: list[dict] = []
    known_urls: set[str] = set()
    if FLICKR_UPLOADS_FILE.exists():
        with open(FLICKR_UPLOADS_FILE) as f:
            uploads = json.load(f)
        for u in uploads:
            url = u.get("flickr_url", "")
            known_urls.add(url)
            known_urls.add(_normalize_flickr_url(url))
    return uploads, known_urls


def _upload_wizard_file_pages(contributions: list[dict]) -> list[dict]:
    """Return the File: page creations that were made via UploadWizard."""
    return [
        c
        for c in contributions
        if c.get("comment", "") == "User created page with UploadWizard"
        and c.get("title", "").startswith("File:")
    ]


def main() -> None:
    """Find Flickr-sourced UploadWizard uploads and append them to the JSON file.

    Reads CONTRIBUTIONS_FILE, checks each UploadWizard page creation against
    the Commons API Credit field, and merges any new Flickr uploads into
    FLICKR_UPLOADS_FILE (sorted newest first).
    """
    print("Loading contributions...")
    with open(CONTRIBUTIONS_FILE) as f:
        data = json.load(f)
    contributions = data.get("contributions", [])

    existing_uploads, existing_flickr_urls = _load_existing_uploads()
    print(f"Existing uploads: {len(existing_uploads)}")
    print(f"Existing flickr URLs: {len(existing_flickr_urls)}")

    upload_wizard_contributions = _upload_wizard_file_pages(contributions)
    print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")

    # Query the Commons API in batches of 50 (the API's per-request limit).
    new_uploads: list[dict] = []
    batch_size = 50
    num_batches = (len(upload_wizard_contributions) + batch_size - 1) // batch_size

    for i in range(0, len(upload_wizard_contributions), batch_size):
        batch = upload_wizard_contributions[i : i + batch_size]
        print(f"Processing batch {i // batch_size + 1}/{num_batches}...")

        metadata = get_image_metadata([c["title"] for c in batch])

        for c in batch:
            title = c["title"]
            meta = metadata.get(title, {})
            flickr_url = extract_flickr_url_from_credit(meta.get("credit", ""))
            if not flickr_url:
                continue

            # Skip photos already recorded, under any URL spelling.
            normalized = _normalize_flickr_url(flickr_url)
            if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
                continue

            artist = meta.get("artist", "")
            new_uploads.append(
                {
                    "pageid": c["pageid"],
                    "revid": c["revid"],
                    "title": title,
                    "timestamp": c["timestamp"],
                    "flickr_url": flickr_url,
                    "creator": clean_artist_name(artist) if artist else None,
                }
            )
            # Record immediately so a duplicate later in this run is skipped.
            existing_flickr_urls.add(normalized)
            print(f"  Found: {title[:50]} -> {flickr_url}")

        # Be polite to the API: pause between batches (not after the last one).
        if i + batch_size < len(upload_wizard_contributions):
            time.sleep(0.5)

    print(f"\nFound {len(new_uploads)} new Flickr uploads")

    if new_uploads:
        # Merge with existing records, newest first, and persist.
        all_uploads = existing_uploads + new_uploads
        all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
        with open(FLICKR_UPLOADS_FILE, "w") as f:
            json.dump(all_uploads, f, indent=2)
        print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")
|
||||
|
||||
|
||||
# Allow importing this module (e.g. for tests) without running the update.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue