#!/usr/bin/env python3
"""
Find UploadWizard contributions that are from Flickr and add them to the
database.

For contributions with comment 'User created page with UploadWizard', queries
the Commons API to check if the image source is Flickr (by checking the
Credit field).
"""

import json
import re
import time

import requests

from flickr_mail.database import init_db, get_session
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
from flickr_mail.url_utils import normalize_flickr_url

COMMONS_API = "https://commons.wikimedia.org/w/api.php"
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"


def extract_flickr_url_from_credit(credit: str) -> str | None:
    """Extract Flickr URL from the Credit field HTML."""
    pattern = r'https?://(?:www\.)?flickr\.com/photos/[^/"\s<>]+/\d+'
    match = re.search(pattern, credit)
    return match.group(0) if match else None


def get_image_metadata(titles: list[str]) -> dict[str, dict]:
    """Fetch image metadata from Commons API for multiple titles."""
    if not titles:
        return {}

    # Commons API allows up to 50 titles per request
    params = {
        "action": "query",
        "titles": "|".join(titles),
        "prop": "imageinfo",
        "iiprop": "extmetadata",
        "format": "json",
    }
    headers = {"User-Agent": USER_AGENT}

    try:
        response = requests.get(COMMONS_API, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, json.JSONDecodeError) as e:
        print(f"API error: {e}")
        return {}

    results = {}
    pages = data.get("query", {}).get("pages", {})
    for page in pages.values():
        title = page.get("title", "")
        imageinfo = page.get("imageinfo", [])
        if imageinfo:
            extmeta = imageinfo[0].get("extmetadata", {})
            results[title] = {
                "credit": extmeta.get("Credit", {}).get("value", ""),
                "artist": extmeta.get("Artist", {}).get("value", ""),
            }

    return results


def clean_artist_name(artist_html: str) -> str:
    """Extract plain text artist name from HTML."""
    # Remove HTML tags
    text = re.sub(r"<[^>]+>", "", artist_html)
    # Clean up whitespace
    text = " ".join(text.split())
    return text


def main():
    init_db()
    session = get_session()
    try:
        # Get existing normalized flickr URLs to avoid duplicates
        existing_urls = {
            r[0]
            for r in session.query(FlickrUpload.normalized_flickr_url).all()
            if r[0]
        }
        print(f"Existing uploads: {session.query(FlickrUpload).count()}")
        print(f"Existing flickr URLs: {len(existing_urls)}")

        # Build sent message index for FK linking
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}

        # Find UploadWizard contributions (page creations only)
        upload_wizard = (
            session.query(Contribution)
            .filter(Contribution.comment == "User created page with UploadWizard")
            .filter(Contribution.title.startswith("File:"))
            .all()
        )
        print(f"UploadWizard contributions to check: {len(upload_wizard)}")

        # Process in batches of 50
        new_count = 0
        batch_size = 50
        for i in range(0, len(upload_wizard), batch_size):
            batch = upload_wizard[i : i + batch_size]
            titles = [c.title for c in batch]

            print(
                f"Processing batch {i // batch_size + 1}/"
                f"{(len(upload_wizard) + batch_size - 1) // batch_size}..."
            )
            metadata = get_image_metadata(titles)

            for c in batch:
                meta = metadata.get(c.title, {})
                credit = meta.get("credit", "")
                artist = meta.get("artist", "")

                flickr_url = extract_flickr_url_from_credit(credit)
                if not flickr_url:
                    continue

                normalized = normalize_flickr_url(flickr_url)
                if normalized in existing_urls:
                    continue

                creator = clean_artist_name(artist) if artist else None

                # Look up sent message for FK linking
                msg = url_to_message.get(normalized) if normalized else None

                session.add(FlickrUpload(
                    pageid=c.pageid,
                    revid=c.revid,
                    title=c.title,
                    timestamp=c.timestamp,
                    flickr_url=flickr_url,
                    normalized_flickr_url=normalized,
                    creator=creator,
                    wikipedia_url=msg.wikipedia_url if msg else "",
                    creator_profile_url=msg.creator_profile_url if msg else "",
                    sent_message_id=msg.message_id if msg else None,
                ))
                new_count += 1
                existing_urls.add(normalized)
                print(f"  Found: {c.title[:50]} -> {flickr_url}")

            session.commit()

            # Rate limiting
            if i + batch_size < len(upload_wizard):
                time.sleep(0.5)

        total = session.query(FlickrUpload).count()
        print(f"\nFound {new_count} new Flickr uploads")
        print(f"Total: {total} uploads in database")
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == "__main__":
    main()