#!/usr/bin/env python3
"""
Extract Flickr uploads from Wikimedia Commons contributions.

Filters contributions where the comment contains a flickr.com URL and extracts:
- pageid, revid, title, timestamp
- flickr_url: the Flickr photo URL
- creator: the photographer/author name

Links uploads to sent messages via normalized Flickr URL matching.
"""

import re

from flickr_mail.database import init_db, get_session
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
from flickr_mail.url_utils import normalize_flickr_url

# Flickr photo page URL, captured without its optional trailing slash, e.g.
# https://www.flickr.com/photos/user/12345/ or http://flickr.com/photos/user/12345
# re.search scans the whole comment, so this single pattern also finds URLs
# embedded in wiki markup such as [http://www.flickr.com/photos/user/12345/ title].
_FLICKR_PHOTO_URL_RE = re.compile(
    r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?'
)

# Modern upload comment: "Uploaded a work by {creator} from https://..."
_MODERN_CREATOR_RE = re.compile(r'Uploaded a work by (.+?) from https?://')

# Old {{Information}} format: |Author=[http://... AuthorName]
_LINKED_AUTHOR_RE = re.compile(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]')

# Same as above, but the comment was cut off before the closing bracket.
_TRUNCATED_AUTHOR_RE = re.compile(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$')

# Author given as plain text, terminated by a newline or the next "|" field.
_PLAIN_AUTHOR_RE = re.compile(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)')

# Trailing location inside the author text, e.g. "Name from Toronto, Canada".
_TRAILING_LOCATION_RE = re.compile(r'\s+from\s+.+$')


def extract_flickr_url(comment: str) -> str | None:
    """Extract the Flickr photo URL from a comment.

    Returns the first ``flickr.com/photos/<user>/<id>`` URL found in
    *comment* (without a trailing slash), or ``None`` when there is none.
    """
    match = _FLICKR_PHOTO_URL_RE.search(comment)
    return match.group(1) if match else None


def extract_creator(comment: str) -> str | None:
    """Extract the creator/author name from a comment.

    The formats are tried in order: the modern "Uploaded a work by ..."
    sentence, the bracketed ``|Author=[url name]`` field, the same field
    truncated before its closing bracket, and finally a plain-text
    ``|Author=name`` field.

    Returns the stripped name, or ``None`` when no non-empty name is found.
    An empty string is never returned, so callers can rely on ``None``
    meaning "creator not identified".
    """
    modern_match = _MODERN_CREATOR_RE.search(comment)
    if modern_match:
        creator = modern_match.group(1).strip()
        # A whitespace-only capture is not a name; try the other formats.
        if creator:
            return creator

    linked_match = _LINKED_AUTHOR_RE.search(comment)
    if linked_match:
        # Remove trailing location like "from Toronto, Canada"
        author = _TRAILING_LOCATION_RE.sub('', linked_match.group(1).strip())
        if author:
            return author

    # Truncated comments: the name may be incomplete, but it is the best
    # information available.
    truncated_match = _TRUNCATED_AUTHOR_RE.search(comment)
    if truncated_match:
        author = truncated_match.group(1).strip()
        if author:
            return author

    # Plain-text author. The pattern excludes "[" and "]", so wiki links
    # such as [[User:...]] can never be captured here.
    plain_match = _PLAIN_AUTHOR_RE.search(comment)
    if plain_match:
        author = plain_match.group(1).strip()
        if author:
            return author

    return None


def main() -> None:
    """Process contributions and extract Flickr uploads.

    Adds a FlickrUpload row for every contribution whose comment contains a
    Flickr photo URL and whose revid has not been recorded yet, linking it to
    a sent message when the normalized URLs match. Commits once at the end,
    then prints summary counts. On any exception the session is rolled back
    and the exception is re-raised; the session is always closed.
    """
    init_db()
    session = get_session()

    try:
        # Get existing upload revids to avoid duplicates
        existing_revids = {
            row[0] for row in session.query(FlickrUpload.revid).all()
        }

        # Build sent message index: normalized_flickr_url -> message.
        # Replies ("Re:" subjects) and messages without a URL are excluded.
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        url_to_message = {
            msg.normalized_flickr_url: msg for msg in sent_messages
        }
        print(f"Sent message index: {len(url_to_message)} entries")

        # Query contributions with flickr.com in comment
        contributions = (
            session.query(Contribution)
            .filter(Contribution.comment.ilike("%flickr.com%"))
            .all()
        )
        print(f"Found {len(contributions)} contributions mentioning flickr.com")

        new_count = 0
        for contrib in contributions:
            if contrib.revid in existing_revids:
                continue

            comment = contrib.comment or ""
            flickr_url = extract_flickr_url(comment)
            if not flickr_url:
                continue

            creator = extract_creator(comment)
            normalized = normalize_flickr_url(flickr_url)

            # Look up sent message for FK linking
            msg = url_to_message.get(normalized) if normalized else None

            session.add(FlickrUpload(
                pageid=contrib.pageid,
                revid=contrib.revid,
                title=contrib.title,
                timestamp=contrib.timestamp,
                flickr_url=flickr_url,
                normalized_flickr_url=normalized,
                creator=creator,
                wikipedia_url=msg.wikipedia_url if msg else "",
                creator_profile_url=msg.creator_profile_url if msg else "",
                sent_message_id=msg.message_id if msg else None,
            ))
            # Also guard against the same revid appearing twice in this run
            # (a no-op if contribution revids are already unique).
            existing_revids.add(contrib.revid)
            new_count += 1

        session.commit()

        total = session.query(FlickrUpload).count()
        linked = session.query(FlickrUpload).filter(
            FlickrUpload.sent_message_id.isnot(None)
        ).count()

        print(f"Extracted {new_count} new Flickr uploads")
        print(f"Total: {total} uploads, {linked} linked to sent messages")

        # Show some stats
        with_creator = session.query(FlickrUpload).filter(
            FlickrUpload.creator.isnot(None)
        ).count()
        print(f" - {with_creator} with creator identified")
        print(f" - {total - with_creator} without creator")
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == '__main__':
    main()