Extract flickr_mail package with Mapped models and shared utilities

Move from JSON file storage to a SQLite database, using SQLAlchemy models with Mapped type hints. Deduplicate the URL utility functions into the shared flickr_mail package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 13:07:23 +00:00 · 2026-02-07 13:07:23 +00:00 · 9f0fb01878
commit 9f0fb01878
parent ac1b01ea68
11 changed files with 1129 additions and 300 deletions
--- a/extract_flickr_uploads.py
+++ b/extract_flickr_uploads.py
@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""
+Extract Flickr uploads from Wikimedia Commons contributions.
+
+Filters contributions where the comment contains a flickr.com URL and extracts:
+- pageid, revid, title, timestamp
+- flickr_url: the Flickr photo URL
+- creator: the photographer/author name
+
+Links uploads to sent messages via normalized Flickr URL matching.
+"""
+
+import re
+
+from flickr_mail.database import init_db, get_session
+from flickr_mail.models import Contribution, FlickrUpload, SentMessage
+from flickr_mail.url_utils import normalize_flickr_url
+
+
+# A Flickr photo page URL: http(s) scheme, optional "www.", then
+# /photos/<user>/<numeric photo id>.  The user segment stops at "/", whitespace
+# or "]" so that a URL embedded in wiki markup ("[url title]") is cut cleanly.
+_FLICKR_PHOTO_URL_RE = re.compile(
+    r'https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+'
+)
+
+
+def extract_flickr_url(comment: str) -> str | None:
+    """Extract the first Flickr photo URL from a comment.
+
+    Handles both plain URLs such as
+    ``https://www.flickr.com/photos/user/12345/`` and URLs inside wiki
+    external-link markup such as
+    ``[http://www.flickr.com/photos/user/12345/ title]``.  A single search
+    covers both forms, because the markup form always contains the plain URL.
+
+    Args:
+        comment: The contribution comment text to scan.
+
+    Returns:
+        The leftmost matching URL, ending at the numeric photo id (no trailing
+        slash or further path segments), or ``None`` when the comment contains
+        no Flickr photo URL.
+    """
+    match = _FLICKR_PHOTO_URL_RE.search(comment)
+    return match.group(0) if match else None
+
+
+def extract_creator(comment: str) -> str | None:
+    """Extract the creator/author name from a comment.
+
+    The following formats are tried in order; the first one that yields a
+    non-empty name wins:
+
+    1. Modern: ``Uploaded a work by {creator} from https://...``
+    2. ``{{Information}}`` with a linked author:
+       ``|Author=[http://... AuthorName]`` (a trailing ``from <place>`` inside
+       the link text is dropped).
+    3. The same linked form cut off before the closing ``]`` (truncated
+       comment); the name may then be incomplete.
+    4. Plain-text author: ``|Author=AuthorName`` terminated by a newline or
+       the next ``|``.
+
+    Args:
+        comment: The contribution comment text to scan.
+
+    Returns:
+        The stripped author name, or ``None`` when no format matches or the
+        matched name is empty after stripping.
+    """
+    # Modern format: "Uploaded a work by {creator} from https://..."
+    modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
+    if modern_match:
+        author = modern_match.group(1).strip()
+        # A whitespace-only capture is not a name; keep trying other formats.
+        if author:
+            return author
+
+    # Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName]
+    # The author name is the link text after the URL, up to the closing "]".
+    linked_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
+    if linked_match:
+        author = linked_match.group(1).strip()
+        # Remove trailing location like "from Toronto, Canada"
+        author = re.sub(r'\s+from\s+.+$', '', author)
+        if author:
+            return author
+
+    # Truncated comments where the Author link is cut off before "]":
+    # |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
+    truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
+    if truncated_match:
+        author = truncated_match.group(1).strip()
+        if author:
+            return author
+
+    # Plain-text Author field without a URL.  The character class excludes
+    # "[" and "]", so wiki links such as [[User:Name]] can never match here
+    # and need no separate check.
+    plain_match = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
+    if plain_match:
+        author = plain_match.group(1).strip()
+        if author:
+            return author
+
+    return None
+
+
+def main() -> None:
+    """Scan stored contributions and record new Flickr uploads.
+
+    Initialises the database, then for every contribution whose comment
+    mentions flickr.com and whose revid is not already stored as a
+    FlickrUpload, extracts the Flickr URL and creator and adds a FlickrUpload
+    row.  Each upload is linked to a non-reply sent message when their
+    normalized Flickr URLs match.  Summary statistics are printed at the end.
+
+    Any exception rolls the session back and is re-raised; the session is
+    always closed.
+    """
+    init_db()
+    session = get_session()
+
+    try:
+        # Revision ids that already have an upload row, so reruns skip them.
+        known_revids = set()
+        for row in session.query(FlickrUpload.revid).all():
+            known_revids.add(row[0])
+
+        # Index sent messages by normalized Flickr URL, ignoring messages
+        # without a URL and those whose subject starts with "Re:".
+        original_messages = (
+            session.query(SentMessage)
+            .filter(SentMessage.normalized_flickr_url != "")
+            .filter(~SentMessage.subject.startswith("Re:"))
+            .all()
+        )
+        url_to_message = {}
+        for message in original_messages:
+            url_to_message[message.normalized_flickr_url] = message
+        print(f"Sent message index: {len(url_to_message)} entries")
+
+        # Candidate contributions: comment mentions flickr.com (any case).
+        candidates = (
+            session.query(Contribution)
+            .filter(Contribution.comment.ilike("%flickr.com%"))
+            .all()
+        )
+        print(f"Found {len(candidates)} contributions mentioning flickr.com")
+
+        new_count = 0
+        for contrib in candidates:
+            if contrib.revid in known_revids:
+                continue
+
+            comment_text = contrib.comment or ""
+            flickr_url = extract_flickr_url(comment_text)
+            if not flickr_url:
+                continue
+
+            creator = extract_creator(comment_text)
+            normalized = normalize_flickr_url(flickr_url)
+
+            # Find the sent message (if any) to link this upload to.
+            msg = url_to_message.get(normalized) if normalized else None
+            if msg:
+                wikipedia_url = msg.wikipedia_url
+                creator_profile_url = msg.creator_profile_url
+                sent_message_id = msg.message_id
+            else:
+                wikipedia_url = ""
+                creator_profile_url = ""
+                sent_message_id = None
+
+            upload = FlickrUpload(
+                pageid=contrib.pageid,
+                revid=contrib.revid,
+                title=contrib.title,
+                timestamp=contrib.timestamp,
+                flickr_url=flickr_url,
+                normalized_flickr_url=normalized,
+                creator=creator,
+                wikipedia_url=wikipedia_url,
+                creator_profile_url=creator_profile_url,
+                sent_message_id=sent_message_id,
+            )
+            session.add(upload)
+            new_count += 1
+
+        session.commit()
+
+        # Totals are read back from the database after the commit.
+        uploads = session.query(FlickrUpload)
+        total = uploads.count()
+        linked_query = session.query(FlickrUpload).filter(
+            FlickrUpload.sent_message_id.isnot(None)
+        )
+        linked = linked_query.count()
+
+        print(f"Extracted {new_count} new Flickr uploads")
+        print(f"Total: {total} uploads, {linked} linked to sent messages")
+
+        # Creator coverage statistics.
+        creator_query = session.query(FlickrUpload).filter(
+            FlickrUpload.creator.isnot(None)
+        )
+        with_creator = creator_query.count()
+        print(f"  - {with_creator} with creator identified")
+        print(f"  - {total - with_creator} without creator")
+
+    except Exception:
+        # Discard pending changes, then let the error propagate.
+        session.rollback()
+        raise
+    finally:
+        session.close()
+
+
+# Run the extraction only when this file is executed as a script, not when
+# it is imported as a module.
+if __name__ == '__main__':
+    main()