Extract flickr_mail package with Mapped models and shared utilities

Move from JSON file storage to SQLite database using SQLAlchemy with
Mapped type hints. Deduplicate URL utility functions into shared
flickr_mail package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-02-07 13:07:23 +00:00
parent ac1b01ea68
commit 9f0fb01878
11 changed files with 1129 additions and 300 deletions

158
extract_flickr_uploads.py Normal file
View file

@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""
Extract Flickr uploads from Wikimedia Commons contributions.
Filters contributions where the comment contains a flickr.com URL and extracts:
- pageid, revid, title, timestamp
- flickr_url: the Flickr photo URL
- creator: the photographer/author name
Links uploads to sent messages via normalized Flickr URL matching.
"""
import re
from flickr_mail.database import init_db, get_session
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
from flickr_mail.url_utils import normalize_flickr_url
def extract_flickr_url(comment: str) -> str | None:
"""Extract the Flickr photo URL from a comment."""
# Match URLs like https://www.flickr.com/photos/user/12345/ or http://www.flickr.com/photos/user/12345/
# Also handles [http://www.flickr.com/photos/user/12345/ title] wiki markup
patterns = [
# Plain URL (modern format)
r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?',
# URL in wiki markup [url title]
r'\[(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?[^\]]*\]',
]
for pattern in patterns:
match = re.search(pattern, comment)
if match:
return match.group(1)
return None
def extract_creator(comment: str) -> str | None:
"""Extract the creator/author name from a comment."""
# Modern format: "Uploaded a work by {creator} from https://..."
modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
if modern_match:
return modern_match.group(1).strip()
# Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName] or |Author=[http://... AuthorName] from Location
# The author name comes after the URL, before ] or "from"
author_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
if author_match:
author = author_match.group(1).strip()
# Remove trailing location like "from Toronto, Canada"
author = re.sub(r'\s+from\s+.+$', '', author)
return author
# Handle truncated comments where Author field is cut off
# Pattern: |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
if truncated_match:
author = truncated_match.group(1).strip()
if author:
return author
# Sometimes Author field is just plain text without URL
author_plain = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
if author_plain:
author = author_plain.group(1).strip()
# Skip if it looks like a wiki user link
if not author.startswith('[[User:') and author:
return author
return None
def main() -> None:
    """Process contributions and store the Flickr uploads found in them.

    Every Contribution whose comment contains "flickr.com" (matched
    case-insensitively) and whose revid is not already stored becomes a
    FlickrUpload row, provided a Flickr photo URL can be extracted from the
    comment. Each upload is linked to a sent message with the same
    normalized Flickr URL when one exists; messages whose subject starts
    with "Re:" are left out of that lookup. Summary counts are printed at
    the end.

    Any exception rolls the session back and is re-raised; the session is
    closed in all cases.
    """
    init_db()
    session = get_session()
    try:
        # Revision ids that are already stored, so they are not added twice.
        known_revids = set()
        for row in session.query(FlickrUpload.revid).all():
            known_revids.add(row[0])

        # Sent message index: normalized_flickr_url -> message
        originals = session.query(SentMessage)
        originals = originals.filter(SentMessage.normalized_flickr_url != "")
        originals = originals.filter(~SentMessage.subject.startswith("Re:"))
        url_to_message = {}
        for sent in originals.all():
            url_to_message[sent.normalized_flickr_url] = sent
        print(f"Sent message index: {len(url_to_message)} entries")

        # Contributions with flickr.com anywhere in the comment.
        mentions_flickr = Contribution.comment.ilike("%flickr.com%")
        contributions = session.query(Contribution).filter(mentions_flickr).all()
        print(f"Found {len(contributions)} contributions mentioning flickr.com")

        new_count = 0
        for entry in contributions:
            if entry.revid in known_revids:
                continue

            comment_text = entry.comment or ""
            photo_url = extract_flickr_url(comment_text)
            if not photo_url:
                continue

            creator_name = extract_creator(comment_text)
            normalized_url = normalize_flickr_url(photo_url)

            # Find the sent message to link to, if any.
            matched = None
            if normalized_url:
                matched = url_to_message.get(normalized_url)

            if matched:
                wikipedia_url = matched.wikipedia_url
                profile_url = matched.creator_profile_url
                message_id = matched.message_id
            else:
                wikipedia_url = ""
                profile_url = ""
                message_id = None

            upload = FlickrUpload(
                pageid=entry.pageid,
                revid=entry.revid,
                title=entry.title,
                timestamp=entry.timestamp,
                flickr_url=photo_url,
                normalized_flickr_url=normalized_url,
                creator=creator_name,
                wikipedia_url=wikipedia_url,
                creator_profile_url=profile_url,
                sent_message_id=message_id,
            )
            session.add(upload)
            new_count += 1

        session.commit()

        total = session.query(FlickrUpload).count()
        has_message = FlickrUpload.sent_message_id.isnot(None)
        linked = session.query(FlickrUpload).filter(has_message).count()

        print(f"Extracted {new_count} new Flickr uploads")
        print(f"Total: {total} uploads, {linked} linked to sent messages")

        # Creator statistics
        has_creator = FlickrUpload.creator.isnot(None)
        with_creator = session.query(FlickrUpload).filter(has_creator).count()
        print(f"  - {with_creator} with creator identified")
        print(f"  - {total - with_creator} without creator")
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
# Run the extraction only when this file is executed as a script, so that
# importing the module does not touch the database.
if __name__ == '__main__':
    main()