# Move from JSON file storage to SQLite database using SQLAlchemy with
# Mapped type hints. Deduplicate URL utility functions into the shared
# flickr_mail package.
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Extract Flickr uploads from Wikimedia Commons contributions.

Filters contributions where the comment contains a flickr.com URL and extracts:
- pageid, revid, title, timestamp
- flickr_url: the Flickr photo URL
- creator: the photographer/author name

Links uploads to sent messages via normalized Flickr URL matching.
"""

import re

from flickr_mail.database import init_db, get_session
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
from flickr_mail.url_utils import normalize_flickr_url
|
def extract_flickr_url(comment: str) -> str | None:
|
|
"""Extract the Flickr photo URL from a comment."""
|
|
# Match URLs like https://www.flickr.com/photos/user/12345/ or http://www.flickr.com/photos/user/12345/
|
|
# Also handles [http://www.flickr.com/photos/user/12345/ title] wiki markup
|
|
patterns = [
|
|
# Plain URL (modern format)
|
|
r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?',
|
|
# URL in wiki markup [url title]
|
|
r'\[(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?[^\]]*\]',
|
|
]
|
|
|
|
for pattern in patterns:
|
|
match = re.search(pattern, comment)
|
|
if match:
|
|
return match.group(1)
|
|
|
|
return None
|
|
|
|
|
|
def extract_creator(comment: str) -> str | None:
|
|
"""Extract the creator/author name from a comment."""
|
|
# Modern format: "Uploaded a work by {creator} from https://..."
|
|
modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
|
|
if modern_match:
|
|
return modern_match.group(1).strip()
|
|
|
|
# Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName] or |Author=[http://... AuthorName] from Location
|
|
# The author name comes after the URL, before ] or "from"
|
|
author_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
|
|
if author_match:
|
|
author = author_match.group(1).strip()
|
|
# Remove trailing location like "from Toronto, Canada"
|
|
author = re.sub(r'\s+from\s+.+$', '', author)
|
|
return author
|
|
|
|
# Handle truncated comments where Author field is cut off
|
|
# Pattern: |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
|
|
truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
|
|
if truncated_match:
|
|
author = truncated_match.group(1).strip()
|
|
if author:
|
|
return author
|
|
|
|
# Sometimes Author field is just plain text without URL
|
|
author_plain = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
|
|
if author_plain:
|
|
author = author_plain.group(1).strip()
|
|
# Skip if it looks like a wiki user link
|
|
if not author.startswith('[[User:') and author:
|
|
return author
|
|
|
|
return None
|
|
|
|
|
|
def main() -> None:
    """Process contributions and extract Flickr uploads.

    Scans Contribution rows whose comment mentions flickr.com, extracts
    the photo URL and creator name, links each upload to a previously
    sent message via the normalized Flickr URL, and persists new
    FlickrUpload rows. Safe to re-run: already-stored revids are skipped.
    """
    init_db()
    session = get_session()

    try:
        # Revids already stored, so re-runs don't insert duplicates.
        existing_revids = {
            r[0] for r in session.query(FlickrUpload.revid).all()
        }

        # Index of sent messages keyed by normalized Flickr URL.
        # Replies ("Re:" subjects) are excluded so only the original
        # outreach message is linked to an upload.
        sent_messages = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
            .all()
        )
        url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
        print(f"Sent message index: {len(url_to_message)} entries")

        # Candidate contributions: any comment mentioning flickr.com.
        contributions = (
            session.query(Contribution)
            .filter(Contribution.comment.ilike("%flickr.com%"))
            .all()
        )

        print(f"Found {len(contributions)} contributions mentioning flickr.com")

        new_count = 0
        for contrib in contributions:
            if contrib.revid in existing_revids:
                continue

            flickr_url = extract_flickr_url(contrib.comment or "")
            if not flickr_url:
                continue

            creator = extract_creator(contrib.comment or "")
            normalized = normalize_flickr_url(flickr_url)

            # Look up the sent message (if any) for FK linking.
            msg = url_to_message.get(normalized) if normalized else None

            session.add(FlickrUpload(
                pageid=contrib.pageid,
                revid=contrib.revid,
                title=contrib.title,
                timestamp=contrib.timestamp,
                flickr_url=flickr_url,
                normalized_flickr_url=normalized,
                creator=creator,
                wikipedia_url=msg.wikipedia_url if msg else "",
                creator_profile_url=msg.creator_profile_url if msg else "",
                sent_message_id=msg.message_id if msg else None,
            ))
            # BUGFIX: also record the revid for *this* run. Previously the
            # set was only seeded from the DB, so two contributions sharing
            # a revid within one batch would both be inserted.
            existing_revids.add(contrib.revid)
            new_count += 1

        session.commit()

        total = session.query(FlickrUpload).count()
        linked = session.query(FlickrUpload).filter(
            FlickrUpload.sent_message_id.isnot(None)
        ).count()

        print(f"Extracted {new_count} new Flickr uploads")
        print(f"Total: {total} uploads, {linked} linked to sent messages")

        # Coverage stats for the creator-extraction heuristics.
        with_creator = session.query(FlickrUpload).filter(
            FlickrUpload.creator.isnot(None)
        ).count()
        print(f" - {with_creator} with creator identified")
        print(f" - {total - with_creator} without creator")

    except Exception:
        # Roll back the partial batch before propagating.
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == '__main__':
    main()