Extract flickr_mail package with Mapped models and shared utilities
Move from JSON file storage to SQLite database using SQLAlchemy with Mapped type hints. Deduplicate URL utility functions into shared flickr_mail package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ac1b01ea68
commit
9f0fb01878
11 changed files with 1129 additions and 300 deletions
158
extract_flickr_uploads.py
Normal file
158
extract_flickr_uploads.py
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract Flickr uploads from Wikimedia Commons contributions.
|
||||
|
||||
Filters contributions where the comment contains a flickr.com URL and extracts:
|
||||
- pageid, revid, title, timestamp
|
||||
- flickr_url: the Flickr photo URL
|
||||
- creator: the photographer/author name
|
||||
|
||||
Links uploads to sent messages via normalized Flickr URL matching.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
||||
from flickr_mail.url_utils import normalize_flickr_url
|
||||
|
||||
|
||||
def extract_flickr_url(comment: str) -> str | None:
|
||||
"""Extract the Flickr photo URL from a comment."""
|
||||
# Match URLs like https://www.flickr.com/photos/user/12345/ or http://www.flickr.com/photos/user/12345/
|
||||
# Also handles [http://www.flickr.com/photos/user/12345/ title] wiki markup
|
||||
patterns = [
|
||||
# Plain URL (modern format)
|
||||
r'(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?',
|
||||
# URL in wiki markup [url title]
|
||||
r'\[(https?://(?:www\.)?flickr\.com/photos/[^/\s\]]+/\d+)/?[^\]]*\]',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, comment)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_creator(comment: str) -> str | None:
|
||||
"""Extract the creator/author name from a comment."""
|
||||
# Modern format: "Uploaded a work by {creator} from https://..."
|
||||
modern_match = re.search(r'Uploaded a work by (.+?) from https?://', comment)
|
||||
if modern_match:
|
||||
return modern_match.group(1).strip()
|
||||
|
||||
# Old {{Information}} format: |Author=[http://www.flickr.com/people/... AuthorName] or |Author=[http://... AuthorName] from Location
|
||||
# The author name comes after the URL, before ] or "from"
|
||||
author_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]]+)\]', comment)
|
||||
if author_match:
|
||||
author = author_match.group(1).strip()
|
||||
# Remove trailing location like "from Toronto, Canada"
|
||||
author = re.sub(r'\s+from\s+.+$', '', author)
|
||||
return author
|
||||
|
||||
# Handle truncated comments where Author field is cut off
|
||||
# Pattern: |Author=[http://...flickr.com/people/... AuthorName (may be incomplete)
|
||||
truncated_match = re.search(r'\|Author=\[https?://[^\s\]]+ ([^\]\|]+)$', comment)
|
||||
if truncated_match:
|
||||
author = truncated_match.group(1).strip()
|
||||
if author:
|
||||
return author
|
||||
|
||||
# Sometimes Author field is just plain text without URL
|
||||
author_plain = re.search(r'\|Author=([^\|\}\[\]]+?)(?:\r?\n|\|)', comment)
|
||||
if author_plain:
|
||||
author = author_plain.group(1).strip()
|
||||
# Skip if it looks like a wiki user link
|
||||
if not author.startswith('[[User:') and author:
|
||||
return author
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main() -> None:
    """Scan stored contributions and record every new Flickr upload.

    Contributions whose comment mentions ``flickr.com`` are examined; those
    whose revid is not yet stored and that contain a Flickr photo URL are
    saved as ``FlickrUpload`` rows.  Each upload is linked to a sent message
    when a message (with a non-empty normalized URL and a subject not
    starting with ``Re:``) shares its normalized Flickr URL.  Summary
    counts are printed at the end.

    Any exception rolls the session back and is re-raised; the session is
    always closed.
    """
    init_db()
    session = get_session()

    try:
        # Revids already stored, so contributions handled earlier are skipped.
        known_revids = {row[0] for row in session.query(FlickrUpload.revid).all()}

        # Index sent messages by normalized Flickr URL; when several
        # messages share a URL the last one returned by the query wins.
        message_query = (
            session.query(SentMessage)
            .filter(SentMessage.normalized_flickr_url != "")
            .filter(~SentMessage.subject.startswith("Re:"))
        )
        message_by_url = {}
        for message in message_query.all():
            message_by_url[message.normalized_flickr_url] = message
        print(f"Sent message index: {len(message_by_url)} entries")

        # Candidate contributions: comment mentions flickr.com (case-insensitive).
        candidates = (
            session.query(Contribution)
            .filter(Contribution.comment.ilike("%flickr.com%"))
            .all()
        )
        print(f"Found {len(candidates)} contributions mentioning flickr.com")

        added = 0
        for entry in candidates:
            if entry.revid in known_revids:
                continue

            photo_url = extract_flickr_url(entry.comment or "")
            if not photo_url:
                continue

            author = extract_creator(entry.comment or "")
            normalized_url = normalize_flickr_url(photo_url)

            # Find the sent message to link to, if the URL normalized to
            # something usable.
            linked = None
            if normalized_url:
                linked = message_by_url.get(normalized_url)

            if linked:
                wikipedia_url = linked.wikipedia_url
                profile_url = linked.creator_profile_url
                message_id = linked.message_id
            else:
                wikipedia_url = ""
                profile_url = ""
                message_id = None

            upload = FlickrUpload(
                pageid=entry.pageid,
                revid=entry.revid,
                title=entry.title,
                timestamp=entry.timestamp,
                flickr_url=photo_url,
                normalized_flickr_url=normalized_url,
                creator=author,
                wikipedia_url=wikipedia_url,
                creator_profile_url=profile_url,
                sent_message_id=message_id,
            )
            session.add(upload)
            added += 1

        session.commit()

        total_uploads = session.query(FlickrUpload).count()
        linked_uploads = (
            session.query(FlickrUpload)
            .filter(FlickrUpload.sent_message_id.isnot(None))
            .count()
        )

        print(f"Extracted {added} new Flickr uploads")
        print(f"Total: {total_uploads} uploads, {linked_uploads} linked to sent messages")

        # Creator statistics across all stored uploads.
        creator_known = (
            session.query(FlickrUpload)
            .filter(FlickrUpload.creator.isnot(None))
            .count()
        )
        print(f" - {creator_known} with creator identified")
        print(f" - {total_uploads - creator_known} without creator")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
# Script entry point: run the extraction only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue