Extract flickr_mail package with Mapped models and shared utilities
Move from JSON file storage to SQLite database using SQLAlchemy with Mapped type hints. Deduplicate URL utility functions into shared flickr_mail package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ac1b01ea68
commit
9f0fb01878
11 changed files with 1129 additions and 300 deletions
|
|
@@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Find UploadWizard contributions that are from Flickr and add them to flickr_uploads.json.
|
||||
Find UploadWizard contributions that are from Flickr and add them to the database.
|
||||
|
||||
For contributions with comment 'User created page with UploadWizard', queries the
|
||||
Commons API to check if the image source is Flickr (by checking the Credit field).
|
||||
|
|
@@ -9,12 +9,13 @@ Commons API to check if the image source is Flickr (by checking the Credit field
|
|||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
CONTRIBUTIONS_FILE = Path("commons_contributions/contributions.json")
|
||||
FLICKR_UPLOADS_FILE = Path("commons_contributions/flickr_uploads.json")
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage
|
||||
from flickr_mail.url_utils import normalize_flickr_url
|
||||
|
||||
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
|
||||
USER_AGENT = "FlickrMail/1.0 (https://edwardbetts.com/flickr_mail/; edward@4angle.com)"
|
||||
|
||||
|
|
@@ -75,99 +76,101 @@ def clean_artist_name(artist_html: str) -> str:
|
|||
|
||||
|
||||
def main():
|
||||
# Load contributions
|
||||
print("Loading contributions...")
|
||||
with open(CONTRIBUTIONS_FILE) as f:
|
||||
data = json.load(f)
|
||||
init_db()
|
||||
session = get_session()
|
||||
|
||||
contributions = data.get("contributions", [])
|
||||
try:
|
||||
# Get existing normalized flickr URLs to avoid duplicates
|
||||
existing_urls = {
|
||||
r[0] for r in session.query(FlickrUpload.normalized_flickr_url).all()
|
||||
if r[0]
|
||||
}
|
||||
print(f"Existing uploads: {session.query(FlickrUpload).count()}")
|
||||
print(f"Existing flickr URLs: {len(existing_urls)}")
|
||||
|
||||
# Load existing flickr uploads
|
||||
existing_flickr_urls = set()
|
||||
existing_uploads = []
|
||||
if FLICKR_UPLOADS_FILE.exists():
|
||||
with open(FLICKR_UPLOADS_FILE) as f:
|
||||
existing_uploads = json.load(f)
|
||||
existing_flickr_urls = {u.get("flickr_url", "") for u in existing_uploads}
|
||||
# Also normalize existing URLs for comparison
|
||||
for u in existing_uploads:
|
||||
url = u.get("flickr_url", "")
|
||||
normalized = url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
|
||||
existing_flickr_urls.add(normalized)
|
||||
# Build sent message index for FK linking
|
||||
sent_messages = (
|
||||
session.query(SentMessage)
|
||||
.filter(SentMessage.normalized_flickr_url != "")
|
||||
.filter(~SentMessage.subject.startswith("Re:"))
|
||||
.all()
|
||||
)
|
||||
url_to_message = {msg.normalized_flickr_url: msg for msg in sent_messages}
|
||||
|
||||
print(f"Existing uploads: {len(existing_uploads)}")
|
||||
print(f"Existing flickr URLs: {len(existing_flickr_urls)}")
|
||||
# Find UploadWizard contributions (page creations only)
|
||||
upload_wizard = (
|
||||
session.query(Contribution)
|
||||
.filter(Contribution.comment == "User created page with UploadWizard")
|
||||
.filter(Contribution.title.startswith("File:"))
|
||||
.all()
|
||||
)
|
||||
|
||||
# Find UploadWizard contributions (page creations only)
|
||||
upload_wizard_contributions = []
|
||||
for c in contributions:
|
||||
comment = c.get("comment", "")
|
||||
if comment == "User created page with UploadWizard":
|
||||
# Only include if it's a File: page
|
||||
title = c.get("title", "")
|
||||
if title.startswith("File:"):
|
||||
upload_wizard_contributions.append(c)
|
||||
print(f"UploadWizard contributions to check: {len(upload_wizard)}")
|
||||
|
||||
print(f"UploadWizard contributions to check: {len(upload_wizard_contributions)}")
|
||||
# Process in batches of 50
|
||||
new_count = 0
|
||||
batch_size = 50
|
||||
|
||||
# Process in batches of 50
|
||||
new_uploads = []
|
||||
batch_size = 50
|
||||
for i in range(0, len(upload_wizard), batch_size):
|
||||
batch = upload_wizard[i : i + batch_size]
|
||||
titles = [c.title for c in batch]
|
||||
|
||||
for i in range(0, len(upload_wizard_contributions), batch_size):
|
||||
batch = upload_wizard_contributions[i : i + batch_size]
|
||||
titles = [c["title"] for c in batch]
|
||||
print(
|
||||
f"Processing batch {i // batch_size + 1}/"
|
||||
f"{(len(upload_wizard) + batch_size - 1) // batch_size}..."
|
||||
)
|
||||
|
||||
print(f"Processing batch {i // batch_size + 1}/{(len(upload_wizard_contributions) + batch_size - 1) // batch_size}...")
|
||||
metadata = get_image_metadata(titles)
|
||||
|
||||
metadata = get_image_metadata(titles)
|
||||
for c in batch:
|
||||
meta = metadata.get(c.title, {})
|
||||
credit = meta.get("credit", "")
|
||||
artist = meta.get("artist", "")
|
||||
|
||||
for c in batch:
|
||||
title = c["title"]
|
||||
meta = metadata.get(title, {})
|
||||
credit = meta.get("credit", "")
|
||||
artist = meta.get("artist", "")
|
||||
flickr_url = extract_flickr_url_from_credit(credit)
|
||||
if not flickr_url:
|
||||
continue
|
||||
|
||||
flickr_url = extract_flickr_url_from_credit(credit)
|
||||
if not flickr_url:
|
||||
continue
|
||||
normalized = normalize_flickr_url(flickr_url)
|
||||
if normalized in existing_urls:
|
||||
continue
|
||||
|
||||
# Check if we already have this URL
|
||||
normalized = flickr_url.replace("https://", "").replace("http://", "").replace("www.", "").rstrip("/")
|
||||
if normalized in existing_flickr_urls or flickr_url in existing_flickr_urls:
|
||||
continue
|
||||
creator = clean_artist_name(artist) if artist else None
|
||||
|
||||
creator = clean_artist_name(artist) if artist else None
|
||||
# Look up sent message for FK linking
|
||||
msg = url_to_message.get(normalized) if normalized else None
|
||||
|
||||
new_upload = {
|
||||
"pageid": c["pageid"],
|
||||
"revid": c["revid"],
|
||||
"title": title,
|
||||
"timestamp": c["timestamp"],
|
||||
"flickr_url": flickr_url,
|
||||
"creator": creator,
|
||||
}
|
||||
session.add(FlickrUpload(
|
||||
pageid=c.pageid,
|
||||
revid=c.revid,
|
||||
title=c.title,
|
||||
timestamp=c.timestamp,
|
||||
flickr_url=flickr_url,
|
||||
normalized_flickr_url=normalized,
|
||||
creator=creator,
|
||||
wikipedia_url=msg.wikipedia_url if msg else "",
|
||||
creator_profile_url=msg.creator_profile_url if msg else "",
|
||||
sent_message_id=msg.message_id if msg else None,
|
||||
))
|
||||
new_count += 1
|
||||
existing_urls.add(normalized)
|
||||
print(f" Found: {c.title[:50]} -> {flickr_url}")
|
||||
|
||||
new_uploads.append(new_upload)
|
||||
existing_flickr_urls.add(normalized)
|
||||
print(f" Found: {title[:50]} -> {flickr_url}")
|
||||
session.commit()
|
||||
|
||||
# Rate limiting
|
||||
if i + batch_size < len(upload_wizard_contributions):
|
||||
time.sleep(0.5)
|
||||
# Rate limiting
|
||||
if i + batch_size < len(upload_wizard):
|
||||
time.sleep(0.5)
|
||||
|
||||
print(f"\nFound {len(new_uploads)} new Flickr uploads")
|
||||
total = session.query(FlickrUpload).count()
|
||||
print(f"\nFound {new_count} new Flickr uploads")
|
||||
print(f"Total: {total} uploads in database")
|
||||
|
||||
if new_uploads:
|
||||
# Merge and sort by timestamp (newest first)
|
||||
all_uploads = existing_uploads + new_uploads
|
||||
all_uploads.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
|
||||
|
||||
# Save
|
||||
with open(FLICKR_UPLOADS_FILE, "w") as f:
|
||||
json.dump(all_uploads, f, indent=2)
|
||||
|
||||
print(f"Saved {len(all_uploads)} total uploads to {FLICKR_UPLOADS_FILE}")
|
||||
except Exception:
|
||||
session.rollback()
|
||||
raise
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue