#!/usr/bin/env python3
"""One-time migration from JSON files to SQLite database."""

import json
import time
from pathlib import Path

from flickr_mail.database import init_db, get_session
from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
from flickr_mail.url_utils import (
    creator_profile_from_flickr_url,
    extract_urls_from_message,
    normalize_flickr_url,
)

# All input files are resolved relative to the directory containing this script.
COMMONS_DIR = Path(__file__).parent / "commons_contributions"
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
SENT_MAIL_INDEX = Path(__file__).parent / "sent_mail" / "messages_index.json"

CONTRIBUTIONS_FILE = COMMONS_DIR / "contributions.json"
FLICKR_UPLOADS_FILE = COMMONS_DIR / "flickr_uploads.json"
THUMBNAIL_CACHE_FILE = COMMONS_DIR / "thumbnail_cache.json"


def _load_json(path: Path):
    """Parse and return the JSON document stored at *path*.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default locale encoding.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def migrate_contributions(session) -> int:
    """Migrate contributions.json to contributions table.

    Returns the number of rows in the contributions table after the
    inserts have been flushed, or 0 when contributions.json is missing.
    """
    if not CONTRIBUTIONS_FILE.exists():
        print("No contributions.json found, skipping")
        return 0

    data = _load_json(CONTRIBUTIONS_FILE)
    contributions = data.get("contributions", [])
    print(f"Migrating {len(contributions)} contributions...")

    for c in contributions:
        session.add(Contribution(
            userid=c.get("userid"),
            user=c.get("user"),
            pageid=c.get("pageid"),
            revid=c.get("revid"),
            parentid=c.get("parentid"),
            ns=c.get("ns"),
            title=c.get("title"),
            timestamp=c.get("timestamp"),
            minor=c.get("minor"),
            top=c.get("top"),
            comment=c.get("comment"),
            size=c.get("size"),
            sizediff=c.get("sizediff"),
            # The tag list is stored as a JSON-encoded string.
            tags=json.dumps(c.get("tags", [])),
        ))

    session.flush()
    count = session.query(Contribution).count()
    print(f"  -> {count} contributions migrated")
    return count


def migrate_sent_messages(session) -> dict[str, str]:
    """Migrate sent messages to sent_messages table.

    Each index entry is expanded from its per-message JSON file when that
    file exists; otherwise the index entry itself is used as the message.
    Index entries without a message_id are skipped.

    Returns a dict of normalized_flickr_url -> message_id for FK linking.
    Messages whose subject starts with "Re:" are stored but left out of
    the returned dict.
    """
    if not SENT_MAIL_INDEX.exists():
        print("No messages_index.json found, skipping")
        return {}

    index = _load_json(SENT_MAIL_INDEX)
    print(f"Migrating {len(index)} sent messages...")

    url_to_message_id: dict[str, str] = {}

    for msg_meta in index:
        msg_id = msg_meta.get("message_id", "")
        if not msg_id:
            continue

        # Load the full message from individual file
        msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
        msg = _load_json(msg_file) if msg_file.exists() else msg_meta

        body = msg.get("body", "")
        subject = msg.get("subject", "")

        # Extract URLs from body
        flickr_url, wikipedia_url = extract_urls_from_message(body)
        normalized = normalize_flickr_url(flickr_url) if flickr_url else ""

        # Extract creator profile URL
        creator_profile_url = (
            creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
        )

        session.add(SentMessage(
            message_id=msg_id,
            subject=subject,
            url=msg.get("url", ""),
            recipient=msg.get("recipient", ""),
            date=msg.get("date", ""),
            body=body,
            body_html=msg.get("body_html", ""),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
        ))

        # Build URL -> message_id map for FK linking (skip replies).
        # A later message with the same normalized URL overwrites an
        # earlier one.
        if normalized and not subject.startswith("Re:"):
            url_to_message_id[normalized] = msg_id

    session.flush()
    actual = session.query(SentMessage).count()
    print(f"  -> {actual} sent messages migrated")
    print(f"  -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
    return url_to_message_id


def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
    """Migrate flickr_uploads.json to flickr_uploads table with FK linking.

    *url_to_message_id* maps a normalized Flickr URL to the message_id of a
    sent message (as returned by migrate_sent_messages). When an upload's
    normalized URL is present in that map, the upload is linked to the
    message and copies its wikipedia_url and creator_profile_url.

    Returns the number of rows in the flickr_uploads table after the
    inserts have been flushed, or 0 when flickr_uploads.json is missing.
    """
    if not FLICKR_UPLOADS_FILE.exists():
        print("No flickr_uploads.json found, skipping")
        return 0

    uploads = _load_json(FLICKR_UPLOADS_FILE)
    print(f"Migrating {len(uploads)} flickr uploads...")

    linked = 0

    for u in uploads:
        flickr_url = u.get("flickr_url", "")
        normalized = normalize_flickr_url(flickr_url)

        # Look up sent message FK
        sent_message_id = url_to_message_id.get(normalized) if normalized else None
        if sent_message_id:
            linked += 1

        # Get wikipedia_url and creator_profile_url from the linked message
        wikipedia_url = ""
        creator_profile_url = ""
        if sent_message_id:
            # NOTE(review): session.get looks rows up by primary key, so this
            # presumably relies on message_id being SentMessage's primary
            # key -- confirm against the model.
            msg = session.get(SentMessage, sent_message_id)
            if msg:
                wikipedia_url = msg.wikipedia_url or ""
                creator_profile_url = msg.creator_profile_url or ""

        session.add(FlickrUpload(
            pageid=u.get("pageid"),
            revid=u.get("revid"),
            title=u.get("title"),
            timestamp=u.get("timestamp"),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            creator=u.get("creator"),
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
            sent_message_id=sent_message_id,
        ))

    session.flush()
    count = session.query(FlickrUpload).count()
    print(f"  -> {count} flickr uploads migrated")
    print(f"  -> {linked} linked to sent messages")
    return count


def migrate_thumbnail_cache(session) -> int:
    """Migrate thumbnail_cache.json to thumbnail_cache table.

    Every entry receives the same fetched_at value: the file's top-level
    "timestamp" converted to int (0 when the key is absent).

    Returns the number of rows in the thumbnail_cache table after the
    inserts have been flushed, or 0 when thumbnail_cache.json is missing.
    """
    if not THUMBNAIL_CACHE_FILE.exists():
        print("No thumbnail_cache.json found, skipping")
        return 0

    cache = _load_json(THUMBNAIL_CACHE_FILE)
    thumbnails = cache.get("thumbnails", {})
    cache_timestamp = int(cache.get("timestamp", 0))

    print(f"Migrating {len(thumbnails)} cached thumbnails...")

    for title, thumb_url in thumbnails.items():
        session.add(ThumbnailCache(
            title=title,
            thumb_url=thumb_url,
            fetched_at=cache_timestamp,
        ))

    session.flush()
    count = session.query(ThumbnailCache).count()
    print(f"  -> {count} thumbnail cache entries migrated")
    return count


def main() -> None:
    """Run all migrations in one session, committing only if every step succeeds.

    Aborts without changes when the contributions table already has rows.
    Any exception rolls the session back and is re-raised; the session is
    closed in all cases.
    """
    print("Initializing database...")
    init_db()

    session = get_session()
    try:
        # Check if already migrated
        existing = session.query(Contribution).count()
        if existing > 0:
            print(f"Database already contains {existing} contributions. Aborting.")
            print("Delete flickr_mail.db to re-run migration.")
            return

        migrate_contributions(session)
        # Sent messages must be migrated before uploads: the uploads step
        # needs the URL -> message_id map and looks the messages up again.
        url_to_message_id = migrate_sent_messages(session)
        migrate_flickr_uploads(session, url_to_message_id)
        migrate_thumbnail_cache(session)

        session.commit()
        print("\nMigration complete!")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == "__main__":
    main()