flickr-mail/migrate_json_to_db.py
Edward Betts 9f0fb01878 Extract flickr_mail package with Mapped models and shared utilities
Move from JSON file storage to SQLite database using SQLAlchemy with
Mapped type hints. Deduplicate URL utility functions into shared
flickr_mail package.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 13:10:49 +00:00

233 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""One-time migration from JSON files to SQLite database."""
import json
import time
from pathlib import Path
from flickr_mail.database import init_db, get_session
from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
from flickr_mail.url_utils import (
creator_profile_from_flickr_url,
extract_urls_from_message,
normalize_flickr_url,
)
# Input locations, all resolved relative to the directory holding this script.
COMMONS_DIR = Path(__file__).parent.joinpath("commons_contributions")
SENT_MAIL_DIR = Path(__file__).parent.joinpath("sent_mail", "messages")
# The index sits next to the per-message directory, inside "sent_mail".
SENT_MAIL_INDEX = SENT_MAIL_DIR.parent.joinpath("messages_index.json")
# JSON exports stored inside COMMONS_DIR.
CONTRIBUTIONS_FILE = COMMONS_DIR.joinpath("contributions.json")
FLICKR_UPLOADS_FILE = COMMONS_DIR.joinpath("flickr_uploads.json")
THUMBNAIL_CACHE_FILE = COMMONS_DIR.joinpath("thumbnail_cache.json")
def migrate_contributions(session) -> int:
    """Migrate contributions.json to the contributions table.

    Reads the ``"contributions"`` list from ``CONTRIBUTIONS_FILE`` and adds one
    ``Contribution`` row per entry to *session*. The session is flushed but
    not committed here; no commit is issued by this function.

    Args:
        session: Database session the new rows are added to.

    Returns:
        The row count of the contributions table after the flush, or 0 when
        the JSON file does not exist.
    """
    if not CONTRIBUTIONS_FILE.exists():
        print("No contributions.json found, skipping")
        return 0

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which can fail or mis-decode non-ASCII titles and comments.
    with open(CONTRIBUTIONS_FILE, encoding="utf-8") as f:
        data = json.load(f)

    contributions = data.get("contributions", [])
    print(f"Migrating {len(contributions)} contributions...")

    for c in contributions:
        session.add(Contribution(
            userid=c.get("userid"),
            user=c.get("user"),
            pageid=c.get("pageid"),
            revid=c.get("revid"),
            parentid=c.get("parentid"),
            ns=c.get("ns"),
            title=c.get("title"),
            timestamp=c.get("timestamp"),
            minor=c.get("minor"),
            top=c.get("top"),
            comment=c.get("comment"),
            size=c.get("size"),
            sizediff=c.get("sizediff"),
            # Tags are stored as a JSON-encoded string; missing tags become "[]".
            tags=json.dumps(c.get("tags", [])),
        ))

    session.flush()

    # Count from the database rather than len(contributions) so the reported
    # number reflects what the table actually holds.
    count = session.query(Contribution).count()
    print(f" -> {count} contributions migrated")
    return count
def migrate_sent_messages(session) -> dict[str, str]:
    """Migrate sent messages to the sent_messages table.

    Iterates the entries of ``SENT_MAIL_INDEX``. For each entry with a
    non-empty ``message_id`` the full message is loaded from
    ``SENT_MAIL_DIR/<message_id>.json`` when that file exists; otherwise the
    index entry itself is used as the message. One ``SentMessage`` row is
    added per message and the session is flushed (not committed).

    Args:
        session: Database session the new rows are added to.

    Returns:
        A dict of normalized_flickr_url -> message_id for FK linking. Messages
        whose subject starts with "Re:" are not indexed, and a later message
        for the same URL replaces an earlier one. Empty when the index file
        does not exist.
    """
    if not SENT_MAIL_INDEX.exists():
        print("No messages_index.json found, skipping")
        return {}

    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so message bodies with non-ASCII text load everywhere.
    with open(SENT_MAIL_INDEX, encoding="utf-8") as f:
        index = json.load(f)

    print(f"Migrating {len(index)} sent messages...")

    url_to_message_id: dict[str, str] = {}

    for msg_meta in index:
        msg_id = msg_meta.get("message_id", "")
        if not msg_id:
            # Entries without an id cannot be stored or linked; skip them.
            continue

        # Load the full message from its individual file, falling back to the
        # index entry when no such file exists.
        msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
        if msg_file.exists():
            with open(msg_file, encoding="utf-8") as f:
                msg = json.load(f)
        else:
            msg = msg_meta

        body = msg.get("body", "")
        subject = msg.get("subject", "")

        # Extract URLs from body
        flickr_url, wikipedia_url = extract_urls_from_message(body)
        normalized = normalize_flickr_url(flickr_url) if flickr_url else ""

        # Extract creator profile URL
        creator_profile_url = (
            creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
        )

        session.add(SentMessage(
            message_id=msg_id,
            subject=subject,
            url=msg.get("url", ""),
            recipient=msg.get("recipient", ""),
            date=msg.get("date", ""),
            body=body,
            body_html=msg.get("body_html", ""),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
        ))

        # Build URL -> message_id map for FK linking (skip replies)
        if normalized and not subject.startswith("Re:"):
            url_to_message_id[normalized] = msg_id

    session.flush()

    actual = session.query(SentMessage).count()
    print(f" -> {actual} sent messages migrated")
    print(f" -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
    return url_to_message_id
def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
    """Migrate flickr_uploads.json to the flickr_uploads table with FK linking.

    Each upload's Flickr URL is normalized and looked up in
    *url_to_message_id*. When a match is found, the upload row is linked to
    that sent message and copies its ``wikipedia_url`` and
    ``creator_profile_url``; otherwise those fields are stored as empty
    strings and ``sent_message_id`` is ``None``. The session is flushed (not
    committed).

    Args:
        session: Database session the new rows are added to.
        url_to_message_id: Mapping of normalized Flickr URL -> message_id, as
            returned by ``migrate_sent_messages``.

    Returns:
        The row count of the flickr_uploads table after the flush, or 0 when
        the JSON file does not exist.
    """
    if not FLICKR_UPLOADS_FILE.exists():
        print("No flickr_uploads.json found, skipping")
        return 0

    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII titles and creator names load everywhere.
    with open(FLICKR_UPLOADS_FILE, encoding="utf-8") as f:
        uploads = json.load(f)

    print(f"Migrating {len(uploads)} flickr uploads...")

    linked = 0
    for u in uploads:
        flickr_url = u.get("flickr_url", "")
        normalized = normalize_flickr_url(flickr_url)

        # Look up sent message FK
        sent_message_id = url_to_message_id.get(normalized) if normalized else None

        # Get wikipedia_url and creator_profile_url from the linked message;
        # they stay empty when there is no link or the message is not found.
        wikipedia_url = ""
        creator_profile_url = ""
        if sent_message_id:
            linked += 1
            msg = session.get(SentMessage, sent_message_id)
            if msg:
                wikipedia_url = msg.wikipedia_url or ""
                creator_profile_url = msg.creator_profile_url or ""

        session.add(FlickrUpload(
            pageid=u.get("pageid"),
            revid=u.get("revid"),
            title=u.get("title"),
            timestamp=u.get("timestamp"),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            creator=u.get("creator"),
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
            sent_message_id=sent_message_id,
        ))

    session.flush()

    count = session.query(FlickrUpload).count()
    print(f" -> {count} flickr uploads migrated")
    print(f" -> {linked} linked to sent messages")
    return count
def migrate_thumbnail_cache(session) -> int:
    """Migrate thumbnail_cache.json to the thumbnail_cache table.

    Adds one ``ThumbnailCache`` row per entry of the file's ``"thumbnails"``
    mapping (title -> thumbnail URL). Every row receives the same
    ``fetched_at`` value: the file's top-level ``"timestamp"`` converted to
    ``int`` (0 when absent). The session is flushed (not committed).

    Args:
        session: Database session the new rows are added to.

    Returns:
        The row count of the thumbnail_cache table after the flush, or 0 when
        the JSON file does not exist.
    """
    if not THUMBNAIL_CACHE_FILE.exists():
        print("No thumbnail_cache.json found, skipping")
        return 0

    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII file titles load everywhere.
    with open(THUMBNAIL_CACHE_FILE, encoding="utf-8") as f:
        cache = json.load(f)

    thumbnails = cache.get("thumbnails", {})
    # The JSON cache stores a single timestamp for the whole file, so it is
    # applied to every migrated entry.
    cache_timestamp = int(cache.get("timestamp", 0))

    print(f"Migrating {len(thumbnails)} cached thumbnails...")

    for title, thumb_url in thumbnails.items():
        session.add(ThumbnailCache(
            title=title,
            thumb_url=thumb_url,
            fetched_at=cache_timestamp,
        ))

    session.flush()

    count = session.query(ThumbnailCache).count()
    print(f" -> {count} thumbnail cache entries migrated")
    return count
def main() -> None:
    """Initialize the database and run every migration step in order.

    All steps share one session. The session is committed once after the
    last step, rolled back if any step raises, and always closed.
    """
    print("Initializing database...")
    init_db()

    session = get_session()
    try:
        # Guard against a second run: any existing contribution row is taken
        # as a sign that the migration already happened.
        existing = session.query(Contribution).count()
        if existing:
            print(f"Database already contains {existing} contributions. Aborting.")
            print("Delete flickr_mail.db to re-run migration.")
            return

        migrate_contributions(session)
        # Sent messages go first so uploads can be linked to them.
        migrate_flickr_uploads(session, migrate_sent_messages(session))
        migrate_thumbnail_cache(session)

        session.commit()
        print("\nMigration complete!")
    except Exception:
        # Undo any partially migrated rows, then surface the error.
        session.rollback()
        raise
    finally:
        session.close()


if __name__ == "__main__":
    main()