Move from JSON file storage to SQLite database using SQLAlchemy with Mapped type hints. Deduplicate URL utility functions into shared flickr_mail package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
233 lines
7.2 KiB
Python
233 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""One-time migration from JSON files to SQLite database."""
|
|
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from flickr_mail.database import init_db, get_session
|
|
from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
|
|
from flickr_mail.url_utils import (
|
|
creator_profile_from_flickr_url,
|
|
extract_urls_from_message,
|
|
normalize_flickr_url,
|
|
)
|
|
|
|
# All legacy JSON stores live next to this script; derive every path from
# the script's own directory so the migration works from any working dir.
_SCRIPT_DIR = Path(__file__).parent
_SENT_MAIL_ROOT = _SCRIPT_DIR / "sent_mail"

# Directory holding the contributions, uploads and thumbnail-cache JSON files.
COMMONS_DIR = _SCRIPT_DIR / "commons_contributions"

# One JSON file per sent message, plus an index file listing all of them.
SENT_MAIL_DIR = _SENT_MAIL_ROOT / "messages"
SENT_MAIL_INDEX = _SENT_MAIL_ROOT / "messages_index.json"

# Individual JSON sources read by the migrate_* functions below.
CONTRIBUTIONS_FILE = COMMONS_DIR / "contributions.json"
FLICKR_UPLOADS_FILE = COMMONS_DIR / "flickr_uploads.json"
THUMBNAIL_CACHE_FILE = COMMONS_DIR / "thumbnail_cache.json"
|
|
|
|
|
|
def migrate_contributions(session) -> int:
    """Migrate contributions.json to the contributions table.

    Args:
        session: Open database session. Rows are added and flushed here;
            committing is left to the caller.

    Returns:
        The number of rows in the contributions table after the flush, or 0
        when contributions.json does not exist.
    """
    if not CONTRIBUTIONS_FILE.exists():
        print("No contributions.json found, skipping")
        return 0

    # JSON is UTF-8 by specification; pass the encoding explicitly so the
    # read does not depend on the platform's locale encoding.
    with open(CONTRIBUTIONS_FILE, encoding="utf-8") as f:
        data = json.load(f)

    contributions = data.get("contributions", [])
    print(f"Migrating {len(contributions)} contributions...")

    # Fields copied verbatim from each JSON record; a missing key becomes None.
    plain_fields = (
        "userid",
        "user",
        "pageid",
        "revid",
        "parentid",
        "ns",
        "title",
        "timestamp",
        "minor",
        "top",
        "comment",
        "size",
        "sizediff",
    )

    for record in contributions:
        values = {name: record.get(name) for name in plain_fields}
        # tags is re-serialised to a JSON string for storage; a missing
        # "tags" key is stored as an empty JSON list.
        values["tags"] = json.dumps(record.get("tags", []))
        session.add(Contribution(**values))

    session.flush()
    count = session.query(Contribution).count()
    print(f"  -> {count} contributions migrated")
    return count
|
|
|
|
|
|
def migrate_sent_messages(session) -> dict[str, str]:
    """Migrate sent messages to the sent_messages table.

    Each entry of messages_index.json is loaded from its own
    ``<message_id>.json`` file when that file exists; otherwise the index
    entry itself is used as the message. Index entries without a
    ``message_id`` are skipped.

    Args:
        session: Open database session. Rows are added and flushed here;
            committing is left to the caller.

    Returns:
        A dict of normalized_flickr_url -> message_id for FK linking. Messages
        whose subject starts with "Re:" are stored but left out of this map.
        If several messages share a URL, the one latest in the index wins.
        Empty when messages_index.json does not exist.
    """
    if not SENT_MAIL_INDEX.exists():
        print("No messages_index.json found, skipping")
        return {}

    # Explicit UTF-8: JSON is UTF-8 by specification and message bodies may
    # contain non-ASCII text, so do not rely on the locale default encoding.
    with open(SENT_MAIL_INDEX, encoding="utf-8") as f:
        index = json.load(f)

    print(f"Migrating {len(index)} sent messages...")

    url_to_message_id: dict[str, str] = {}

    for msg_meta in index:
        msg_id = msg_meta.get("message_id", "")
        if not msg_id:
            continue

        # Load the full message from its individual file, falling back to
        # the index entry when no such file exists.
        msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
        if msg_file.exists():
            with open(msg_file, encoding="utf-8") as f:
                msg = json.load(f)
        else:
            msg = msg_meta

        body = msg.get("body", "")
        subject = msg.get("subject", "")

        # Derive the URL columns from the message body.
        flickr_url, wikipedia_url = extract_urls_from_message(body)
        normalized = normalize_flickr_url(flickr_url) if flickr_url else ""
        creator_profile_url = (
            creator_profile_from_flickr_url(flickr_url) if flickr_url else ""
        )

        session.add(SentMessage(
            message_id=msg_id,
            subject=subject,
            url=msg.get("url", ""),
            recipient=msg.get("recipient", ""),
            date=msg.get("date", ""),
            body=body,
            body_html=msg.get("body_html", ""),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
        ))

        # Build URL -> message_id map for FK linking (skip replies).
        if normalized and not subject.startswith("Re:"):
            url_to_message_id[normalized] = msg_id

    session.flush()
    actual = session.query(SentMessage).count()
    print(f"  -> {actual} sent messages migrated")
    print(f"  -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
    return url_to_message_id
|
|
|
|
|
|
def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
    """Migrate flickr_uploads.json to the flickr_uploads table with FK linking.

    Args:
        session: Open database session. Rows are added and flushed here;
            committing is left to the caller.
        url_to_message_id: Map of normalized Flickr URL -> sent message id,
            as returned by migrate_sent_messages().

    Returns:
        The number of rows in the flickr_uploads table after the flush, or 0
        when flickr_uploads.json does not exist.
    """
    if not FLICKR_UPLOADS_FILE.exists():
        print("No flickr_uploads.json found, skipping")
        return 0

    # JSON is UTF-8 by specification; pass the encoding explicitly so the
    # read does not depend on the platform's locale encoding.
    with open(FLICKR_UPLOADS_FILE, encoding="utf-8") as f:
        uploads = json.load(f)

    print(f"Migrating {len(uploads)} flickr uploads...")

    linked = 0
    for u in uploads:
        flickr_url = u.get("flickr_url", "")
        # Same guard as migrate_sent_messages: only normalize a URL that is
        # actually present.
        normalized = normalize_flickr_url(flickr_url) if flickr_url else ""

        # Look up sent message FK
        sent_message_id = url_to_message_id.get(normalized) if normalized else None

        # wikipedia_url and creator_profile_url are copied from the linked
        # message; they stay empty when there is no link.
        wikipedia_url = ""
        creator_profile_url = ""
        if sent_message_id:
            linked += 1
            msg = session.get(SentMessage, sent_message_id)
            if msg:
                wikipedia_url = msg.wikipedia_url or ""
                creator_profile_url = msg.creator_profile_url or ""

        session.add(FlickrUpload(
            pageid=u.get("pageid"),
            revid=u.get("revid"),
            title=u.get("title"),
            timestamp=u.get("timestamp"),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            creator=u.get("creator"),
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
            sent_message_id=sent_message_id,
        ))

    session.flush()
    count = session.query(FlickrUpload).count()
    print(f"  -> {count} flickr uploads migrated")
    print(f"  -> {linked} linked to sent messages")
    return count
|
|
|
|
|
|
def migrate_thumbnail_cache(session) -> int:
    """Migrate thumbnail_cache.json to the thumbnail_cache table.

    Every cached thumbnail is stored with the cache file's single
    ``timestamp`` value as its ``fetched_at``.

    Args:
        session: Open database session. Rows are added and flushed here;
            committing is left to the caller.

    Returns:
        The number of rows in the thumbnail_cache table after the flush, or 0
        when thumbnail_cache.json does not exist.
    """
    if not THUMBNAIL_CACHE_FILE.exists():
        print("No thumbnail_cache.json found, skipping")
        return 0

    # JSON is UTF-8 by specification; pass the encoding explicitly so the
    # read does not depend on the platform's locale encoding.
    with open(THUMBNAIL_CACHE_FILE, encoding="utf-8") as f:
        cache = json.load(f)

    thumbnails = cache.get("thumbnails", {})
    # "or 0" also covers a timestamp that is present but null, which would
    # otherwise make int() raise TypeError.
    cache_timestamp = int(cache.get("timestamp") or 0)

    print(f"Migrating {len(thumbnails)} cached thumbnails...")

    for title, thumb_url in thumbnails.items():
        session.add(ThumbnailCache(
            title=title,
            thumb_url=thumb_url,
            fetched_at=cache_timestamp,
        ))

    session.flush()
    count = session.query(ThumbnailCache).count()
    print(f"  -> {count} thumbnail cache entries migrated")
    return count
|
|
|
|
|
|
def main() -> None:
    """Run the one-time JSON -> database migration.

    Initializes the database, refuses to continue if any target table
    already holds rows, then runs the four migrate_* steps inside a single
    session. The session is committed only after every step succeeds; any
    exception rolls the session back and is re-raised.
    """
    print("Initializing database...")
    init_db()

    session = get_session()
    try:
        # Check if already migrated. Every target table is inspected, not
        # just contributions: a previous run with no contributions.json
        # leaves that table empty while the others are populated, and
        # re-running would then insert their rows a second time.
        targets = (
            ("contributions", Contribution),
            ("sent messages", SentMessage),
            ("flickr uploads", FlickrUpload),
            ("thumbnail cache entries", ThumbnailCache),
        )
        for label, model in targets:
            existing = session.query(model).count()
            if existing > 0:
                print(f"Database already contains {existing} {label}. Aborting.")
                print("Delete flickr_mail.db to re-run migration.")
                return

        # Sent messages must be migrated before uploads: the uploads step
        # uses the URL -> message_id map to set its foreign keys.
        migrate_contributions(session)
        url_to_message_id = migrate_sent_messages(session)
        migrate_flickr_uploads(session, url_to_message_id)
        migrate_thumbnail_cache(session)

        session.commit()
        print("\nMigration complete!")

    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
|
|
|
|
|
# Run the migration only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|