Extract flickr_mail package with Mapped models and shared utilities
Move from JSON file storage to SQLite database using SQLAlchemy with Mapped type hints. Deduplicate URL utility functions into shared flickr_mail package. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ac1b01ea68
commit
9f0fb01878
11 changed files with 1129 additions and 300 deletions
233
migrate_json_to_db.py
Normal file
233
migrate_json_to_db.py
Normal file
|
|
@ -0,0 +1,233 @@
|
|||
#!/usr/bin/env python3
|
||||
"""One-time migration from JSON files to SQLite database."""
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from flickr_mail.database import init_db, get_session
|
||||
from flickr_mail.models import Contribution, FlickrUpload, SentMessage, ThumbnailCache
|
||||
from flickr_mail.url_utils import (
|
||||
creator_profile_from_flickr_url,
|
||||
extract_urls_from_message,
|
||||
normalize_flickr_url,
|
||||
)
|
||||
|
||||
COMMONS_DIR = Path(__file__).parent / "commons_contributions"
|
||||
SENT_MAIL_DIR = Path(__file__).parent / "sent_mail" / "messages"
|
||||
SENT_MAIL_INDEX = Path(__file__).parent / "sent_mail" / "messages_index.json"
|
||||
CONTRIBUTIONS_FILE = COMMONS_DIR / "contributions.json"
|
||||
FLICKR_UPLOADS_FILE = COMMONS_DIR / "flickr_uploads.json"
|
||||
THUMBNAIL_CACHE_FILE = COMMONS_DIR / "thumbnail_cache.json"
|
||||
|
||||
|
||||
def migrate_contributions(session) -> int:
    """Migrate contributions.json to contributions table."""
    if not CONTRIBUTIONS_FILE.exists():
        print("No contributions.json found, skipping")
        return 0

    with open(CONTRIBUTIONS_FILE) as fh:
        payload = json.load(fh)

    records = payload.get("contributions", [])
    print(f"Migrating {len(records)} contributions...")

    # JSON keys that map one-to-one onto Contribution columns.
    simple_fields = (
        "userid", "user", "pageid", "revid", "parentid", "ns", "title",
        "timestamp", "minor", "top", "comment", "size", "sizediff",
    )
    for record in records:
        kwargs = {field: record.get(field) for field in simple_fields}
        # tags is a list in JSON; stored as a JSON-encoded string column.
        kwargs["tags"] = json.dumps(record.get("tags", []))
        session.add(Contribution(**kwargs))

    session.flush()
    migrated = session.query(Contribution).count()
    print(f" -> {migrated} contributions migrated")
    return migrated
|
||||
|
||||
|
||||
def migrate_sent_messages(session) -> dict[str, str]:
    """Migrate sent messages to sent_messages table.

    Reads the messages index, loads each full message file when present
    (falling back to the index metadata otherwise), extracts flickr and
    wikipedia URLs from the body, and inserts one SentMessage row per
    message.

    Returns a dict of normalized_flickr_url -> message_id for FK linking.
    """
    if not SENT_MAIL_INDEX.exists():
        print("No messages_index.json found, skipping")
        return {}

    with open(SENT_MAIL_INDEX) as f:
        index = json.load(f)

    print(f"Migrating {len(index)} sent messages...")

    url_to_message_id: dict[str, str] = {}

    for msg_meta in index:
        msg_id = msg_meta.get("message_id", "")
        if not msg_id:
            continue

        # Load the full message from its individual file; fall back to the
        # (smaller) index entry when the file is missing.
        msg_file = SENT_MAIL_DIR / f"{msg_id}.json"
        if msg_file.exists():
            with open(msg_file) as f:
                msg = json.load(f)
        else:
            msg = msg_meta

        body = msg.get("body", "")
        subject = msg.get("subject", "")

        # Extract URLs from body
        flickr_url, wikipedia_url = extract_urls_from_message(body)
        normalized = normalize_flickr_url(flickr_url) if flickr_url else ""

        # Extract creator profile URL
        creator_profile_url = creator_profile_from_flickr_url(flickr_url) if flickr_url else ""

        session.add(SentMessage(
            message_id=msg_id,
            # Reuse the value read above (it was previously fetched twice).
            subject=subject,
            url=msg.get("url", ""),
            recipient=msg.get("recipient", ""),
            date=msg.get("date", ""),
            body=body,
            body_html=msg.get("body_html", ""),
            flickr_url=flickr_url,
            normalized_flickr_url=normalized,
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
        ))

        # Build URL -> message_id map for FK linking (skip replies)
        if normalized and not subject.startswith("Re:"):
            url_to_message_id[normalized] = msg_id

    # NOTE: the reported count comes from the DB query below; the old
    # hand-maintained `count` accumulator was dead code and is removed.
    session.flush()
    actual = session.query(SentMessage).count()
    print(f" -> {actual} sent messages migrated")
    print(f" -> {len(url_to_message_id)} unique flickr URLs indexed for FK linking")
    return url_to_message_id
|
||||
|
||||
|
||||
def migrate_flickr_uploads(session, url_to_message_id: dict[str, str]) -> int:
    """Migrate flickr_uploads.json to flickr_uploads table with FK linking."""
    if not FLICKR_UPLOADS_FILE.exists():
        print("No flickr_uploads.json found, skipping")
        return 0

    with open(FLICKR_UPLOADS_FILE) as fh:
        uploads = json.load(fh)

    print(f"Migrating {len(uploads)} flickr uploads...")

    linked = 0
    for upload in uploads:
        raw_url = upload.get("flickr_url", "")
        canonical = normalize_flickr_url(raw_url)

        # Look up sent message FK
        sent_message_id = url_to_message_id.get(canonical) if canonical else None

        # Copy wikipedia/creator-profile URLs from the linked message, if any.
        wikipedia_url = ""
        creator_profile_url = ""
        if sent_message_id:
            linked += 1
            linked_msg = session.get(SentMessage, sent_message_id)
            if linked_msg:
                wikipedia_url = linked_msg.wikipedia_url or ""
                creator_profile_url = linked_msg.creator_profile_url or ""

        session.add(FlickrUpload(
            pageid=upload.get("pageid"),
            revid=upload.get("revid"),
            title=upload.get("title"),
            timestamp=upload.get("timestamp"),
            flickr_url=raw_url,
            normalized_flickr_url=canonical,
            creator=upload.get("creator"),
            wikipedia_url=wikipedia_url,
            creator_profile_url=creator_profile_url,
            sent_message_id=sent_message_id,
        ))

    session.flush()
    total = session.query(FlickrUpload).count()
    print(f" -> {total} flickr uploads migrated")
    print(f" -> {linked} linked to sent messages")
    return total
|
||||
|
||||
|
||||
def migrate_thumbnail_cache(session) -> int:
    """Migrate thumbnail_cache.json to thumbnail_cache table."""
    if not THUMBNAIL_CACHE_FILE.exists():
        print("No thumbnail_cache.json found, skipping")
        return 0

    with open(THUMBNAIL_CACHE_FILE) as fh:
        cache = json.load(fh)

    entries = cache.get("thumbnails", {})
    # The JSON cache stores a single fetch timestamp for the whole file;
    # every migrated row shares it.
    fetched_at = int(cache.get("timestamp", 0))

    print(f"Migrating {len(entries)} cached thumbnails...")

    session.add_all(
        ThumbnailCache(title=title, thumb_url=url, fetched_at=fetched_at)
        for title, url in entries.items()
    )

    session.flush()
    migrated = session.query(ThumbnailCache).count()
    print(f" -> {migrated} thumbnail cache entries migrated")
    return migrated
|
||||
|
||||
|
||||
def main() -> None:
    """Initialize the database and run every JSON -> SQLite migration.

    Aborts without writing anything if the contributions table is already
    populated; commits all migrations in a single transaction, rolling
    back on any failure.
    """
    print("Initializing database...")
    init_db()

    session = get_session()
    try:
        # Check if already migrated
        existing = session.query(Contribution).count()
        if existing > 0:
            print(f"Database already contains {existing} contributions. Aborting.")
            print("Delete flickr_mail.db to re-run migration.")
            return

        migrate_contributions(session)
        # Sent messages must run before uploads: the returned URL index is
        # what links uploads to their originating message.
        sent_index = migrate_sent_messages(session)
        migrate_flickr_uploads(session, sent_index)
        migrate_thumbnail_cache(session)

        session.commit()
        print("\nMigration complete!")
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
# Script entry point: run the one-time migration when executed directly.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue