Add interaction logging and tighten model NOT NULL constraints

Log searches (article/category) and message-generation events to a new interaction_log table, capturing IP address and User-Agent. Also apply NOT NULL constraints to Contribution, SentMessage, FlickrUpload, and ThumbnailCache fields that are always populated, and remove stale continue_token references from category.html.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-08 12:34:04 +00:00 · 2026-02-08 12:34:04 +00:00 · 08f5128e8d
commit 08f5128e8d
parent 252a854e76
5 changed files with 160 additions and 76 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@ -92,6 +92,8 @@ those obtained via Flickr mail requests.
 - `flickr_uploads`: derived table built by `update_flickr_uploads.py` by
  matching Commons uploads to Flickr URLs
 - `thumbnail_cache`: cached Commons API thumbnail URLs (7-day TTL)
+- `interaction_log`: written by the web app to record searches and message
+  generation events (see below)

 **Key functions**:
 - `get_recent_commons_uploads()`: Loads uploads, filters by sent mail match,
@ -115,6 +117,24 @@ Builds/updates `flickr_uploads` from `contributions` and links to
 - Extracts Flickr URL from contribution comment when present
 - Falls back to Commons `extmetadata.Credit` lookup when comment has no URL

+### Interaction Logging (`log_interaction`)
+
+The `log_interaction()` helper writes a row to `interaction_log` on each
+meaningful user action:
+
+- `"search_article"` – user submits a Wikipedia article search (page 1 only,
+  to avoid logging every pagination hit)
+- `"search_category"` – user submits a Wikipedia category search
+- `"generate_message"` – a non-free CC message is generated after clicking a photo
+
+Each row captures: Unix `timestamp`, `interaction_type`, `ip_address` (first
+`X-Forwarded-For` entry if present, else the remote address), `user_agent`,
+`query` (article title or category name), and optionally `flickr_url` / `wikipedia_url`.
+
+The table is created by `init_db()`, which any of the maintenance scripts
+call, or manually: `python3 -c "from flickr_mail.database import init_db; init_db()"`.
+The web app never calls `init_db()` itself.
+
 ### Category Search (`/category` route)

 Finds Wikipedia articles in a category that don't have images.
--- a/README.md
+++ b/README.md
@ -79,6 +79,18 @@ cp download_sent_mail.example.json download_sent_mail.local.json
 Then edit `download_sent_mail.local.json` and set `cookies_str` to your full
 Flickr `Cookie` header value.

+## Interaction Logging
+
+The app logs searches and message generation to the `interaction_log` table:
+
+- `search_article`: when a user searches for a Wikipedia article title (page 1 only)
+- `search_category`: when a user searches a Wikipedia category
+- `generate_message`: when a non-free CC message is generated for a photo
+
+Each row records the timestamp, interaction type, client IP (from
+`X-Forwarded-For` if present), User-Agent, query, and (for message events)
+the Flickr and Wikipedia URLs.
+
 ## Notes

 - `download_commons_contributions.py` uses an overlap window of known-only
--- a/flickr_mail/models.py
+++ b/flickr_mail/models.py
@ -12,20 +12,20 @@ class Contribution(Base):
    __tablename__ = "contributions"

    id: Mapped[int] = mapped_column(primary_key=True)
-    userid: Mapped[int | None]
-    user: Mapped[str | None]
-    pageid: Mapped[int | None]
-    revid: Mapped[int | None] = mapped_column(unique=True)
-    parentid: Mapped[int | None]
-    ns: Mapped[int | None]
-    title: Mapped[str | None]
-    timestamp: Mapped[str | None]
+    userid: Mapped[int]
+    user: Mapped[str]
+    pageid: Mapped[int]
+    revid: Mapped[int] = mapped_column(unique=True)
+    parentid: Mapped[int]
+    ns: Mapped[int]
+    title: Mapped[str]
+    timestamp: Mapped[str]
    minor: Mapped[str | None]
    top: Mapped[str | None]
-    comment: Mapped[str | None] = mapped_column(Text)
-    size: Mapped[int | None]
-    sizediff: Mapped[int | None]
-    tags: Mapped[str | None] = mapped_column(Text)  # JSON array stored as text
+    comment: Mapped[str] = mapped_column(Text)
+    size: Mapped[int]
+    sizediff: Mapped[int]
+    tags: Mapped[str] = mapped_column(Text)  # JSON array stored as text

    __table_args__ = (
        Index("ix_contributions_timestamp", "timestamp"),
@ -37,16 +37,16 @@ class SentMessage(Base):
    __tablename__ = "sent_messages"

    message_id: Mapped[str] = mapped_column(primary_key=True)
-    subject: Mapped[str | None]
-    url: Mapped[str | None]
-    recipient: Mapped[str | None]
-    date: Mapped[str | None]
-    body: Mapped[str | None] = mapped_column(Text)
-    body_html: Mapped[str | None] = mapped_column(Text)
-    flickr_url: Mapped[str | None]
-    normalized_flickr_url: Mapped[str | None]
-    wikipedia_url: Mapped[str | None]
-    creator_profile_url: Mapped[str | None]
+    subject: Mapped[str]
+    url: Mapped[str]
+    recipient: Mapped[str]
+    date: Mapped[str]
+    body: Mapped[str] = mapped_column(Text)
+    body_html: Mapped[str] = mapped_column(Text)
+    flickr_url: Mapped[str]
+    normalized_flickr_url: Mapped[str]
+    wikipedia_url: Mapped[str]
+    creator_profile_url: Mapped[str]

    flickr_uploads: Mapped[list["FlickrUpload"]] = relationship(
        back_populates="sent_message"
@ -62,15 +62,15 @@ class FlickrUpload(Base):
    __tablename__ = "flickr_uploads"

    id: Mapped[int] = mapped_column(primary_key=True)
-    pageid: Mapped[int | None]
-    revid: Mapped[int | None]
-    title: Mapped[str | None]
-    timestamp: Mapped[str | None]
-    flickr_url: Mapped[str | None]
-    normalized_flickr_url: Mapped[str | None]
+    pageid: Mapped[int]
+    revid: Mapped[int]
+    title: Mapped[str]
+    timestamp: Mapped[str]
+    flickr_url: Mapped[str]
+    normalized_flickr_url: Mapped[str]
    creator: Mapped[str | None]
-    wikipedia_url: Mapped[str | None]
-    creator_profile_url: Mapped[str | None]
+    wikipedia_url: Mapped[str]
+    creator_profile_url: Mapped[str]
    sent_message_id: Mapped[str | None] = mapped_column(
        ForeignKey("sent_messages.message_id")
    )
@ -89,5 +89,23 @@ class ThumbnailCache(Base):
    __tablename__ = "thumbnail_cache"

    title: Mapped[str] = mapped_column(primary_key=True)
-    thumb_url: Mapped[str | None]
-    fetched_at: Mapped[int | None]  # Unix timestamp
+    thumb_url: Mapped[str]
+    fetched_at: Mapped[int]  # Unix timestamp
+
+
+class InteractionLog(Base):
+    __tablename__ = "interaction_log"
+
+    id: Mapped[int] = mapped_column(primary_key=True)
+    timestamp: Mapped[int]  # Unix timestamp
+    interaction_type: Mapped[str]  # "search_article", "search_category", "generate_message"
+    ip_address: Mapped[str | None]
+    user_agent: Mapped[str | None] = mapped_column(Text)
+    query: Mapped[str | None]  # search term or category name
+    flickr_url: Mapped[str | None]
+    wikipedia_url: Mapped[str | None]
+
+    __table_args__ = (
+        Index("ix_interaction_log_timestamp", "timestamp"),
+        Index("ix_interaction_log_type", "interaction_type"),
+    )
--- a/main.py
+++ b/main.py
@ -18,7 +18,7 @@ from sqlalchemy import func
 from werkzeug.debug.tbtools import DebugTraceback

 from flickr_mail.database import get_session
-from flickr_mail.models import FlickrUpload, SentMessage, ThumbnailCache
+from flickr_mail.models import FlickrUpload, InteractionLog, SentMessage, ThumbnailCache
 from flickr_mail.url_utils import extract_urls_from_message, normalize_flickr_url

 import re
@ -378,15 +378,12 @@ def has_content_image(images: list[dict]) -> bool:
    return False


-def get_articles_without_images(
-    category: str, limit: int = 100
-) -> tuple[list[ArticleWithoutImage], str | None]:
+def get_articles_without_images(category: str) -> list[ArticleWithoutImage]:
    """Get articles in a category that don't have images.

    Uses generator=categorymembers with prop=images to efficiently check
-    multiple articles in a single API request.
-
-    Returns a tuple of (articles_list, continue_token).
+    multiple articles in a single API request, following continuation until
+    all category members have been processed.
    """
    params = {
        "action": "query",
@ -394,25 +391,30 @@ def get_articles_without_images(
        "gcmtitle": category,
        "gcmtype": "page",  # Only articles, not subcategories or files
        "gcmnamespace": "0",  # Main namespace only
-        "gcmlimit": str(limit),
+        "gcmlimit": "max",
        "prop": "images",
        "imlimit": "max",  # Need enough to check all pages in batch
        "format": "json",
    }

    headers = {"User-Agent": WIKIMEDIA_USER_AGENT}
+    articles_without_images: list[ArticleWithoutImage] = []
+    continue_token: str | None = None
+
+    while True:
+        request_params = params.copy()
+        if continue_token:
+            request_params["gcmcontinue"] = continue_token

        try:
            response = requests.get(
-            WIKIPEDIA_API, params=params, headers=headers, timeout=30
+                WIKIPEDIA_API, params=request_params, headers=headers, timeout=30
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, json.JSONDecodeError) as e:
            print(f"Wikipedia API error: {e}")
-        return [], None
-
-    articles_without_images: list[ArticleWithoutImage] = []
+            break

        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
@ -430,13 +432,13 @@ def get_articles_without_images(
                    ArticleWithoutImage(title=title, pageid=pageid)
                )

+        continue_token = data.get("continue", {}).get("gcmcontinue")
+        if not continue_token:
+            break
+
    # Sort by title for consistent display
    articles_without_images.sort(key=lambda a: a.title)
-
-    # Get continue token if there are more results
-    continue_token = data.get("continue", {}).get("gcmcontinue")
-
-    return articles_without_images, continue_token
+    return articles_without_images


 def is_valid_flickr_image_url(url: str) -> bool:
@ -583,6 +585,33 @@ def parse_flickr_search_results(html: str, page: int = 1) -> SearchResult:
    )


+def log_interaction(
+    interaction_type: str,
+    query: str | None = None,
+    flickr_url: str | None = None,
+    wikipedia_url: str | None = None,
+) -> None:
+    """Log a user interaction to the database."""
+    forwarded_for = flask.request.headers.get("X-Forwarded-For")
+    ip_address = forwarded_for.split(",")[0].strip() if forwarded_for else flask.request.remote_addr
+    user_agent = flask.request.headers.get("User-Agent")
+    session = get_session()
+    try:
+        entry = InteractionLog(
+            timestamp=int(time.time()),
+            interaction_type=interaction_type,
+            ip_address=ip_address,
+            user_agent=user_agent,
+            query=query,
+            flickr_url=flickr_url,
+            wikipedia_url=wikipedia_url,
+        )
+        session.add(entry)
+        session.commit()
+    finally:
+        session.close()
+
+
@app.errorhandler(werkzeug.exceptions.InternalServerError)
 def exception_handler(e: werkzeug.exceptions.InternalServerError) -> tuple[str, int]:
    """Handle exception."""
@ -656,6 +685,8 @@ def start() -> str:
        # Search Flickr for photos
        page = flask.request.args.get("page", 1, type=int)
        page = max(1, page)  # Ensure page is at least 1
+        if page == 1:
+            log_interaction("search_article", query=name, wikipedia_url=wikipedia_url)
        search_result = search_flickr(name, page)
        return flask.render_template(
            "combined.html",
@ -718,6 +749,13 @@ def start() -> str:
            previous_messages=previous_messages,
        )

+    log_interaction(
+        "generate_message",
+        query=name,
+        flickr_url=flickr_url,
+        wikipedia_url=wikipedia_url,
+    )
+
    msg = flask.render_template(
        "message.jinja",
        flickr_url=flickr_url,
@ -768,7 +806,8 @@ def category_search() -> str:
            cat=cat,
        )

-    articles, continue_token = get_articles_without_images(category)
+    log_interaction("search_category", query=category)
+    articles = get_articles_without_images(category)

    # Get the display name (without Category: prefix)
    category_name = category.replace("Category:", "")
@ -779,7 +818,6 @@ def category_search() -> str:
        category=category,
        category_name=category_name,
        articles=articles,
-        continue_token=continue_token,
    )


--- a/templates/category.html
+++ b/templates/category.html
@ -33,7 +33,7 @@
      <h5>Articles without images in <a href="https://en.wikipedia.org/wiki/{{ category | replace(' ', '_') }}" target="_blank">{{ category_name }}</a></h5>

      {% if articles %}
-      <p class="text-muted small">Found {{ articles | length }} article(s) without images{% if continue_token %} (more available){% endif %}</p>
+      <p class="text-muted small">Found {{ articles | length }} article(s) without images</p>

      <div class="list-group">
        {% for article in articles %}
@ -44,10 +44,6 @@
        {% endfor %}
      </div>

-      {% if continue_token %}
-      <p class="text-muted small mt-3">Note: Only showing first batch of results. More articles may be available in this category.</p>
-      {% endif %}
-
      {% else %}
      <div class="alert alert-success mt-3">
        All articles in this category have images!