From 132b881b089c95a4216cd4493b38d3b99ac0fec3 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Fri, 15 May 2026 15:18:39 +0000 Subject: [PATCH 1/6] Use url_for for form action instead of hardcoded path Co-Authored-By: Claude Sonnet 4.6 --- templates/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/index.html b/templates/index.html index c65c911..05d2187 100644 --- a/templates/index.html +++ b/templates/index.html @@ -40,7 +40,7 @@

Geocode to Commons Category

Convert latitude/longitude to a Wikidata item and Wikimedia Commons category.

-
+ From 808efc3cb4a97825aecf5040163478a38fb3641a Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Fri, 15 May 2026 15:18:47 +0000 Subject: [PATCH 2/6] Remove blocking socket.getfqdn() DNS lookup from request logging Co-Authored-By: Claude Sonnet 4.6 --- lookup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lookup.py b/lookup.py index 76c1745..0759406 100755 --- a/lookup.py +++ b/lookup.py @@ -5,7 +5,6 @@ import inspect import json import random import re -import socket import sys import traceback import typing @@ -385,7 +384,7 @@ def index() -> str | Response: lat=lat, lon=lon, remote_addr=remote_addr, - fqdn=socket.getfqdn(remote_addr) if remote_addr else None, + fqdn=None, result=result, response_time_ms=int((time() - t0) * 1000), ) From d75d617d1115b33db81b25ca73ed2228ca9c90b9 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Fri, 15 May 2026 15:18:55 +0000 Subject: [PATCH 3/6] Make mail sending resilient: optional MAIL_FROM_NAME config and swallow send errors Co-Authored-By: Claude Sonnet 4.6 --- geocode/mail.py | 3 ++- geocode/wikidata.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/geocode/mail.py b/geocode/mail.py index f9586fa..136a373 100644 --- a/geocode/mail.py +++ b/geocode/mail.py @@ -15,7 +15,8 @@ def send_to_admin(subject: str, body: str) -> None: msg["Subject"] = subject msg["To"] = ", ".join(app.config["ADMINS"]) - msg["From"] = f'{app.config["MAIL_FROM_NAME"]} <{app.config["MAIL_FROM"]}>' + mail_from_name = app.config.get("MAIL_FROM_NAME", mail_from) + msg["From"] = f"{mail_from_name} <{mail_from}>" msg["Date"] = formatdate() msg["Message-ID"] = make_msgid() diff --git a/geocode/wikidata.py b/geocode/wikidata.py index 14ea217..e9d4828 100644 --- a/geocode/wikidata.py +++ b/geocode/wikidata.py @@ -21,7 +21,10 @@ def giveup(details: backoff.types.Details) -> None: last_exception = details["exception"] # type: ignore if last_exception and isinstance(last_exception, APIResponseError): body = f"Error making Wikidata API call\n\n{last_exception.response.text}" - mail.send_to_admin("Geocode error", body) + try: + mail.send_to_admin("Geocode error", body) + except Exception: + pass class QueryError(Exception): From 1442620ece0e3a83c3b76615dcf4d75916d2551f Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Fri, 15 May 2026 15:19:02 +0000 Subject: [PATCH 4/6] Handle Wikidata API errors with proper HTTP responses (429, 503) Also retry on RequestException and raise QueryError on non-200 responses. Co-Authored-By: Claude Sonnet 4.6 --- geocode/wikidata.py | 5 ++++- lookup.py | 12 +++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/geocode/wikidata.py b/geocode/wikidata.py index e9d4828..6145050 100644 --- a/geocode/wikidata.py +++ b/geocode/wikidata.py @@ -110,13 +110,16 @@ Row = dict[str, dict[str, typing.Any]] Hit = dict[str, str | int | None] -@backoff.on_exception(backoff.expo, QueryError, max_tries=5) +@backoff.on_exception(backoff.expo, (QueryError, RequestException), max_tries=5) def wdqs(query: str) -> list[Row]: """Pass query to the Wikidata Query Service.""" r = requests.post( wikidata_query_api_url, data={"query": query, "format": "json"}, headers=headers ) + if not r.ok: + raise QueryError(query, r) + try: return typing.cast(list[Row], r.json()["results"]["bindings"]) except requests.exceptions.JSONDecodeError: diff --git a/lookup.py b/lookup.py index 0759406..bebe7a3 100755 --- a/lookup.py +++ b/lookup.py @@ -10,6 +10,7 @@ import traceback import typing from time import time +import requests.exceptions import sqlalchemy.exc import werkzeug.debug.tbtools from flask import Flask, jsonify, redirect, render_template, request, url_for @@ -375,7 +376,16 @@ def index() -> str | Response: return jsonify(coords={"lat": lat, "lon": lon}, error=error_msg) needs_commons = request.args.get("needs_commons", "true").lower() != "false" - result = lat_lon_to_wikidata(lat, lon, needs_commons=needs_commons)["result"] + try: + result = lat_lon_to_wikidata(lat, lon, needs_commons=needs_commons)["result"] + except (wikidata.QueryError, wikidata.APIResponseError) as e: + r = e.r if isinstance(e, wikidata.QueryError) else e.response + if r.status_code == 429: + extra = {"Retry-After": r.headers["Retry-After"]} if "Retry-After" in r.headers else {} + return jsonify(error="Rate limited by Wikidata, please try again later"), 429, extra + return jsonify(error=f"Wikidata query failed (HTTP {r.status_code}), please try again later"), 503 + except requests.exceptions.RequestException: + return jsonify(error="Could not connect to Wikidata, please try again later"), 503 result.pop("element", None) result.pop("geojson", None) if logging_enabled: From 0188cbe0bf9550fca6cfb53d4373ed3a1cdf7898 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Fri, 15 May 2026 15:23:54 +0000 Subject: [PATCH 5/6] Show MediaWiki API error messages --- geocode/wikidata.py | 37 +++++++++++++++++++++++++++++++--- lookup.py | 27 ++++++++++++++++++++++--- tests/test_wikidata_api.py | 41 +++++++++++++++++++++++++++++++++++++- 3 files changed, 98 insertions(+), 7 deletions(-) diff --git a/geocode/wikidata.py b/geocode/wikidata.py index 6145050..6af0707 100644 --- a/geocode/wikidata.py +++ b/geocode/wikidata.py @@ -32,6 +32,7 @@ class QueryError(Exception): def __init__(self, query: str, r: requests.Response): """Init.""" + super().__init__(query, r) self.query = query self.r = r @@ -39,12 +40,33 @@ class QueryError(Exception): class APIResponseError(Exception): """Custom exception for API errors with response text.""" - def __init__(self, message: str, response: requests.Response): + def __init__( + self, message: str, response: requests.Response, detail: str | None = None + ): """Init.""" super().__init__(message) + self.message = message + self.detail = detail or message self.response = response +def mediawiki_error_message( + response: requests.Response, json_data: dict[str, typing.Any] | None = None +) -> str: + """Extract the useful error message from a MediaWiki API response.""" + if json_data and isinstance(json_data.get("error"), dict): + error = json_data["error"] + if isinstance(error.get("info"), str): + return typing.cast(str, error["info"]) + if isinstance(error.get("code"), str): + return typing.cast(str, error["code"]) + + if response.text: + return response.text.strip() + + return response.reason or f"HTTP {response.status_code}" + + @backoff.on_exception( backoff.expo, (RequestException, APIResponseError), @@ -58,9 +80,18 @@ def api_call(params: dict[str, str | int]) -> dict[str, typing.Any]: r = requests.get( "https://www.wikidata.org/w/api.php", params=api_params, headers=headers ) - return typing.cast(dict[str, typing.Any], r.json()) + json_data = typing.cast(dict[str, typing.Any], r.json()) except JSONDecodeError: - raise APIResponseError("Failed to decode JSON", r) + detail = mediawiki_error_message(r) + message = f"Wikidata API error (HTTP {r.status_code}): {detail}" + raise APIResponseError(message, r, detail) + + if not r.ok or "error" in json_data: + detail = mediawiki_error_message(r, json_data) + message = f"Wikidata API error (HTTP {r.status_code}): {detail}" + raise APIResponseError(message, r, detail) + + return json_data def get_entity(qid: str) -> dict[str, typing.Any] | None: diff --git a/lookup.py b/lookup.py index bebe7a3..9531063 100755 --- a/lookup.py +++ b/lookup.py @@ -381,9 +381,26 @@ def index() -> str | Response: except (wikidata.QueryError, wikidata.APIResponseError) as e: r = e.r if isinstance(e, wikidata.QueryError) else e.response if r.status_code == 429: - extra = {"Retry-After": r.headers["Retry-After"]} if "Retry-After" in r.headers else {} - return jsonify(error="Rate limited by Wikidata, please try again later"), 429, extra - return jsonify(error=f"Wikidata query failed (HTTP {r.status_code}), please try again later"), 503 + extra = ( + {"Retry-After": r.headers["Retry-After"]} + if "Retry-After" in r.headers + else {} + ) + error = ( + e.detail + if isinstance(e, wikidata.APIResponseError) + else wikidata.mediawiki_error_message(r) + ) + return jsonify(error=error), 429, extra + if isinstance(e, wikidata.APIResponseError): + return jsonify(error=e.message), 503 + return ( + jsonify( + error=f"Wikidata query failed (HTTP {r.status_code}), " + + "please try again later" + ), + 503, + ) except requests.exceptions.RequestException: return jsonify(error="Could not connect to Wikidata, please try again later"), 503 result.pop("element", None) @@ -457,6 +474,10 @@ def build_detail_page(lat: float, lon: float, needs_commons: bool = True) -> str except wikidata.QueryError as e: query, r = e.args return render_template("query_error.html", lat=lat, lon=lon, query=query, r=r) + except wikidata.APIResponseError as e: + return render_template( + "query_error.html", lat=lat, lon=lon, error=e.message, r=e.response + ) element = reply["result"].pop("element", None) geojson = reply["result"].pop("geojson", None) diff --git a/tests/test_wikidata_api.py b/tests/test_wikidata_api.py index 9877e2a..df630a5 100644 --- a/tests/test_wikidata_api.py +++ b/tests/test_wikidata_api.py @@ -2,7 +2,13 @@ import pytest import pytest_mock import requests import responses -from geocode.wikidata import APIResponseError, QueryError, api_call, wdqs +from geocode.wikidata import ( + APIResponseError, + QueryError, + api_call, + mediawiki_error_message, + wdqs, +) max_tries = 5 @@ -53,6 +59,39 @@ def test_api_call_retries_on_connection_error( assert mocked_sleep.call_count == max_tries - 1 +@responses.activate +def test_api_call_uses_mediawiki_error_message( + mocker: pytest_mock.plugin.MockerFixture, +) -> None: + """Test MediaWiki API error messages are preserved.""" + mocker.patch("time.sleep", return_value=None) + mocker.patch("geocode.mail.send_to_admin") + + responses.add( + responses.GET, + "https://www.wikidata.org/w/api.php", + json={"error": {"code": "ratelimited", "info": "Too many requests"}}, + status=429, + headers={"Retry-After": "10"}, + ) + + with pytest.raises(APIResponseError) as exc_info: + api_call({"action": "wbgetentities", "ids": "Q42"}) + + assert exc_info.value.detail == "Too many requests" + assert str(exc_info.value) == "Wikidata API error (HTTP 429): Too many requests" + + +def test_mediawiki_error_message_falls_back_to_response_text() -> None: + """Test plain-text MediaWiki API errors are preserved.""" + response = requests.Response() + response.status_code = 429 + response.reason = "Too Many Requests" + response._content = b"Please slow down" + + assert mediawiki_error_message(response) == "Please slow down" + + def test_wdqs_retry(mocker: pytest_mock.plugin.MockerFixture) -> None: """Test retry for WDQS API calls.""" # Patch 'time.sleep' to instantly return, effectively skipping the sleep From 93a4572a5d9dd6d8d81ab80d18cbc90bd3ed863a Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 16 May 2026 09:08:30 +0000 Subject: [PATCH 6/6] Log Wikimedia API requests --- geocode/wikidata.py | 37 ++++++- geocode/wikimedia_api_logging.py | 169 +++++++++++++++++++++++++++++++ tests/test_wikidata_api.py | 38 +++++++ 3 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 geocode/wikimedia_api_logging.py diff --git a/geocode/wikidata.py b/geocode/wikidata.py index 6af0707..5f7c736 100644 --- a/geocode/wikidata.py +++ b/geocode/wikidata.py @@ -1,5 +1,7 @@ """Wikidata API functions.""" +import os +from pathlib import Path import typing import urllib.parse @@ -10,10 +12,37 @@ from flask import render_template from requests.exceptions import JSONDecodeError, RequestException from . import headers, mail +from .wikimedia_api_logging import WikimediaApiLogConfig, WikimediaRequestTimer wikidata_query_api_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql" +wikidata_api_url = "https://www.wikidata.org/w/api.php" wd_entity = "http://www.wikidata.org/entity/Q" commons_cat_start = "https://commons.wikimedia.org/wiki/Category:" +wikimedia_log_config = WikimediaApiLogConfig( + tool="geocode", + log_path=Path( + os.environ.get( + "GEOCODE_WIKIMEDIA_API_LOG", "/var/log/geocode/wikimedia-api.jsonl" + ) + ), + user_agent=headers["User-Agent"], +) + + +def logged_get(url: str, **kwargs: typing.Any) -> requests.Response: + """Make a Wikimedia API request and log one JSONL metric line.""" + with WikimediaRequestTimer(wikimedia_log_config, "GET", url) as timer: + r = requests.get(url, **kwargs) + timer.log_response(r.status_code, r.url) + return r + + +def logged_post(url: str, **kwargs: typing.Any) -> requests.Response: + """Make a Wikimedia API request and log one JSONL metric line.""" + with WikimediaRequestTimer(wikimedia_log_config, "POST", url) as timer: + r = requests.post(url, **kwargs) + timer.log_response(r.status_code, r.url) + return r def giveup(details: backoff.types.Details) -> None: @@ -77,8 +106,10 @@ def api_call(params: dict[str, str | int]) -> dict[str, typing.Any]: """Wikidata API call.""" api_params: dict[str, str | int] = {"format": "json", "formatversion": 2, **params} try: - r = requests.get( - "https://www.wikidata.org/w/api.php", params=api_params, headers=headers + r = logged_get( + wikidata_api_url, + params=api_params, + headers=headers, ) json_data = typing.cast(dict[str, typing.Any], r.json()) except JSONDecodeError: @@ -144,7 +175,7 @@ Hit = dict[str, str | int | None] @backoff.on_exception(backoff.expo, (QueryError, RequestException), max_tries=5) def wdqs(query: str) -> list[Row]: """Pass query to the Wikidata Query Service.""" - r = requests.post( + r = logged_post( wikidata_query_api_url, data={"query": query, "format": "json"}, headers=headers ) diff --git a/geocode/wikimedia_api_logging.py b/geocode/wikimedia_api_logging.py new file mode 100644 index 0000000..e4ce9f4 --- /dev/null +++ b/geocode/wikimedia_api_logging.py @@ -0,0 +1,169 @@ +"""JSONL logging helpers for Wikimedia API request metrics.""" + +import json +import logging +import os +import socket +import time +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from types import TracebackType +from urllib.parse import parse_qs, urlparse + + +@dataclass(frozen=True) +class WikimediaApiLogConfig: + """Configuration for Wikimedia API request logging.""" + + tool: str + log_path: Path + user_agent: str + + +@dataclass(frozen=True) +class WikimediaApiRequestMetric: + """Details of one Wikimedia API request.""" + + tool: str + url: str + method: str + status_code: int | None + elapsed_ms: int + user_agent: str + error: str | None = None + + +_logger_cache: dict[Path, logging.Logger] = {} + + +def setup_wikimedia_api_logger(log_path: Path) -> logging.Logger: + """Create a JSONL logger for Wikimedia API request metrics.""" + if log_path in _logger_cache: + return _logger_cache[log_path] + + logger_name = f"wikimedia_api_metrics.{log_path}" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + logger.propagate = False + + if not logger.handlers: + try: + log_path.parent.mkdir(parents=True, exist_ok=True) + handler: logging.Handler = logging.FileHandler(log_path) + except OSError: + handler = logging.NullHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(handler) + + _logger_cache[log_path] = logger + return logger + + +def get_mediawiki_action(url: str) -> str | None: + """Extract the MediaWiki API action from a URL, if present.""" + parsed = urlparse(url) + query = parse_qs(parsed.query) + values = query.get("action") + + if not values: + return None + + return values[0] + + +def build_log_record(metric: WikimediaApiRequestMetric) -> dict[str, object]: + """Build a JSON-serialisable log record for one API request.""" + parsed = urlparse(metric.url) + + record: dict[str, object] = { + "ts": datetime.now(UTC).isoformat(), + "tool": metric.tool, + "host": socket.gethostname(), + "pid": os.getpid(), + "method": metric.method, + "api_host": parsed.netloc, + "path": parsed.path, + "action": get_mediawiki_action(metric.url), + "status_code": metric.status_code, + "elapsed_ms": metric.elapsed_ms, + "user_agent": metric.user_agent, + } + + if metric.error is not None: + record["error"] = metric.error + + return record + + +def log_wikimedia_api_request( + logger: logging.Logger, + metric: WikimediaApiRequestMetric, +) -> None: + """Write one Wikimedia API request metric as a JSONL log line.""" + record = build_log_record(metric) + logger.info(json.dumps(record, separators=(",", ":"), sort_keys=True)) + + +class WikimediaRequestTimer: + """Context manager for timing and logging a Wikimedia API request.""" + + def __init__( + self, + config: WikimediaApiLogConfig, + method: str, + url: str, + ) -> None: + self.config = config + self.method = method + self.url = url + self.started = 0.0 + self.logger = setup_wikimedia_api_logger(config.log_path) + + def __enter__(self) -> "WikimediaRequestTimer": + """Start timing a request.""" + self.started = time.monotonic() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: TracebackType | None, + ) -> bool: + """Log failed requests when an exception escapes.""" + if exc is None: + return False + + elapsed_ms = int((time.monotonic() - self.started) * 1000) + + log_wikimedia_api_request( + self.logger, + WikimediaApiRequestMetric( + tool=self.config.tool, + url=self.url, + method=self.method, + status_code=None, + elapsed_ms=elapsed_ms, + user_agent=self.config.user_agent, + error=type(exc).__name__, + ), + ) + + return False + + def log_response(self, status_code: int, final_url: str | None = None) -> None: + """Log a completed request.""" + elapsed_ms = int((time.monotonic() - self.started) * 1000) + + log_wikimedia_api_request( + self.logger, + WikimediaApiRequestMetric( + tool=self.config.tool, + url=final_url or self.url, + method=self.method, + status_code=status_code, + elapsed_ms=elapsed_ms, + user_agent=self.config.user_agent, + ), + ) diff --git a/tests/test_wikidata_api.py b/tests/test_wikidata_api.py index df630a5..5f5ae4f 100644 --- a/tests/test_wikidata_api.py +++ b/tests/test_wikidata_api.py @@ -1,7 +1,12 @@ +import json +from pathlib import Path + import pytest import pytest_mock import requests import responses +from geocode import headers +from geocode.wikimedia_api_logging import WikimediaApiLogConfig from geocode.wikidata import ( APIResponseError, QueryError, @@ -92,6 +97,39 @@ def test_mediawiki_error_message_falls_back_to_response_text() -> None: assert mediawiki_error_message(response) == "Please slow down" +@responses.activate +def test_api_call_logs_wikimedia_request( + mocker: pytest_mock.plugin.MockerFixture, tmp_path: Path +) -> None: + """Test Wikimedia API requests are logged as JSONL metrics.""" + log_path = tmp_path / "wikimedia-api.jsonl" + mocker.patch( + "geocode.wikidata.wikimedia_log_config", + WikimediaApiLogConfig( + tool="geocode", + log_path=log_path, + user_agent=headers["User-Agent"], + ), + ) + responses.add( + responses.GET, + "https://www.wikidata.org/w/api.php", + json={"entities": {"Q42": {"id": "Q42"}}}, + status=200, + ) + + api_call({"action": "wbgetentities", "ids": "Q42"}) + + record = json.loads(log_path.read_text().strip()) + assert record["tool"] == "geocode" + assert record["method"] == "GET" + assert record["api_host"] == "www.wikidata.org" + assert record["path"] == "/w/api.php" + assert record["action"] == "wbgetentities" + assert record["status_code"] == 200 + assert record["user_agent"] == headers["User-Agent"] + + def test_wdqs_retry(mocker: pytest_mock.plugin.MockerFixture) -> None: """Test retry for WDQS API calls.""" # Patch 'time.sleep' to instantly return, effectively skipping the sleep