Compare commits

...

6 commits

Author SHA1 Message Date
93a4572a5d Log Wikimedia API requests 2026-05-16 09:08:30 +00:00
0188cbe0bf Show MediaWiki API error messages 2026-05-15 15:23:54 +00:00
1442620ece Handle Wikidata API errors with proper HTTP responses (429, 503)
Also retry on RequestException and raise QueryError on non-200 responses.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 15:19:02 +00:00
d75d617d11 Make mail sending resilient: optional MAIL_FROM_NAME config and swallow send errors
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 15:18:55 +00:00
808efc3cb4 Remove blocking socket.getfqdn() DNS lookup from request logging
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 15:18:47 +00:00
132b881b08 Use url_for for form action instead of hardcoded path
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 15:18:39 +00:00
6 changed files with 359 additions and 14 deletions

View file

@ -15,7 +15,8 @@ def send_to_admin(subject: str, body: str) -> None:
msg["Subject"] = subject
msg["To"] = ", ".join(app.config["ADMINS"])
msg["From"] = f'{app.config["MAIL_FROM_NAME"]} <{app.config["MAIL_FROM"]}>'
mail_from_name = app.config.get("MAIL_FROM_NAME", mail_from)
msg["From"] = f"{mail_from_name} <{mail_from}>"
msg["Date"] = formatdate()
msg["Message-ID"] = make_msgid()

View file

@ -1,5 +1,7 @@
"""Wikidata API functions."""
import os
from pathlib import Path
import typing
import urllib.parse
@ -10,10 +12,37 @@ from flask import render_template
from requests.exceptions import JSONDecodeError, RequestException
from . import headers, mail
from .wikimedia_api_logging import WikimediaApiLogConfig, WikimediaRequestTimer
wikidata_query_api_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
wikidata_api_url = "https://www.wikidata.org/w/api.php"
wd_entity = "http://www.wikidata.org/entity/Q"
commons_cat_start = "https://commons.wikimedia.org/wiki/Category:"
wikimedia_log_config = WikimediaApiLogConfig(
tool="geocode",
log_path=Path(
os.environ.get(
"GEOCODE_WIKIMEDIA_API_LOG", "/var/log/geocode/wikimedia-api.jsonl"
)
),
user_agent=headers["User-Agent"],
)
def logged_get(url: str, **kwargs: typing.Any) -> requests.Response:
"""Make a Wikimedia API request and log one JSONL metric line."""
with WikimediaRequestTimer(wikimedia_log_config, "GET", url) as timer:
r = requests.get(url, **kwargs)
timer.log_response(r.status_code, r.url)
return r
def logged_post(url: str, **kwargs: typing.Any) -> requests.Response:
"""Make a Wikimedia API request and log one JSONL metric line."""
with WikimediaRequestTimer(wikimedia_log_config, "POST", url) as timer:
r = requests.post(url, **kwargs)
timer.log_response(r.status_code, r.url)
return r
def giveup(details: backoff.types.Details) -> None:
@ -21,7 +50,10 @@ def giveup(details: backoff.types.Details) -> None:
last_exception = details["exception"] # type: ignore
if last_exception and isinstance(last_exception, APIResponseError):
body = f"Error making Wikidata API call\n\n{last_exception.response.text}"
mail.send_to_admin("Geocode error", body)
try:
mail.send_to_admin("Geocode error", body)
except Exception:
pass
class QueryError(Exception):
@ -29,6 +61,7 @@ class QueryError(Exception):
def __init__(self, query: str, r: requests.Response):
"""Init."""
super().__init__(query, r)
self.query = query
self.r = r
@ -36,12 +69,33 @@ class QueryError(Exception):
class APIResponseError(Exception):
"""Custom exception for API errors with response text."""
def __init__(self, message: str, response: requests.Response):
def __init__(
self, message: str, response: requests.Response, detail: str | None = None
):
"""Init."""
super().__init__(message)
self.message = message
self.detail = detail or message
self.response = response
def mediawiki_error_message(
response: requests.Response, json_data: dict[str, typing.Any] | None = None
) -> str:
"""Extract the useful error message from a MediaWiki API response."""
if json_data and isinstance(json_data.get("error"), dict):
error = json_data["error"]
if isinstance(error.get("info"), str):
return typing.cast(str, error["info"])
if isinstance(error.get("code"), str):
return typing.cast(str, error["code"])
if response.text:
return response.text.strip()
return response.reason or f"HTTP {response.status_code}"
@backoff.on_exception(
backoff.expo,
(RequestException, APIResponseError),
@ -52,12 +106,23 @@ def api_call(params: dict[str, str | int]) -> dict[str, typing.Any]:
"""Wikidata API call."""
api_params: dict[str, str | int] = {"format": "json", "formatversion": 2, **params}
try:
r = requests.get(
"https://www.wikidata.org/w/api.php", params=api_params, headers=headers
r = logged_get(
wikidata_api_url,
params=api_params,
headers=headers,
)
return typing.cast(dict[str, typing.Any], r.json())
json_data = typing.cast(dict[str, typing.Any], r.json())
except JSONDecodeError:
raise APIResponseError("Failed to decode JSON", r)
detail = mediawiki_error_message(r)
message = f"Wikidata API error (HTTP {r.status_code}): {detail}"
raise APIResponseError(message, r, detail)
if not r.ok or "error" in json_data:
detail = mediawiki_error_message(r, json_data)
message = f"Wikidata API error (HTTP {r.status_code}): {detail}"
raise APIResponseError(message, r, detail)
return json_data
def get_entity(qid: str) -> dict[str, typing.Any] | None:
@ -107,13 +172,16 @@ Row = dict[str, dict[str, typing.Any]]
Hit = dict[str, str | int | None]
@backoff.on_exception(backoff.expo, QueryError, max_tries=5)
@backoff.on_exception(backoff.expo, (QueryError, RequestException), max_tries=5)
def wdqs(query: str) -> list[Row]:
"""Pass query to the Wikidata Query Service."""
r = requests.post(
r = logged_post(
wikidata_query_api_url, data={"query": query, "format": "json"}, headers=headers
)
if not r.ok:
raise QueryError(query, r)
try:
return typing.cast(list[Row], r.json()["results"]["bindings"])
except requests.exceptions.JSONDecodeError:

View file

@ -0,0 +1,169 @@
"""JSONL logging helpers for Wikimedia API request metrics."""
import json
import logging
import os
import socket
import time
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from types import TracebackType
from urllib.parse import parse_qs, urlparse
@dataclass(frozen=True)
class WikimediaApiLogConfig:
"""Configuration for Wikimedia API request logging."""
tool: str
log_path: Path
user_agent: str
@dataclass(frozen=True)
class WikimediaApiRequestMetric:
"""Details of one Wikimedia API request."""
tool: str
url: str
method: str
status_code: int | None
elapsed_ms: int
user_agent: str
error: str | None = None
_logger_cache: dict[Path, logging.Logger] = {}
def setup_wikimedia_api_logger(log_path: Path) -> logging.Logger:
"""Create a JSONL logger for Wikimedia API request metrics."""
if log_path in _logger_cache:
return _logger_cache[log_path]
logger_name = f"wikimedia_api_metrics.{log_path}"
logger = logging.getLogger(logger_name)
logger.setLevel(logging.INFO)
logger.propagate = False
if not logger.handlers:
try:
log_path.parent.mkdir(parents=True, exist_ok=True)
handler: logging.Handler = logging.FileHandler(log_path)
except OSError:
handler = logging.NullHandler()
handler.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(handler)
_logger_cache[log_path] = logger
return logger
def get_mediawiki_action(url: str) -> str | None:
"""Extract the MediaWiki API action from a URL, if present."""
parsed = urlparse(url)
query = parse_qs(parsed.query)
values = query.get("action")
if not values:
return None
return values[0]
def build_log_record(metric: WikimediaApiRequestMetric) -> dict[str, object]:
"""Build a JSON-serialisable log record for one API request."""
parsed = urlparse(metric.url)
record: dict[str, object] = {
"ts": datetime.now(UTC).isoformat(),
"tool": metric.tool,
"host": socket.gethostname(),
"pid": os.getpid(),
"method": metric.method,
"api_host": parsed.netloc,
"path": parsed.path,
"action": get_mediawiki_action(metric.url),
"status_code": metric.status_code,
"elapsed_ms": metric.elapsed_ms,
"user_agent": metric.user_agent,
}
if metric.error is not None:
record["error"] = metric.error
return record
def log_wikimedia_api_request(
logger: logging.Logger,
metric: WikimediaApiRequestMetric,
) -> None:
"""Write one Wikimedia API request metric as a JSONL log line."""
record = build_log_record(metric)
logger.info(json.dumps(record, separators=(",", ":"), sort_keys=True))
class WikimediaRequestTimer:
"""Context manager for timing and logging a Wikimedia API request."""
def __init__(
self,
config: WikimediaApiLogConfig,
method: str,
url: str,
) -> None:
self.config = config
self.method = method
self.url = url
self.started = 0.0
self.logger = setup_wikimedia_api_logger(config.log_path)
def __enter__(self) -> "WikimediaRequestTimer":
"""Start timing a request."""
self.started = time.monotonic()
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc: BaseException | None,
traceback: TracebackType | None,
) -> bool:
"""Log failed requests when an exception escapes."""
if exc is None:
return False
elapsed_ms = int((time.monotonic() - self.started) * 1000)
log_wikimedia_api_request(
self.logger,
WikimediaApiRequestMetric(
tool=self.config.tool,
url=self.url,
method=self.method,
status_code=None,
elapsed_ms=elapsed_ms,
user_agent=self.config.user_agent,
error=type(exc).__name__,
),
)
return False
def log_response(self, status_code: int, final_url: str | None = None) -> None:
"""Log a completed request."""
elapsed_ms = int((time.monotonic() - self.started) * 1000)
log_wikimedia_api_request(
self.logger,
WikimediaApiRequestMetric(
tool=self.config.tool,
url=final_url or self.url,
method=self.method,
status_code=status_code,
elapsed_ms=elapsed_ms,
user_agent=self.config.user_agent,
),
)

View file

@ -5,12 +5,12 @@ import inspect
import json
import random
import re
import socket
import sys
import traceback
import typing
from time import time
import requests.exceptions
import sqlalchemy.exc
import werkzeug.debug.tbtools
from flask import Flask, jsonify, redirect, render_template, request, url_for
@ -376,7 +376,33 @@ def index() -> str | Response:
return jsonify(coords={"lat": lat, "lon": lon}, error=error_msg)
needs_commons = request.args.get("needs_commons", "true").lower() != "false"
result = lat_lon_to_wikidata(lat, lon, needs_commons=needs_commons)["result"]
try:
result = lat_lon_to_wikidata(lat, lon, needs_commons=needs_commons)["result"]
except (wikidata.QueryError, wikidata.APIResponseError) as e:
r = e.r if isinstance(e, wikidata.QueryError) else e.response
if r.status_code == 429:
extra = (
{"Retry-After": r.headers["Retry-After"]}
if "Retry-After" in r.headers
else {}
)
error = (
e.detail
if isinstance(e, wikidata.APIResponseError)
else wikidata.mediawiki_error_message(r)
)
return jsonify(error=error), 429, extra
if isinstance(e, wikidata.APIResponseError):
return jsonify(error=e.message), 503
return (
jsonify(
error=f"Wikidata query failed (HTTP {r.status_code}), "
+ "please try again later"
),
503,
)
except requests.exceptions.RequestException:
return jsonify(error="Could not connect to Wikidata, please try again later"), 503
result.pop("element", None)
result.pop("geojson", None)
if logging_enabled:
@ -385,7 +411,7 @@ def index() -> str | Response:
lat=lat,
lon=lon,
remote_addr=remote_addr,
fqdn=socket.getfqdn(remote_addr) if remote_addr else None,
fqdn=None,
result=result,
response_time_ms=int((time() - t0) * 1000),
)
@ -448,6 +474,10 @@ def build_detail_page(lat: float, lon: float, needs_commons: bool = True) -> str
except wikidata.QueryError as e:
query, r = e.args
return render_template("query_error.html", lat=lat, lon=lon, query=query, r=r)
except wikidata.APIResponseError as e:
return render_template(
"query_error.html", lat=lat, lon=lon, error=e.message, r=e.response
)
element = reply["result"].pop("element", None)
geojson = reply["result"].pop("geojson", None)

View file

@ -40,7 +40,7 @@
<h1 class="mb-1">Geocode to Commons Category</h1>
<p class="text-muted mb-3">Convert latitude/longitude to a Wikidata item and Wikimedia Commons category.</p>
<form class="d-flex gap-2 align-items-center mb-3" action="/">
<form class="d-flex gap-2 align-items-center mb-3" action="{{ url_for('index') }}">
<label for="q" class="text-nowrap text-muted small">Lat, Lon</label>
<input id="q" name="q" class="form-control form-control-sm" style="max-width:18rem" placeholder="e.g. 54.375, -2.999">
<button type="submit" class="btn btn-sm btn-primary">Go</button>

View file

@ -1,8 +1,19 @@
import json
from pathlib import Path
import pytest
import pytest_mock
import requests
import responses
from geocode.wikidata import APIResponseError, QueryError, api_call, wdqs
from geocode import headers
from geocode.wikimedia_api_logging import WikimediaApiLogConfig
from geocode.wikidata import (
APIResponseError,
QueryError,
api_call,
mediawiki_error_message,
wdqs,
)
max_tries = 5
@ -53,6 +64,72 @@ def test_api_call_retries_on_connection_error(
assert mocked_sleep.call_count == max_tries - 1
@responses.activate
def test_api_call_uses_mediawiki_error_message(
mocker: pytest_mock.plugin.MockerFixture,
) -> None:
"""Test MediaWiki API error messages are preserved."""
mocker.patch("time.sleep", return_value=None)
mocker.patch("geocode.mail.send_to_admin")
responses.add(
responses.GET,
"https://www.wikidata.org/w/api.php",
json={"error": {"code": "ratelimited", "info": "Too many requests"}},
status=429,
headers={"Retry-After": "10"},
)
with pytest.raises(APIResponseError) as exc_info:
api_call({"action": "wbgetentities", "ids": "Q42"})
assert exc_info.value.detail == "Too many requests"
assert str(exc_info.value) == "Wikidata API error (HTTP 429): Too many requests"
def test_mediawiki_error_message_falls_back_to_response_text() -> None:
"""Test plain-text MediaWiki API errors are preserved."""
response = requests.Response()
response.status_code = 429
response.reason = "Too Many Requests"
response._content = b"Please slow down"
assert mediawiki_error_message(response) == "Please slow down"
@responses.activate
def test_api_call_logs_wikimedia_request(
mocker: pytest_mock.plugin.MockerFixture, tmp_path: Path
) -> None:
"""Test Wikimedia API requests are logged as JSONL metrics."""
log_path = tmp_path / "wikimedia-api.jsonl"
mocker.patch(
"geocode.wikidata.wikimedia_log_config",
WikimediaApiLogConfig(
tool="geocode",
log_path=log_path,
user_agent=headers["User-Agent"],
),
)
responses.add(
responses.GET,
"https://www.wikidata.org/w/api.php",
json={"entities": {"Q42": {"id": "Q42"}}},
status=200,
)
api_call({"action": "wbgetentities", "ids": "Q42"})
record = json.loads(log_path.read_text().strip())
assert record["tool"] == "geocode"
assert record["method"] == "GET"
assert record["api_host"] == "www.wikidata.org"
assert record["path"] == "/w/api.php"
assert record["action"] == "wbgetentities"
assert record["status_code"] == 200
assert record["user_agent"] == headers["User-Agent"]
def test_wdqs_retry(mocker: pytest_mock.plugin.MockerFixture) -> None:
"""Test retry for WDQS API calls."""
# Patch 'time.sleep' to instantly return, effectively skipping the sleep