From 93a4572a5d9dd6d8d81ab80d18cbc90bd3ed863a Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 16 May 2026 09:08:30 +0000 Subject: [PATCH] Log Wikimedia API requests --- geocode/wikidata.py | 37 ++++++- geocode/wikimedia_api_logging.py | 169 +++++++++++++++++++++++++++++++ tests/test_wikidata_api.py | 38 +++++++ 3 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 geocode/wikimedia_api_logging.py diff --git a/geocode/wikidata.py b/geocode/wikidata.py index 6af0707..5f7c736 100644 --- a/geocode/wikidata.py +++ b/geocode/wikidata.py @@ -1,5 +1,7 @@ """Wikidata API functions.""" +import os +from pathlib import Path import typing import urllib.parse @@ -10,10 +12,37 @@ from flask import render_template from requests.exceptions import JSONDecodeError, RequestException from . import headers, mail +from .wikimedia_api_logging import WikimediaApiLogConfig, WikimediaRequestTimer wikidata_query_api_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql" +wikidata_api_url = "https://www.wikidata.org/w/api.php" wd_entity = "http://www.wikidata.org/entity/Q" commons_cat_start = "https://commons.wikimedia.org/wiki/Category:" +wikimedia_log_config = WikimediaApiLogConfig( + tool="geocode", + log_path=Path( + os.environ.get( + "GEOCODE_WIKIMEDIA_API_LOG", "/var/log/geocode/wikimedia-api.jsonl" + ) + ), + user_agent=headers["User-Agent"], +) + + +def logged_get(url: str, **kwargs: typing.Any) -> requests.Response: + """Make a Wikimedia API request and log one JSONL metric line.""" + with WikimediaRequestTimer(wikimedia_log_config, "GET", url) as timer: + r = requests.get(url, **kwargs) + timer.log_response(r.status_code, r.url) + return r + + +def logged_post(url: str, **kwargs: typing.Any) -> requests.Response: + """Make a Wikimedia API request and log one JSONL metric line.""" + with WikimediaRequestTimer(wikimedia_log_config, "POST", url) as timer: + r = requests.post(url, **kwargs) + timer.log_response(r.status_code, r.url) + return r def giveup(details: backoff.types.Details) -> None: @@ -77,8 +106,10 @@ def api_call(params: dict[str, str | int]) -> dict[str, typing.Any]: """Wikidata API call.""" api_params: dict[str, str | int] = {"format": "json", "formatversion": 2, **params} try: - r = requests.get( - "https://www.wikidata.org/w/api.php", params=api_params, headers=headers + r = logged_get( + wikidata_api_url, + params=api_params, + headers=headers, ) json_data = typing.cast(dict[str, typing.Any], r.json()) except JSONDecodeError: @@ -144,7 +175,7 @@ Hit = dict[str, str | int | None] @backoff.on_exception(backoff.expo, (QueryError, RequestException), max_tries=5) def wdqs(query: str) -> list[Row]: """Pass query to the Wikidata Query Service.""" - r = requests.post( + r = logged_post( wikidata_query_api_url, data={"query": query, "format": "json"}, headers=headers ) diff --git a/geocode/wikimedia_api_logging.py b/geocode/wikimedia_api_logging.py new file mode 100644 index 0000000..e4ce9f4 --- /dev/null +++ b/geocode/wikimedia_api_logging.py @@ -0,0 +1,169 @@ +"""JSONL logging helpers for Wikimedia API request metrics.""" + +import json +import logging +import os +import socket +import time +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from types import TracebackType +from urllib.parse import parse_qs, urlparse + + +@dataclass(frozen=True) +class WikimediaApiLogConfig: + """Configuration for Wikimedia API request logging.""" + + tool: str + log_path: Path + user_agent: str + + +@dataclass(frozen=True) +class WikimediaApiRequestMetric: + """Details of one Wikimedia API request.""" + + tool: str + url: str + method: str + status_code: int | None + elapsed_ms: int + user_agent: str + error: str | None = None + + +_logger_cache: dict[Path, logging.Logger] = {} + + +def setup_wikimedia_api_logger(log_path: Path) -> logging.Logger: + """Create a JSONL logger for Wikimedia API request metrics.""" + if log_path in _logger_cache: + return _logger_cache[log_path] + + logger_name = f"wikimedia_api_metrics.{log_path}" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + logger.propagate = False + + if not logger.handlers: + try: + log_path.parent.mkdir(parents=True, exist_ok=True) + handler: logging.Handler = logging.FileHandler(log_path) + except OSError: + handler = logging.NullHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(handler) + + _logger_cache[log_path] = logger + return logger + + +def get_mediawiki_action(url: str) -> str | None: + """Extract the MediaWiki API action from a URL, if present.""" + parsed = urlparse(url) + query = parse_qs(parsed.query) + values = query.get("action") + + if not values: + return None + + return values[0] + + +def build_log_record(metric: WikimediaApiRequestMetric) -> dict[str, object]: + """Build a JSON-serialisable log record for one API request.""" + parsed = urlparse(metric.url) + + record: dict[str, object] = { + "ts": datetime.now(UTC).isoformat(), + "tool": metric.tool, + "host": socket.gethostname(), + "pid": os.getpid(), + "method": metric.method, + "api_host": parsed.netloc, + "path": parsed.path, + "action": get_mediawiki_action(metric.url), + "status_code": metric.status_code, + "elapsed_ms": metric.elapsed_ms, + "user_agent": metric.user_agent, + } + + if metric.error is not None: + record["error"] = metric.error + + return record + + +def log_wikimedia_api_request( + logger: logging.Logger, + metric: WikimediaApiRequestMetric, +) -> None: + """Write one Wikimedia API request metric as a JSONL log line.""" + record = build_log_record(metric) + logger.info(json.dumps(record, separators=(",", ":"), sort_keys=True)) + + +class WikimediaRequestTimer: + """Context manager for timing and logging a Wikimedia API request.""" + + def __init__( + self, + config: WikimediaApiLogConfig, + method: str, + url: str, + ) -> None: + self.config = config + self.method = method + self.url = url + self.started = 0.0 + self.logger = setup_wikimedia_api_logger(config.log_path) + + def __enter__(self) -> "WikimediaRequestTimer": + """Start timing a request.""" + self.started = time.monotonic() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: TracebackType | None, + ) -> bool: + """Log failed requests when an exception escapes.""" + if exc is None: + return False + + elapsed_ms = int((time.monotonic() - self.started) * 1000) + + log_wikimedia_api_request( + self.logger, + WikimediaApiRequestMetric( + tool=self.config.tool, + url=self.url, + method=self.method, + status_code=None, + elapsed_ms=elapsed_ms, + user_agent=self.config.user_agent, + error=type(exc).__name__, + ), + ) + + return False + + def log_response(self, status_code: int, final_url: str | None = None) -> None: + """Log a completed request.""" + elapsed_ms = int((time.monotonic() - self.started) * 1000) + + log_wikimedia_api_request( + self.logger, + WikimediaApiRequestMetric( + tool=self.config.tool, + url=final_url or self.url, + method=self.method, + status_code=status_code, + elapsed_ms=elapsed_ms, + user_agent=self.config.user_agent, + ), + ) diff --git a/tests/test_wikidata_api.py b/tests/test_wikidata_api.py index df630a5..5f5ae4f 100644 --- a/tests/test_wikidata_api.py +++ b/tests/test_wikidata_api.py @@ -1,7 +1,12 @@ +import json +from pathlib import Path + import pytest import pytest_mock import requests import responses +from geocode import headers +from geocode.wikimedia_api_logging import WikimediaApiLogConfig from geocode.wikidata import ( APIResponseError, QueryError, @@ -92,6 +97,39 @@ def test_mediawiki_error_message_falls_back_to_response_text() -> None: assert mediawiki_error_message(response) == "Please slow down" +@responses.activate +def test_api_call_logs_wikimedia_request( + mocker: pytest_mock.plugin.MockerFixture, tmp_path: Path +) -> None: + """Test Wikimedia API requests are logged as JSONL metrics.""" + log_path = tmp_path / "wikimedia-api.jsonl" + mocker.patch( + "geocode.wikidata.wikimedia_log_config", + WikimediaApiLogConfig( + tool="geocode", + log_path=log_path, + user_agent=headers["User-Agent"], + ), + ) + responses.add( + responses.GET, + "https://www.wikidata.org/w/api.php", + json={"entities": {"Q42": {"id": "Q42"}}}, + status=200, + ) + + api_call({"action": "wbgetentities", "ids": "Q42"}) + + record = json.loads(log_path.read_text().strip()) + assert record["tool"] == "geocode" + assert record["method"] == "GET" + assert record["api_host"] == "www.wikidata.org" + assert record["path"] == "/w/api.php" + assert record["action"] == "wbgetentities" + assert record["status_code"] == 200 + assert record["user_agent"] == headers["User-Agent"] + + def test_wdqs_retry(mocker: pytest_mock.plugin.MockerFixture) -> None: """Test retry for WDQS API calls.""" # Patch 'time.sleep' to instantly return, effectively skipping the sleep