diff --git a/geocode/mail.py b/geocode/mail.py index f9586fa..136a373 100644 --- a/geocode/mail.py +++ b/geocode/mail.py @@ -15,7 +15,8 @@ def send_to_admin(subject: str, body: str) -> None: msg["Subject"] = subject msg["To"] = ", ".join(app.config["ADMINS"]) - msg["From"] = f'{app.config["MAIL_FROM_NAME"]} <{app.config["MAIL_FROM"]}>' + mail_from_name = app.config.get("MAIL_FROM_NAME", mail_from) + msg["From"] = f"{mail_from_name} <{mail_from}>" msg["Date"] = formatdate() msg["Message-ID"] = make_msgid() diff --git a/geocode/wikidata.py b/geocode/wikidata.py index 14ea217..5f7c736 100644 --- a/geocode/wikidata.py +++ b/geocode/wikidata.py @@ -1,5 +1,7 @@ """Wikidata API functions.""" +import os +from pathlib import Path import typing import urllib.parse @@ -10,10 +12,37 @@ from flask import render_template from requests.exceptions import JSONDecodeError, RequestException from . import headers, mail +from .wikimedia_api_logging import WikimediaApiLogConfig, WikimediaRequestTimer wikidata_query_api_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql" +wikidata_api_url = "https://www.wikidata.org/w/api.php" wd_entity = "http://www.wikidata.org/entity/Q" commons_cat_start = "https://commons.wikimedia.org/wiki/Category:" +wikimedia_log_config = WikimediaApiLogConfig( + tool="geocode", + log_path=Path( + os.environ.get( + "GEOCODE_WIKIMEDIA_API_LOG", "/var/log/geocode/wikimedia-api.jsonl" + ) + ), + user_agent=headers["User-Agent"], +) + + +def logged_get(url: str, **kwargs: typing.Any) -> requests.Response: + """Make a Wikimedia API request and log one JSONL metric line.""" + with WikimediaRequestTimer(wikimedia_log_config, "GET", url) as timer: + r = requests.get(url, **kwargs) + timer.log_response(r.status_code, r.url) + return r + + +def logged_post(url: str, **kwargs: typing.Any) -> requests.Response: + """Make a Wikimedia API request and log one JSONL metric line.""" + with WikimediaRequestTimer(wikimedia_log_config, "POST", url) as timer: + r = requests.post(url, **kwargs) + timer.log_response(r.status_code, r.url) + return r def giveup(details: backoff.types.Details) -> None: @@ -21,7 +50,10 @@ def giveup(details: backoff.types.Details) -> None: last_exception = details["exception"] # type: ignore if last_exception and isinstance(last_exception, APIResponseError): body = f"Error making Wikidata API call\n\n{last_exception.response.text}" - mail.send_to_admin("Geocode error", body) + try: + mail.send_to_admin("Geocode error", body) + except Exception: + pass class QueryError(Exception): @@ -29,6 +61,7 @@ class QueryError(Exception): def __init__(self, query: str, r: requests.Response): """Init.""" + super().__init__(query, r) self.query = query self.r = r @@ -36,12 +69,33 @@ class QueryError(Exception): class APIResponseError(Exception): """Custom exception for API errors with response text.""" - def __init__(self, message: str, response: requests.Response): + def __init__( + self, message: str, response: requests.Response, detail: str | None = None + ): """Init.""" super().__init__(message) + self.message = message + self.detail = detail or message self.response = response +def mediawiki_error_message( + response: requests.Response, json_data: dict[str, typing.Any] | None = None +) -> str: + """Extract the useful error message from a MediaWiki API response.""" + if json_data and isinstance(json_data.get("error"), dict): + error = json_data["error"] + if isinstance(error.get("info"), str): + return typing.cast(str, error["info"]) + if isinstance(error.get("code"), str): + return typing.cast(str, error["code"]) + + if response.text: + return response.text.strip() + + return response.reason or f"HTTP {response.status_code}" + + @backoff.on_exception( backoff.expo, (RequestException, APIResponseError), @@ -52,12 +106,23 @@ def api_call(params: dict[str, str | int]) -> dict[str, typing.Any]: """Wikidata API call.""" api_params: dict[str, str | int] = {"format": "json", "formatversion": 2, **params} try: - r = requests.get( - "https://www.wikidata.org/w/api.php", params=api_params, headers=headers + r = logged_get( + wikidata_api_url, + params=api_params, + headers=headers, ) - return typing.cast(dict[str, typing.Any], r.json()) + json_data = typing.cast(dict[str, typing.Any], r.json()) except JSONDecodeError: - raise APIResponseError("Failed to decode JSON", r) + detail = mediawiki_error_message(r) + message = f"Wikidata API error (HTTP {r.status_code}): {detail}" + raise APIResponseError(message, r, detail) + + if not r.ok or "error" in json_data: + detail = mediawiki_error_message(r, json_data) + message = f"Wikidata API error (HTTP {r.status_code}): {detail}" + raise APIResponseError(message, r, detail) + + return json_data def get_entity(qid: str) -> dict[str, typing.Any] | None: @@ -107,13 +172,16 @@ Row = dict[str, dict[str, typing.Any]] Hit = dict[str, str | int | None] -@backoff.on_exception(backoff.expo, QueryError, max_tries=5) +@backoff.on_exception(backoff.expo, (QueryError, RequestException), max_tries=5) def wdqs(query: str) -> list[Row]: """Pass query to the Wikidata Query Service.""" - r = requests.post( + r = logged_post( wikidata_query_api_url, data={"query": query, "format": "json"}, headers=headers ) + if not r.ok: + raise QueryError(query, r) + try: return typing.cast(list[Row], r.json()["results"]["bindings"]) except requests.exceptions.JSONDecodeError: diff --git a/geocode/wikimedia_api_logging.py b/geocode/wikimedia_api_logging.py new file mode 100644 index 0000000..e4ce9f4 --- /dev/null +++ b/geocode/wikimedia_api_logging.py @@ -0,0 +1,169 @@ +"""JSONL logging helpers for Wikimedia API request metrics.""" + +import json +import logging +import os +import socket +import time +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from types import TracebackType +from urllib.parse import parse_qs, urlparse + + +@dataclass(frozen=True) +class WikimediaApiLogConfig: + """Configuration for Wikimedia API request logging.""" + + tool: str + log_path: Path + user_agent: str + + +@dataclass(frozen=True) +class WikimediaApiRequestMetric: + """Details of one Wikimedia API request.""" + + tool: str + url: str + method: str + status_code: int | None + elapsed_ms: int + user_agent: str + error: str | None = None + + +_logger_cache: dict[Path, logging.Logger] = {} + + +def setup_wikimedia_api_logger(log_path: Path) -> logging.Logger: + """Create a JSONL logger for Wikimedia API request metrics.""" + if log_path in _logger_cache: + return _logger_cache[log_path] + + logger_name = f"wikimedia_api_metrics.{log_path}" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + logger.propagate = False + + if not logger.handlers: + try: + log_path.parent.mkdir(parents=True, exist_ok=True) + handler: logging.Handler = logging.FileHandler(log_path) + except OSError: + handler = logging.NullHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(handler) + + _logger_cache[log_path] = logger + return logger + + +def get_mediawiki_action(url: str) -> str | None: + """Extract the MediaWiki API action from a URL, if present.""" + parsed = urlparse(url) + query = parse_qs(parsed.query) + values = query.get("action") + + if not values: + return None + + return values[0] + + +def build_log_record(metric: WikimediaApiRequestMetric) -> dict[str, object]: + """Build a JSON-serialisable log record for one API request.""" + parsed = urlparse(metric.url) + + record: dict[str, object] = { + "ts": datetime.now(UTC).isoformat(), + "tool": metric.tool, + "host": socket.gethostname(), + "pid": os.getpid(), + "method": metric.method, + "api_host": parsed.netloc, + "path": parsed.path, + "action": get_mediawiki_action(metric.url), + "status_code": metric.status_code, + "elapsed_ms": metric.elapsed_ms, + "user_agent": metric.user_agent, + } + + if metric.error is not None: + record["error"] = metric.error + + return record + + +def log_wikimedia_api_request( + logger: logging.Logger, + metric: WikimediaApiRequestMetric, +) -> None: + """Write one Wikimedia API request metric as a JSONL log line.""" + record = build_log_record(metric) + logger.info(json.dumps(record, separators=(",", ":"), sort_keys=True)) + + +class WikimediaRequestTimer: + """Context manager for timing and logging a Wikimedia API request.""" + + def __init__( + self, + config: WikimediaApiLogConfig, + method: str, + url: str, + ) -> None: + self.config = config + self.method = method + self.url = url + self.started = 0.0 + self.logger = setup_wikimedia_api_logger(config.log_path) + + def __enter__(self) -> "WikimediaRequestTimer": + """Start timing a request.""" + self.started = time.monotonic() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: TracebackType | None, + ) -> bool: + """Log failed requests when an exception escapes.""" + if exc is None: + return False + + elapsed_ms = int((time.monotonic() - self.started) * 1000) + + log_wikimedia_api_request( + self.logger, + WikimediaApiRequestMetric( + tool=self.config.tool, + url=self.url, + method=self.method, + status_code=None, + elapsed_ms=elapsed_ms, + user_agent=self.config.user_agent, + error=type(exc).__name__, + ), + ) + + return False + + def log_response(self, status_code: int, final_url: str | None = None) -> None: + """Log a completed request.""" + elapsed_ms = int((time.monotonic() - self.started) * 1000) + + log_wikimedia_api_request( + self.logger, + WikimediaApiRequestMetric( + tool=self.config.tool, + url=final_url or self.url, + method=self.method, + status_code=status_code, + elapsed_ms=elapsed_ms, + user_agent=self.config.user_agent, + ), + ) diff --git a/lookup.py b/lookup.py index 76c1745..9531063 100755 --- a/lookup.py +++ b/lookup.py @@ -5,12 +5,12 @@ import inspect import json import random import re -import socket import sys import traceback import typing from time import time +import requests.exceptions import sqlalchemy.exc import werkzeug.debug.tbtools from flask import Flask, jsonify, redirect, render_template, request, url_for @@ -376,7 +376,33 @@ def index() -> str | Response: return jsonify(coords={"lat": lat, "lon": lon}, error=error_msg) needs_commons = request.args.get("needs_commons", "true").lower() != "false" - result = lat_lon_to_wikidata(lat, lon, needs_commons=needs_commons)["result"] + try: + result = lat_lon_to_wikidata(lat, lon, needs_commons=needs_commons)["result"] + except (wikidata.QueryError, wikidata.APIResponseError) as e: + r = e.r if isinstance(e, wikidata.QueryError) else e.response + if r.status_code == 429: + extra = ( + {"Retry-After": r.headers["Retry-After"]} + if "Retry-After" in r.headers + else {} + ) + error = ( + e.detail + if isinstance(e, wikidata.APIResponseError) + else wikidata.mediawiki_error_message(r) + ) + return jsonify(error=error), 429, extra + if isinstance(e, wikidata.APIResponseError): + return jsonify(error=e.message), 503 + return ( + jsonify( + error=f"Wikidata query failed (HTTP {r.status_code}), " + + "please try again later" + ), + 503, + ) + except requests.exceptions.RequestException: + return jsonify(error="Could not connect to Wikidata, please try again later"), 503 result.pop("element", None) result.pop("geojson", None) if logging_enabled: @@ -385,7 +411,7 @@ def index() -> str | Response: lat=lat, lon=lon, remote_addr=remote_addr, - fqdn=socket.getfqdn(remote_addr) if remote_addr else None, + fqdn=None, result=result, response_time_ms=int((time() - t0) * 1000), ) @@ -448,6 +474,10 @@ def build_detail_page(lat: float, lon: float, needs_commons: bool = True) -> str except wikidata.QueryError as e: query, r = e.args return render_template("query_error.html", lat=lat, lon=lon, query=query, r=r) + except wikidata.APIResponseError as e: + return render_template( + "query_error.html", lat=lat, lon=lon, error=e.message, r=e.response + ) element = reply["result"].pop("element", None) geojson = reply["result"].pop("geojson", None) diff --git a/templates/index.html b/templates/index.html index c65c911..05d2187 100644 --- a/templates/index.html +++ b/templates/index.html @@ -40,7 +40,7 @@
Convert latitude/longitude to a Wikidata item and Wikimedia Commons category.
-