Compare commits
6 commits
b430411b7e
...
93a4572a5d
| Author | SHA1 | Date | |
|---|---|---|---|
| 93a4572a5d | |||
| 0188cbe0bf | |||
| 1442620ece | |||
| d75d617d11 | |||
| 808efc3cb4 | |||
| 132b881b08 |
6 changed files with 359 additions and 14 deletions
|
|
@ -15,7 +15,8 @@ def send_to_admin(subject: str, body: str) -> None:
|
|||
|
||||
msg["Subject"] = subject
|
||||
msg["To"] = ", ".join(app.config["ADMINS"])
|
||||
msg["From"] = f'{app.config["MAIL_FROM_NAME"]} <{app.config["MAIL_FROM"]}>'
|
||||
mail_from_name = app.config.get("MAIL_FROM_NAME", mail_from)
|
||||
msg["From"] = f"{mail_from_name} <{mail_from}>"
|
||||
msg["Date"] = formatdate()
|
||||
msg["Message-ID"] = make_msgid()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
"""Wikidata API functions."""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
|
@ -10,10 +12,37 @@ from flask import render_template
|
|||
from requests.exceptions import JSONDecodeError, RequestException
|
||||
|
||||
from . import headers, mail
|
||||
from .wikimedia_api_logging import WikimediaApiLogConfig, WikimediaRequestTimer
|
||||
|
||||
wikidata_query_api_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
|
||||
wikidata_api_url = "https://www.wikidata.org/w/api.php"
|
||||
wd_entity = "http://www.wikidata.org/entity/Q"
|
||||
commons_cat_start = "https://commons.wikimedia.org/wiki/Category:"
|
||||
wikimedia_log_config = WikimediaApiLogConfig(
|
||||
tool="geocode",
|
||||
log_path=Path(
|
||||
os.environ.get(
|
||||
"GEOCODE_WIKIMEDIA_API_LOG", "/var/log/geocode/wikimedia-api.jsonl"
|
||||
)
|
||||
),
|
||||
user_agent=headers["User-Agent"],
|
||||
)
|
||||
|
||||
|
||||
def logged_get(url: str, **kwargs: typing.Any) -> requests.Response:
|
||||
"""Make a Wikimedia API request and log one JSONL metric line."""
|
||||
with WikimediaRequestTimer(wikimedia_log_config, "GET", url) as timer:
|
||||
r = requests.get(url, **kwargs)
|
||||
timer.log_response(r.status_code, r.url)
|
||||
return r
|
||||
|
||||
|
||||
def logged_post(url: str, **kwargs: typing.Any) -> requests.Response:
|
||||
"""Make a Wikimedia API request and log one JSONL metric line."""
|
||||
with WikimediaRequestTimer(wikimedia_log_config, "POST", url) as timer:
|
||||
r = requests.post(url, **kwargs)
|
||||
timer.log_response(r.status_code, r.url)
|
||||
return r
|
||||
|
||||
|
||||
def giveup(details: backoff.types.Details) -> None:
|
||||
|
|
@ -21,7 +50,10 @@ def giveup(details: backoff.types.Details) -> None:
|
|||
last_exception = details["exception"] # type: ignore
|
||||
if last_exception and isinstance(last_exception, APIResponseError):
|
||||
body = f"Error making Wikidata API call\n\n{last_exception.response.text}"
|
||||
mail.send_to_admin("Geocode error", body)
|
||||
try:
|
||||
mail.send_to_admin("Geocode error", body)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
class QueryError(Exception):
|
||||
|
|
@ -29,6 +61,7 @@ class QueryError(Exception):
|
|||
|
||||
def __init__(self, query: str, r: requests.Response):
|
||||
"""Init."""
|
||||
super().__init__(query, r)
|
||||
self.query = query
|
||||
self.r = r
|
||||
|
||||
|
|
@ -36,12 +69,33 @@ class QueryError(Exception):
|
|||
class APIResponseError(Exception):
|
||||
"""Custom exception for API errors with response text."""
|
||||
|
||||
def __init__(self, message: str, response: requests.Response):
|
||||
def __init__(
|
||||
self, message: str, response: requests.Response, detail: str | None = None
|
||||
):
|
||||
"""Init."""
|
||||
super().__init__(message)
|
||||
self.message = message
|
||||
self.detail = detail or message
|
||||
self.response = response
|
||||
|
||||
|
||||
def mediawiki_error_message(
|
||||
response: requests.Response, json_data: dict[str, typing.Any] | None = None
|
||||
) -> str:
|
||||
"""Extract the useful error message from a MediaWiki API response."""
|
||||
if json_data and isinstance(json_data.get("error"), dict):
|
||||
error = json_data["error"]
|
||||
if isinstance(error.get("info"), str):
|
||||
return typing.cast(str, error["info"])
|
||||
if isinstance(error.get("code"), str):
|
||||
return typing.cast(str, error["code"])
|
||||
|
||||
if response.text:
|
||||
return response.text.strip()
|
||||
|
||||
return response.reason or f"HTTP {response.status_code}"
|
||||
|
||||
|
||||
@backoff.on_exception(
|
||||
backoff.expo,
|
||||
(RequestException, APIResponseError),
|
||||
|
|
@ -52,12 +106,23 @@ def api_call(params: dict[str, str | int]) -> dict[str, typing.Any]:
|
|||
"""Wikidata API call."""
|
||||
api_params: dict[str, str | int] = {"format": "json", "formatversion": 2, **params}
|
||||
try:
|
||||
r = requests.get(
|
||||
"https://www.wikidata.org/w/api.php", params=api_params, headers=headers
|
||||
r = logged_get(
|
||||
wikidata_api_url,
|
||||
params=api_params,
|
||||
headers=headers,
|
||||
)
|
||||
return typing.cast(dict[str, typing.Any], r.json())
|
||||
json_data = typing.cast(dict[str, typing.Any], r.json())
|
||||
except JSONDecodeError:
|
||||
raise APIResponseError("Failed to decode JSON", r)
|
||||
detail = mediawiki_error_message(r)
|
||||
message = f"Wikidata API error (HTTP {r.status_code}): {detail}"
|
||||
raise APIResponseError(message, r, detail)
|
||||
|
||||
if not r.ok or "error" in json_data:
|
||||
detail = mediawiki_error_message(r, json_data)
|
||||
message = f"Wikidata API error (HTTP {r.status_code}): {detail}"
|
||||
raise APIResponseError(message, r, detail)
|
||||
|
||||
return json_data
|
||||
|
||||
|
||||
def get_entity(qid: str) -> dict[str, typing.Any] | None:
|
||||
|
|
@ -107,13 +172,16 @@ Row = dict[str, dict[str, typing.Any]]
|
|||
Hit = dict[str, str | int | None]
|
||||
|
||||
|
||||
@backoff.on_exception(backoff.expo, QueryError, max_tries=5)
|
||||
@backoff.on_exception(backoff.expo, (QueryError, RequestException), max_tries=5)
|
||||
def wdqs(query: str) -> list[Row]:
|
||||
"""Pass query to the Wikidata Query Service."""
|
||||
r = requests.post(
|
||||
r = logged_post(
|
||||
wikidata_query_api_url, data={"query": query, "format": "json"}, headers=headers
|
||||
)
|
||||
|
||||
if not r.ok:
|
||||
raise QueryError(query, r)
|
||||
|
||||
try:
|
||||
return typing.cast(list[Row], r.json()["results"]["bindings"])
|
||||
except requests.exceptions.JSONDecodeError:
|
||||
|
|
|
|||
169
geocode/wikimedia_api_logging.py
Normal file
169
geocode/wikimedia_api_logging.py
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
"""JSONL logging helpers for Wikimedia API request metrics."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from types import TracebackType
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class WikimediaApiLogConfig:
|
||||
"""Configuration for Wikimedia API request logging."""
|
||||
|
||||
tool: str
|
||||
log_path: Path
|
||||
user_agent: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class WikimediaApiRequestMetric:
|
||||
"""Details of one Wikimedia API request."""
|
||||
|
||||
tool: str
|
||||
url: str
|
||||
method: str
|
||||
status_code: int | None
|
||||
elapsed_ms: int
|
||||
user_agent: str
|
||||
error: str | None = None
|
||||
|
||||
|
||||
_logger_cache: dict[Path, logging.Logger] = {}
|
||||
|
||||
|
||||
def setup_wikimedia_api_logger(log_path: Path) -> logging.Logger:
|
||||
"""Create a JSONL logger for Wikimedia API request metrics."""
|
||||
if log_path in _logger_cache:
|
||||
return _logger_cache[log_path]
|
||||
|
||||
logger_name = f"wikimedia_api_metrics.{log_path}"
|
||||
logger = logging.getLogger(logger_name)
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.propagate = False
|
||||
|
||||
if not logger.handlers:
|
||||
try:
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
handler: logging.Handler = logging.FileHandler(log_path)
|
||||
except OSError:
|
||||
handler = logging.NullHandler()
|
||||
handler.setFormatter(logging.Formatter("%(message)s"))
|
||||
logger.addHandler(handler)
|
||||
|
||||
_logger_cache[log_path] = logger
|
||||
return logger
|
||||
|
||||
|
||||
def get_mediawiki_action(url: str) -> str | None:
|
||||
"""Extract the MediaWiki API action from a URL, if present."""
|
||||
parsed = urlparse(url)
|
||||
query = parse_qs(parsed.query)
|
||||
values = query.get("action")
|
||||
|
||||
if not values:
|
||||
return None
|
||||
|
||||
return values[0]
|
||||
|
||||
|
||||
def build_log_record(metric: WikimediaApiRequestMetric) -> dict[str, object]:
|
||||
"""Build a JSON-serialisable log record for one API request."""
|
||||
parsed = urlparse(metric.url)
|
||||
|
||||
record: dict[str, object] = {
|
||||
"ts": datetime.now(UTC).isoformat(),
|
||||
"tool": metric.tool,
|
||||
"host": socket.gethostname(),
|
||||
"pid": os.getpid(),
|
||||
"method": metric.method,
|
||||
"api_host": parsed.netloc,
|
||||
"path": parsed.path,
|
||||
"action": get_mediawiki_action(metric.url),
|
||||
"status_code": metric.status_code,
|
||||
"elapsed_ms": metric.elapsed_ms,
|
||||
"user_agent": metric.user_agent,
|
||||
}
|
||||
|
||||
if metric.error is not None:
|
||||
record["error"] = metric.error
|
||||
|
||||
return record
|
||||
|
||||
|
||||
def log_wikimedia_api_request(
|
||||
logger: logging.Logger,
|
||||
metric: WikimediaApiRequestMetric,
|
||||
) -> None:
|
||||
"""Write one Wikimedia API request metric as a JSONL log line."""
|
||||
record = build_log_record(metric)
|
||||
logger.info(json.dumps(record, separators=(",", ":"), sort_keys=True))
|
||||
|
||||
|
||||
class WikimediaRequestTimer:
|
||||
"""Context manager for timing and logging a Wikimedia API request."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: WikimediaApiLogConfig,
|
||||
method: str,
|
||||
url: str,
|
||||
) -> None:
|
||||
self.config = config
|
||||
self.method = method
|
||||
self.url = url
|
||||
self.started = 0.0
|
||||
self.logger = setup_wikimedia_api_logger(config.log_path)
|
||||
|
||||
def __enter__(self) -> "WikimediaRequestTimer":
|
||||
"""Start timing a request."""
|
||||
self.started = time.monotonic()
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc: BaseException | None,
|
||||
traceback: TracebackType | None,
|
||||
) -> bool:
|
||||
"""Log failed requests when an exception escapes."""
|
||||
if exc is None:
|
||||
return False
|
||||
|
||||
elapsed_ms = int((time.monotonic() - self.started) * 1000)
|
||||
|
||||
log_wikimedia_api_request(
|
||||
self.logger,
|
||||
WikimediaApiRequestMetric(
|
||||
tool=self.config.tool,
|
||||
url=self.url,
|
||||
method=self.method,
|
||||
status_code=None,
|
||||
elapsed_ms=elapsed_ms,
|
||||
user_agent=self.config.user_agent,
|
||||
error=type(exc).__name__,
|
||||
),
|
||||
)
|
||||
|
||||
return False
|
||||
|
||||
def log_response(self, status_code: int, final_url: str | None = None) -> None:
|
||||
"""Log a completed request."""
|
||||
elapsed_ms = int((time.monotonic() - self.started) * 1000)
|
||||
|
||||
log_wikimedia_api_request(
|
||||
self.logger,
|
||||
WikimediaApiRequestMetric(
|
||||
tool=self.config.tool,
|
||||
url=final_url or self.url,
|
||||
method=self.method,
|
||||
status_code=status_code,
|
||||
elapsed_ms=elapsed_ms,
|
||||
user_agent=self.config.user_agent,
|
||||
),
|
||||
)
|
||||
36
lookup.py
36
lookup.py
|
|
@ -5,12 +5,12 @@ import inspect
|
|||
import json
|
||||
import random
|
||||
import re
|
||||
import socket
|
||||
import sys
|
||||
import traceback
|
||||
import typing
|
||||
from time import time
|
||||
|
||||
import requests.exceptions
|
||||
import sqlalchemy.exc
|
||||
import werkzeug.debug.tbtools
|
||||
from flask import Flask, jsonify, redirect, render_template, request, url_for
|
||||
|
|
@ -376,7 +376,33 @@ def index() -> str | Response:
|
|||
return jsonify(coords={"lat": lat, "lon": lon}, error=error_msg)
|
||||
|
||||
needs_commons = request.args.get("needs_commons", "true").lower() != "false"
|
||||
result = lat_lon_to_wikidata(lat, lon, needs_commons=needs_commons)["result"]
|
||||
try:
|
||||
result = lat_lon_to_wikidata(lat, lon, needs_commons=needs_commons)["result"]
|
||||
except (wikidata.QueryError, wikidata.APIResponseError) as e:
|
||||
r = e.r if isinstance(e, wikidata.QueryError) else e.response
|
||||
if r.status_code == 429:
|
||||
extra = (
|
||||
{"Retry-After": r.headers["Retry-After"]}
|
||||
if "Retry-After" in r.headers
|
||||
else {}
|
||||
)
|
||||
error = (
|
||||
e.detail
|
||||
if isinstance(e, wikidata.APIResponseError)
|
||||
else wikidata.mediawiki_error_message(r)
|
||||
)
|
||||
return jsonify(error=error), 429, extra
|
||||
if isinstance(e, wikidata.APIResponseError):
|
||||
return jsonify(error=e.message), 503
|
||||
return (
|
||||
jsonify(
|
||||
error=f"Wikidata query failed (HTTP {r.status_code}), "
|
||||
+ "please try again later"
|
||||
),
|
||||
503,
|
||||
)
|
||||
except requests.exceptions.RequestException:
|
||||
return jsonify(error="Could not connect to Wikidata, please try again later"), 503
|
||||
result.pop("element", None)
|
||||
result.pop("geojson", None)
|
||||
if logging_enabled:
|
||||
|
|
@ -385,7 +411,7 @@ def index() -> str | Response:
|
|||
lat=lat,
|
||||
lon=lon,
|
||||
remote_addr=remote_addr,
|
||||
fqdn=socket.getfqdn(remote_addr) if remote_addr else None,
|
||||
fqdn=None,
|
||||
result=result,
|
||||
response_time_ms=int((time() - t0) * 1000),
|
||||
)
|
||||
|
|
@ -448,6 +474,10 @@ def build_detail_page(lat: float, lon: float, needs_commons: bool = True) -> str
|
|||
except wikidata.QueryError as e:
|
||||
query, r = e.args
|
||||
return render_template("query_error.html", lat=lat, lon=lon, query=query, r=r)
|
||||
except wikidata.APIResponseError as e:
|
||||
return render_template(
|
||||
"query_error.html", lat=lat, lon=lon, error=e.message, r=e.response
|
||||
)
|
||||
|
||||
element = reply["result"].pop("element", None)
|
||||
geojson = reply["result"].pop("geojson", None)
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@
|
|||
<h1 class="mb-1">Geocode to Commons Category</h1>
|
||||
<p class="text-muted mb-3">Convert latitude/longitude to a Wikidata item and Wikimedia Commons category.</p>
|
||||
|
||||
<form class="d-flex gap-2 align-items-center mb-3" action="/">
|
||||
<form class="d-flex gap-2 align-items-center mb-3" action="{{ url_for('index') }}">
|
||||
<label for="q" class="text-nowrap text-muted small">Lat, Lon</label>
|
||||
<input id="q" name="q" class="form-control form-control-sm" style="max-width:18rem" placeholder="e.g. 54.375, -2.999">
|
||||
<button type="submit" class="btn btn-sm btn-primary">Go</button>
|
||||
|
|
|
|||
|
|
@ -1,8 +1,19 @@
|
|||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import pytest_mock
|
||||
import requests
|
||||
import responses
|
||||
from geocode.wikidata import APIResponseError, QueryError, api_call, wdqs
|
||||
from geocode import headers
|
||||
from geocode.wikimedia_api_logging import WikimediaApiLogConfig
|
||||
from geocode.wikidata import (
|
||||
APIResponseError,
|
||||
QueryError,
|
||||
api_call,
|
||||
mediawiki_error_message,
|
||||
wdqs,
|
||||
)
|
||||
|
||||
max_tries = 5
|
||||
|
||||
|
|
@ -53,6 +64,72 @@ def test_api_call_retries_on_connection_error(
|
|||
assert mocked_sleep.call_count == max_tries - 1
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_api_call_uses_mediawiki_error_message(
|
||||
mocker: pytest_mock.plugin.MockerFixture,
|
||||
) -> None:
|
||||
"""Test MediaWiki API error messages are preserved."""
|
||||
mocker.patch("time.sleep", return_value=None)
|
||||
mocker.patch("geocode.mail.send_to_admin")
|
||||
|
||||
responses.add(
|
||||
responses.GET,
|
||||
"https://www.wikidata.org/w/api.php",
|
||||
json={"error": {"code": "ratelimited", "info": "Too many requests"}},
|
||||
status=429,
|
||||
headers={"Retry-After": "10"},
|
||||
)
|
||||
|
||||
with pytest.raises(APIResponseError) as exc_info:
|
||||
api_call({"action": "wbgetentities", "ids": "Q42"})
|
||||
|
||||
assert exc_info.value.detail == "Too many requests"
|
||||
assert str(exc_info.value) == "Wikidata API error (HTTP 429): Too many requests"
|
||||
|
||||
|
||||
def test_mediawiki_error_message_falls_back_to_response_text() -> None:
|
||||
"""Test plain-text MediaWiki API errors are preserved."""
|
||||
response = requests.Response()
|
||||
response.status_code = 429
|
||||
response.reason = "Too Many Requests"
|
||||
response._content = b"Please slow down"
|
||||
|
||||
assert mediawiki_error_message(response) == "Please slow down"
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_api_call_logs_wikimedia_request(
|
||||
mocker: pytest_mock.plugin.MockerFixture, tmp_path: Path
|
||||
) -> None:
|
||||
"""Test Wikimedia API requests are logged as JSONL metrics."""
|
||||
log_path = tmp_path / "wikimedia-api.jsonl"
|
||||
mocker.patch(
|
||||
"geocode.wikidata.wikimedia_log_config",
|
||||
WikimediaApiLogConfig(
|
||||
tool="geocode",
|
||||
log_path=log_path,
|
||||
user_agent=headers["User-Agent"],
|
||||
),
|
||||
)
|
||||
responses.add(
|
||||
responses.GET,
|
||||
"https://www.wikidata.org/w/api.php",
|
||||
json={"entities": {"Q42": {"id": "Q42"}}},
|
||||
status=200,
|
||||
)
|
||||
|
||||
api_call({"action": "wbgetentities", "ids": "Q42"})
|
||||
|
||||
record = json.loads(log_path.read_text().strip())
|
||||
assert record["tool"] == "geocode"
|
||||
assert record["method"] == "GET"
|
||||
assert record["api_host"] == "www.wikidata.org"
|
||||
assert record["path"] == "/w/api.php"
|
||||
assert record["action"] == "wbgetentities"
|
||||
assert record["status_code"] == 200
|
||||
assert record["user_agent"] == headers["User-Agent"]
|
||||
|
||||
|
||||
def test_wdqs_retry(mocker: pytest_mock.plugin.MockerFixture) -> None:
|
||||
"""Test retry for WDQS API calls."""
|
||||
# Patch 'time.sleep' to instantly return, effectively skipping the sleep
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue