From 4d32f62dfb0f182d73dfb757be18ea7e66425614 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Thu, 6 Jun 2024 14:43:44 +0100 Subject: [PATCH 01/13] Upgrade to OAuth 2 --- matcher/osm_oauth.py | 71 +++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/matcher/osm_oauth.py b/matcher/osm_oauth.py index 98c1f4d..09eccae 100644 --- a/matcher/osm_oauth.py +++ b/matcher/osm_oauth.py @@ -1,67 +1,82 @@ """OSM Authentication.""" +import json import typing from datetime import datetime from urllib.parse import urlencode +import flask import lxml.etree -from flask import current_app, g, session -from requests_oauthlib import OAuth1Session +import requests +from requests_oauthlib import OAuth2Session from . import user_agent_headers from .model import User osm_api_base = "https://api.openstreetmap.org/api/0.6" +scope = ["read_prefs", "write_api"] -def api_put_request(path, **kwargs): - user = g.user - assert user.is_authenticated - oauth = OAuth1Session( - current_app.config["CLIENT_KEY"], - client_secret=current_app.config["CLIENT_SECRET"], - resource_owner_key=user.osm_oauth_token, - resource_owner_secret=user.osm_oauth_token_secret, +def get_session() -> OAuth2Session: + """Get session.""" + token = flask.session.get("oauth_token") + if not token: + user = flask.g.user + assert user.is_authenticated + token = json.loads(user.osm_oauth_token) + flask.session["oauth_token"] = token + + callback = flask.url_for("oauth_callback", _external=True) + return OAuth2Session( + flask.current_app.config["CLIENT_KEY"], + redirect_uri=callback, + scope=scope, + token=token, ) + + +def api_put_request(path: str, **kwargs: typing.Any) -> requests.Response: + """Send OSM API PUT request.""" + oauth = get_session() + return oauth.request( "PUT", osm_api_base + path, headers=user_agent_headers(), **kwargs ) -def api_request(path, **params): - user = g.user - assert user.is_authenticated - app = current_app +def api_request(path: str, 
**params: typing.Any) -> requests.Response: + """Send OSM API request.""" url = osm_api_base + path if params: url += "?" + urlencode(params) - client_key = app.config["CLIENT_KEY"] - client_secret = app.config["CLIENT_SECRET"] - oauth = OAuth1Session( - client_key, - client_secret=client_secret, - resource_owner_key=user.osm_oauth_token, - resource_owner_secret=user.osm_oauth_token_secret, - ) + + oauth = get_session() return oauth.get(url, timeout=4) -def parse_iso_date(value): +def parse_iso_date(value: str) -> datetime: + """Parse ISO date.""" return datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ") -def parse_userinfo_call(xml): +def parse_userinfo_call(xml: bytes) -> dict[str, typing.Any]: + """Parse userinfo call.""" root = lxml.etree.fromstring(xml) user = root[0] img = user.find(".//img") - account_created = parse_iso_date(user.get("account_created")) + account_created_date = user.get("account_created") + assert account_created_date + account_created = parse_iso_date(account_created_date) assert user.tag == "user" + id_str = user.get("id") + assert id_str and isinstance(id_str, str) + return { "account_created": account_created, - "id": int(user.get("id")), + "id": int(id_str), "username": user.get("display_name"), "description": user.findtext(".//description"), "img": (img.get("href") if img is not None else None), @@ -70,10 +85,10 @@ def parse_userinfo_call(xml): def get_username() -> str | None: """Get username of current user.""" - if "user_id" not in session: + if "user_id" not in flask.session: return None # not authorized - user_id = session["user_id"] + user_id = flask.session["user_id"] user = User.query.get(user_id) return typing.cast(str, user.username) From 269c6bce54de3422c539d70e668da62469bf4425 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Thu, 6 Jun 2024 13:47:22 +0000 Subject: [PATCH 02/13] Need to use BigInteger for lastrevid --- matcher/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matcher/model.py 
b/matcher/model.py index 40d56f5..4f61118 100644 --- a/matcher/model.py +++ b/matcher/model.py @@ -107,7 +107,7 @@ class Item(Base): aliases = Column(postgresql.JSONB) sitelinks = Column(postgresql.JSONB) claims = Column(postgresql.JSONB, nullable=False) - lastrevid = Column(Integer, nullable=False, unique=True) + lastrevid = Column(BigInteger, nullable=False, unique=True) locations: Mapped[list["ItemLocation"]] = relationship( "ItemLocation", cascade="all, delete-orphan", backref="item" ) From 13ecf4526d1f85a00199c89ccbcaf3ccc50450fa Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Thu, 6 Jun 2024 13:48:18 +0000 Subject: [PATCH 03/13] Catch MediaWiki database timeout errors and retry --- update.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/update.py b/update.py index f3626a7..ee26a3b 100755 --- a/update.py +++ b/update.py @@ -6,6 +6,8 @@ import json import typing from time import sleep +import requests.exceptions + from matcher import model, wikidata, wikidata_api from matcher.database import init_db, session @@ -81,7 +83,14 @@ def handle_edit(change: Change) -> None: print(f"{ts}: no need to update {qid}") return - entity = wikidata_api.get_entity(qid) + for attempt in range(100): + try: + entity = wikidata_api.get_entity(qid) + except requests.exceptions.ConnectionError: + print("connection error, retrying.") + sleep(10) + else: + break entity_qid = entity.pop("id") if entity_qid != qid: print(f"{ts}: item {qid} replaced with redirect") @@ -123,6 +132,15 @@ def update_database() -> None: r = wikidata_api.get_recent_changes(rcstart=start, rccontinue=rccontinue) reply = r.json() + if ( + "error" in reply + and reply["error"]["code"] == "internal_api_error_DBQueryTimeoutError" + ): + print(reply) + sleep(10) + continue + if "query" not in reply: + print(reply) for change in reply["query"]["recentchanges"]: rctype = change["type"] timestamp = change["timestamp"] From a04106ce1fd3ffc18c17187a2f5f9291ade6eb68 Mon Sep 17 
00:00:00 2001 From: Edward Betts Date: Wed, 19 Jun 2024 13:59:52 +0100 Subject: [PATCH 04/13] Bug fix format_wikibase_time centuries --- matcher/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matcher/utils.py b/matcher/utils.py index dbe850f..55f3bd2 100644 --- a/matcher/utils.py +++ b/matcher/utils.py @@ -204,7 +204,7 @@ def format_wikibase_time(v: WikibaseTime) -> str | None: case 7: # century century = ((int(t[:5]) - 1) // 100) + 1 ordinal_num = num2words(abs(century), to="ordinal_num") - return f"{ordinal_num} {century}{' BC' if century < 0 else ''}" + return f"{ordinal_num} century{' BC' if century < 0 else ''}" case 6: # millennium millennium = ((int(t[:5]) - 1) // 1000) + 1 ordinal_num = num2words(abs(millennium), to="ordinal_num") From 4863eb59d5e26b17b656e699114bc749535d7f2f Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 19 Jun 2024 14:00:28 +0100 Subject: [PATCH 05/13] Improve tests Closes: #1 --- tests/test_utils.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2064264..c9f6c7f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,17 +1,30 @@ +"""Test matcher utils.""" + from matcher import utils -def test_format_wikibase_time_year(): +def test_format_wikibase_time_year() -> None: + """Test passing a year to format_wikibase_time.""" v = {"time": "+1950-00-00T00:00:00Z", "precision": 9} assert utils.format_wikibase_time(v) == "1950" -def test_format_wikibase_time_century(): +def test_format_wikibase_time_century() -> None: + """Test passing centuries to format_wikibase_time.""" v = {"time": "+0800-00-00T00:00:00Z", "precision": 7} assert utils.format_wikibase_time(v) == "8th century" v = {"time": "+1950-00-00T00:00:00Z", "precision": 7} assert utils.format_wikibase_time(v) == "20th century" + +def test_format_wikibase_time_decade() -> None: + """Test passing a decade to format_wikibase_time.""" + v = {"time": 
"+1910-00-00T00:00:00Z", "precision": 8} + assert utils.format_wikibase_time(v) == "1910s" + + +def test_format_wikibase_time_day() -> None: + """Test passing a full date to format_wikibase_time.""" v = {"time": "+1868-01-09T00:00:00Z", "precision": 11} assert utils.format_wikibase_time(v) == "9 January 1868" From 938f79ba19340434f92b284623edaef4da494ba6 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 19 Jun 2024 14:23:20 +0100 Subject: [PATCH 06/13] Bug fix refresh_item --- web_view.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/web_view.py b/web_view.py index 03ab0f3..4a2edac 100755 --- a/web_view.py +++ b/web_view.py @@ -676,7 +676,7 @@ def api_polygon(osm_type, osm_id): @app.route("/refresh/Q") def refresh_item(item_id: int) -> str: """Refresh the local mirror of a Wikidata item.""" - existing = model.Item.query.get(item_id) + item = model.Item.query.get(item_id) qid = f"Q{item_id}" entity = wikidata_api.get_entity(qid) @@ -686,9 +686,9 @@ def refresh_item(item_id: int) -> str: coords = wikidata.get_entity_coords(entity["claims"]) obj = {k: v for k, v in entity.items() if k in entity_keys} - if existing: + if item: for k, v in obj.items(): - setattr(model, k, v) + setattr(item, k, v) else: item = model.Item(item_id=item_id, **obj) database.session.add(item) From ba22e8e9df018dd3a8f5a01aa19f32f242ef7f62 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 19 Jun 2024 14:32:51 +0100 Subject: [PATCH 07/13] Bug fix get_item_street_addresses --- matcher/api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/matcher/api.py b/matcher/api.py index 9c1d431..057c503 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -951,6 +951,9 @@ def get_item_street_addresses(item: model.Item) -> list[str]: qualifiers = claim.get("qualifiers") if not qualifiers or "P670" not in qualifiers: continue + if "datavalue" not in qualifiers["P670"]: + print(f"datavalue missing in P670 for {item.qid}") + continue number = 
qualifiers["P670"][0]["datavalue"]["value"] street_item = get_item(claim["mainsnak"]["datavalue"]["value"]["numeric-id"]) From 468d1e49c73c4ebafe02a73ab86f92904625a1ca Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 19 Jun 2024 14:34:16 +0100 Subject: [PATCH 08/13] Better fix for get_item_street_addresses --- matcher/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matcher/api.py b/matcher/api.py index 057c503..8ebc159 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -951,7 +951,7 @@ def get_item_street_addresses(item: model.Item) -> list[str]: qualifiers = claim.get("qualifiers") if not qualifiers or "P670" not in qualifiers: continue - if "datavalue" not in qualifiers["P670"]: + if "datavalue" not in qualifiers["P670"][0]: print(f"datavalue missing in P670 for {item.qid}") continue number = qualifiers["P670"][0]["datavalue"]["value"] From 3f04c82ba9a61901dd84059f557db1e0007e84b5 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 19 Jun 2024 14:40:14 +0100 Subject: [PATCH 09/13] Improve get_item_street_addresses --- matcher/api.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/matcher/api.py b/matcher/api.py index 8ebc159..97e47f1 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -941,7 +941,11 @@ def get_item(item_id: int) -> model.Item | None: def get_item_street_addresses(item: model.Item) -> list[str]: """Hunt for street addresses for the given item.""" - street_address = [addr["text"] for addr in item.get_claim("P6375") if addr] + p6375 = item.get_claim("P6375") + assert isinstance(p6375, list) + street_address: list[str] = [ + typing.cast(str, addr["text"]) for addr in p6375 if addr + ] if street_address or "P669" not in item.claims: return street_address @@ -951,8 +955,8 @@ def get_item_street_addresses(item: model.Item) -> list[str]: qualifiers = claim.get("qualifiers") if not qualifiers or "P670" not in qualifiers: continue - if "datavalue" not in qualifiers["P670"][0]: - 
print(f"datavalue missing in P670 for {item.qid}") + if "datavalue" not in qualifiers["P670"][0]: # 'no value' for P670 + assert qualifiers["P670"][0]["snaktype"] == "novalue" continue number = qualifiers["P670"][0]["datavalue"]["value"] From 4d64ac212d300be018d2ac27e08870670756a523 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 19 Jun 2024 14:43:23 +0100 Subject: [PATCH 10/13] Bug fix for calling get_commons_image with 'null' --- web_view.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/web_view.py b/web_view.py index 4a2edac..9003fde 100755 --- a/web_view.py +++ b/web_view.py @@ -244,9 +244,12 @@ def identifier_index(): @app.route("/commons/") def get_commons_image(filename): + if filename == "null": + flask.abort(404) detail = commons.image_detail([filename], thumbheight=1200, thumbwidth=1200) - image = detail[filename] - return flask.redirect(image["thumburl"]) + if filename not in detail: + flask.abort(404) + return flask.redirect(detail[filename]["thumburl"]) @app.route("/identifier/") From 72063f2e2b72070cd3697eefc530e87bd70a483d Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 19 Jun 2024 14:54:24 +0100 Subject: [PATCH 11/13] Bug fix expand_street_name --- web_view.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/web_view.py b/web_view.py index 9003fde..16049b3 100755 --- a/web_view.py +++ b/web_view.py @@ -555,7 +555,8 @@ def api_get_item_tags(item_id): ) -def expand_street_name(from_names): +def expand_street_name(from_names: typing.Collection[str]) -> set[str]: + """Expand street name.""" ret = set(from_names) for name in from_names: if any(name.startswith(st) for st in ("St ", "St. 
")): @@ -563,12 +564,10 @@ def expand_street_name(from_names): ret.add("Saint" + name[first_space:]) if ", " in name: - for n in set(ret): - comma = n.find(", ") - ret.add(name[:comma]) + comma = name.find(", ") + ret.add(name[:comma]) elif "/" in name: - for n in set(ret): - ret.extend(part.strip() for part in n.split("/")) + ret.update(part.strip() for part in name.split("/")) ret.update({"The " + name for name in ret if not name.startswith("The ")}) return ret From fa02c59ae784056dc6189e843fc4bfe565a73dc8 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn Date: Sat, 4 May 2024 22:03:23 +0200 Subject: [PATCH 12/13] update gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 03f1c6e..a5e1081 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,8 @@ venv __pycache__ .mypy_cache/ config/default.py +node_modules/ +.vscode/ +config.default +package-lock.json +config/ From 78296ce189ff1bb78ddada2d9faf61f443932c3e Mon Sep 17 00:00:00 2001 From: Dennis Priskorn Date: Sat, 4 May 2024 22:03:47 +0200 Subject: [PATCH 13/13] WIP setup instructions --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..e54b665 --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +# OWL-MAP + +## Setup +* clone the project +* install geoip and postgres from your local package manager +* set up a venv +* enter the venv +* run 'pip install -r requirements.txt' +* cp config/examply.py config/default.py +* set up /var/lib/data/GeoLite2/GeoLite2-City.mmdb somehow +* run 'python web_view.py'