From 5ac6260d602960df3de44ce7788c3d55843e6b29 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 25 Sep 2023 18:38:02 +0100 Subject: [PATCH] Improvements --- confarchive/wikidata.py | 55 ++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/confarchive/wikidata.py b/confarchive/wikidata.py index 19fd78f..82342c5 100644 --- a/confarchive/wikidata.py +++ b/confarchive/wikidata.py @@ -8,12 +8,14 @@ import time import typing import requests +import flask commons_url = "https://www.wikidata.org/w/api.php" wikidata_api = "https://www.wikidata.org/w/api.php" user_agent = "conference-archive/0.1 (contact: edward@4angle.com)" CallParams = dict[str, str | int] +SearchHit = dict[str, typing.Any] s = requests.Session() s.headers.update({"User-Agent": user_agent}) @@ -24,29 +26,39 @@ def md5sum(s: str) -> str: return hashlib.md5(s.encode("utf-8")).hexdigest() -def search(q: str) -> list[dict[str, typing.Any]]: - """Search Wikidata with caching.""" +def get_cache_filename(q: str) -> str: + """Cache filename for query.""" q_md5 = md5sum(q) + data_dir = flask.current_app.config["DATA_DIR"] + return os.path.join(data_dir, "cache", q_md5 + ".json") - cache_filename = os.path.join("cache", q_md5 + ".json") + +def get_item_filename(qid: str) -> str: + data_dir = flask.current_app.config["DATA_DIR"] + return os.path.join(data_dir, "item", qid + ".json") + + +def search(q: str) -> list[SearchHit]: + """Search Wikidata with caching.""" + cache_filename = get_cache_filename(q) if os.path.exists(cache_filename): data = json.load(open(cache_filename)) - else: - params: dict[str, str | int] = { - "action": "query", - "list": "search", - "format": "json", - "formatversion": 2, - "srsearch": q, - "srlimit": "10", - } - r = requests.get(wikidata_api, params=params) - open(cache_filename, "w").write(r.text) - data = r.json() - time.sleep(1) + return typing.cast(list[SearchHit], data["query"]["search"]) - return typing.cast(list[dict[str, typing.Any]], data["query"]["search"]) + params: dict[str, str | int] = { + "action": "query", + "list": "search", + "format": "json", + "formatversion": 2, + "srsearch": q, + "srlimit": "10", + } + r = requests.get(wikidata_api, params=params) + open(cache_filename, "w").write(r.text) + data = r.json() + + return typing.cast(list[SearchHit], data["query"]["search"]) def api_image_detail_call(filename: str) -> requests.Response: @@ -65,9 +77,9 @@ def api_image_detail_call(filename: str) -> requests.Response: def get_item(qid: str) -> typing.Any: """Get an item from Wikidata.""" - cache_filename = os.path.join("items", qid + ".json") - if os.path.exists(cache_filename): - item = json.load(open(cache_filename)) + item_filename = get_item_filename(qid) + if os.path.exists(item_filename): + item = json.load(open(item_filename)) else: params: dict[str, str | int] = { "action": "wbgetentities", @@ -77,9 +89,8 @@ def get_item(qid: str) -> typing.Any: } r = s.get(wikidata_api, params=params) item = r.json()["entities"][qid] - with open(cache_filename, "w") as f: + with open(item_filename, "w") as f: json.dump(item, f, indent=2) - time.sleep(0.1) return item