"""Wikidata helpers: read coordinates out of entity claims and call the API.

This text was recovered from a ``git format-patch`` export whose line breaks
had been lost, leaving diff markers and code fused onto two lines.  Patch
metadata, kept for provenance:

    commit:  8fea2a8aa7eb48e7bcb7a28c834a7e2ee518338a
    author:  Edward Betts
    date:    Sat, 8 May 2021 11:05:00 +0200
    subject: Add code for downloading Wikidata items

The patch added two new files, ``matcher/wikidata.py`` and
``matcher/wikidata_api.py``.  Their contents follow in that order, each under
a banner comment naming the file it belongs to.
"""

import itertools
import json

# ---------------------------------------------------------------------------
# matcher/wikidata.py
# ---------------------------------------------------------------------------

# Property whose statements carry coordinates as *qualifiers* (see
# read_hq_coords).
hq_pid = "P159"
# Property whose statements carry coordinates as their main value, and which
# is also the qualifier property looked up under hq_pid statements.
coords_pid = "P625"


def read_coords(snak):
    """Return the latitude/longitude stored in a coordinate snak.

    Args:
        snak: a snak dict; the coordinate is read from
            ``snak["datavalue"]["value"]``.

    Returns:
        ``{"latitude": ..., "longitude": ...}``, or ``None`` when the snak has
        no ``datavalue``/``value``, the value has no ``globe``, or the globe
        is not ``Q2``.
    """
    try:
        value = snak["datavalue"]["value"]
        globe = value["globe"]
    except KeyError:
        # Snaks without a concrete value (or without a globe) have no usable
        # coordinates.
        return None

    # Only the last path segment of the globe URI is compared.
    # NOTE(review): Q2 is presumably the item for Earth - confirm.
    if globe.rpartition("/")[2] != "Q2":
        return None

    return {key: value[key] for key in ("latitude", "longitude")}


def read_hq_coords(claims):
    """Collect coordinates attached as qualifiers to ``hq_pid`` statements.

    Args:
        claims: the claims mapping of an entity (property id -> statements).

    Returns:
        A list of ``read_coords`` results, in statement order; empty when
        there are no ``hq_pid`` statements or none has a usable
        ``coords_pid`` qualifier.
    """
    found = []
    for hq_claim in claims.get(hq_pid, []):
        qualifier_snaks = hq_claim.get("qualifiers", {}).get(coords_pid, [])
        candidates = (read_coords(snak) for snak in qualifier_snaks)
        found.extend(coords for coords in candidates if coords)
    return found


def read_location_statement(claims, pid):
    """Collect coordinates from the main snaks of the ``pid`` statements.

    Args:
        claims: the claims mapping of an entity (property id -> statements).
        pid: property id whose statements should be read.

    Returns:
        A list of ``read_coords`` results, in statement order; empty when
        ``pid`` is absent or no statement has usable coordinates.
    """
    candidates = (
        read_coords(statement["mainsnak"]) for statement in claims.get(pid, [])
    )
    return [coords for coords in candidates if coords]


def get_entity_coords(claims):
    """Return every coordinate found for an entity, grouped by property id.

    Args:
        claims: the claims mapping of an entity, *not* the entity itself.

    Returns:
        A dict with up to two keys, ``coords_pid`` and ``hq_pid``, each
        mapping to a non-empty list of coordinate dicts.  Properties with no
        coordinates are left out.

    Raises:
        AssertionError: if ``claims`` contains a ``"claims"`` key, i.e. a
            whole entity was passed by mistake.
    """
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``.
    if "claims" in claims:
        raise AssertionError("expected the claims mapping, got a whole entity")

    by_pid = {
        coords_pid: read_location_statement(claims, coords_pid),
        hq_pid: read_hq_coords(claims),
    }
    return {pid: values for pid, values in by_pid.items() if values}


# ---------------------------------------------------------------------------
# matcher/wikidata_api.py
# ---------------------------------------------------------------------------

wd_api_url = "https://www.wikidata.org/w/api.php"

# Seconds to wait for the API before giving up.  Without a timeout
# ``requests`` waits indefinitely.
default_timeout = 60

# ``wbgetentities`` accepts at most this many ids per request for clients
# without higher API limits.
max_ids_per_request = 50


def api_get(params, *, timeout=default_timeout):
    """Send a GET request to the Wikidata API and return the response.

    Args:
        params: query parameters for the call; they override the defaults
            ``format=json`` and ``formatversion=2``.
        timeout: seconds to wait for the server, passed to ``requests.get``.

    Returns:
        The ``requests`` response object (not the decoded JSON).
    """
    # Third-party dependency, imported here so that the claim-reading helpers
    # above stay importable without it.  Move back to module level if the two
    # files are split apart again.
    import requests

    base_params = {
        "format": "json",
        "formatversion": 2,
    }
    return requests.get(
        wd_api_url,
        params={**base_params, **params},
        timeout=timeout,
    )


def get_revision_timestamp(revid):
    """Return the timestamp the API reports for revision ``revid``.

    Args:
        revid: revision id, as an int or a string of digits.

    Raises:
        AssertionError: if the API answers with a different revision id.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "revids": revid,
        "rvprop": "ids|timestamp",
    }
    reply = api_get(params)
    rev = reply.json()["query"]["pages"][0]["revisions"][0]
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``.
    if rev["revid"] != int(revid):
        raise AssertionError(
            f"asked for revision {revid}, API returned {rev['revid']}"
        )
    return rev["timestamp"]


def get_recent_changes(**kwargs):
    """Query ``list=recentchanges`` for namespace 0 and return the response.

    Args:
        **kwargs: extra API parameters merged over the defaults below.
            Entries with a falsy value are dropped.  Parameters not set here
            (for example ``rcstart`` or ``rctype``) can be supplied this way.

    Returns:
        The ``requests`` response object from ``api_get``.
    """
    props = [
        "title",
        "ids",
        "comment",
        "parsedcomment",
        "timestamp",
        "redirect",
        "loginfo",
    ]

    params = {
        "action": "query",
        "list": "recentchanges",
        "rcnamespace": 0,
        "rclimit": "max",
        "rcdir": "newer",
        "rcprop": "|".join(props),
        **{k: v for k, v in kwargs.items() if v},
    }

    return api_get(params)


def get_entity(qid):
    """Fetch a single entity with ``wbgetentities`` and return its dict.

    Args:
        qid: id of the entity to fetch.

    Raises:
        KeyError: if the reply has no ``"entities"`` key, or no entry for
            ``qid``.  In the first case the reply is printed beforehand.
    """
    data = api_get({"action": "wbgetentities", "ids": qid}).json()
    if "entities" not in data:
        # Show the unexpected payload before the lookup below raises.
        print(json.dumps(data, indent=2))
    return data["entities"][qid]


def get_entities(ids):
    """Yield ``(qid, entity)`` pairs for every id in ``ids``.

    The ids are requested in batches of ``max_ids_per_request`` so that long
    id lists do not exceed the API's per-request limit.  An empty ``ids``
    yields nothing and makes no request.

    Args:
        ids: iterable of entity id strings.

    Raises:
        KeyError: if a reply has no ``"entities"`` key.
    """
    remaining = iter(ids)
    while True:
        batch = list(itertools.islice(remaining, max_ids_per_request))
        if not batch:
            return
        reply = api_get({"action": "wbgetentities", "ids": "|".join(batch)})
        yield from reply.json()["entities"].items()