Add code for downloading Wikidata items

This commit is contained in:
Edward Betts 2021-05-08 11:05:00 +02:00
parent 61ecfdef8b
commit 8fea2a8aa7
2 changed files with 118 additions and 0 deletions

52
matcher/wikidata.py Normal file
View file

@ -0,0 +1,52 @@
# Wikidata property IDs used to locate coordinates inside an item's claims.
# P159 is "headquarters location": read_hq_coords() looks at the qualifiers
# of its statements rather than at the statement value itself.
hq_pid = "P159"
# P625 is "coordinate location": read both as a main statement on the item
# and as a qualifier on headquarters-location statements.
coords_pid = "P625"
def read_coords(snak):
    """Extract latitude/longitude from a coordinate snak.

    Returns a dict with the keys ``latitude`` and ``longitude``, or ``None``
    when the snak has no ``datavalue``/``value`` (for example an unknown or
    no-value snak) or when the globe is anything other than Q2 (Earth).
    """
    try:
        value = snak["datavalue"]["value"]
    except KeyError:
        return None

    # The globe is given as an entity URI; only its trailing ID matters.
    globe_id = value["globe"].rsplit("/", 1)[-1]
    if globe_id == "Q2":
        return {"latitude": value["latitude"], "longitude": value["longitude"]}
    return None
def read_hq_coords(claims):
    """Collect coordinates attached as qualifiers to headquarters statements.

    ``claims`` is the claims mapping of an entity. Each statement under
    ``hq_pid`` is inspected for ``coords_pid`` qualifiers; every qualifier
    snak that ``read_coords`` accepts contributes one entry.

    Returns a list of latitude/longitude dicts, empty when nothing is found.
    """
    if hq_pid not in claims:
        return []

    results = []
    for statement in claims[hq_pid]:
        # Statements without a coordinate qualifier carry nothing for us.
        if "qualifiers" in statement and coords_pid in statement["qualifiers"]:
            qualifier_snaks = statement["qualifiers"][coords_pid]
            results.extend(
                coords for coords in map(read_coords, qualifier_snaks) if coords
            )
    return results
def read_location_statement(claims, pid):
    """Collect coordinates from the main snaks of the statements for ``pid``.

    ``claims`` is the claims mapping of an entity and ``pid`` the property to
    read. Statements whose main snak yields nothing from ``read_coords`` are
    skipped.

    Returns a list of latitude/longitude dicts, empty when ``pid`` is absent.
    """
    if pid not in claims:
        return []

    parsed = (read_coords(statement["mainsnak"]) for statement in claims[pid])
    return [coords for coords in parsed if coords]
def get_entity_coords(claims):
    """Gather every coordinate found in an entity's claims, keyed by property.

    ``claims`` must be the claims mapping itself, not the enclosing entity.
    The result maps ``coords_pid`` to coordinates stated directly on the item
    and ``hq_pid`` to coordinates found on headquarters qualifiers; a property
    with no coordinates is left out of the result entirely.
    """
    assert "claims" not in claims  # make sure we weren't passed entity by mistake

    coords_by_pid = {}

    direct = read_location_statement(claims, coords_pid)
    if direct:
        coords_by_pid[coords_pid] = direct

    headquarters = read_hq_coords(claims)
    if headquarters:
        coords_by_pid[hq_pid] = headquarters

    return coords_by_pid

66
matcher/wikidata_api.py Normal file
View file

@ -0,0 +1,66 @@
import requests
import json
# Endpoint of the Wikidata MediaWiki Action API; api_get() sends every request here.
wd_api_url = "https://www.wikidata.org/w/api.php"
def api_get(params, timeout=30):
    """Send a GET request to the Wikidata API and return the response.

    ``params`` is merged over the defaults ``format=json`` and
    ``formatversion=2``, so a caller may override either of them.

    ``timeout`` is the number of seconds handed to ``requests.get``. Without
    a timeout requests waits indefinitely on a stalled connection, so a
    finite default is applied; pass ``None`` to wait without limit.

    Returns the ``requests.Response`` object. The HTTP status is not checked
    here; callers decode the body themselves.
    """
    base_params = {
        "format": "json",
        "formatversion": 2,
    }
    return requests.get(
        wd_api_url,
        params={**base_params, **params},
        timeout=timeout,
    )
def get_revision_timestamp(revid):
    """Look up the timestamp of a single revision.

    ``revid`` may be an int or a numeric string. The API reply is checked to
    describe the requested revision before its ``timestamp`` field is
    returned unchanged.
    """
    response = api_get(
        {
            "action": "query",
            "prop": "revisions",
            "revids": revid,
            "rvprop": "ids|timestamp",
        }
    )
    pages = response.json()["query"]["pages"]
    revision = pages[0]["revisions"][0]
    # Sanity check: the reply must be about the revision we asked for.
    assert revision["revid"] == int(revid)
    return revision["timestamp"]
def get_recent_changes(**kwargs):
    """Query the recent-changes list for namespace 0, oldest first.

    Keyword arguments are added to the request as extra API parameters and
    may override the defaults set here. Arguments with a falsy value are
    ignored, so a caller can pass ``None`` for options it does not need.

    Returns the response object produced by ``api_get``.
    """
    wanted_props = (
        "title",
        "ids",
        "comment",
        "parsedcomment",
        "timestamp",
        "redirect",
        "loginfo",
    )
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcnamespace": 0,
        "rclimit": "max",
        "rcdir": "newer",
        "rcprop": "|".join(wanted_props),
    }
    # Only truthy overrides are applied; falsy ones leave the default alone.
    params.update((key, value) for key, value in kwargs.items() if value)
    return api_get(params)
def get_entity(qid):
    """Fetch one entity from Wikidata and return its JSON representation.

    When the reply has no ``entities`` key, the whole reply is printed as
    indented JSON to help debugging, and the lookup that follows raises
    ``KeyError``. A ``KeyError`` is also raised when ``qid`` itself is not
    among the returned entities.
    """
    response = api_get({"action": "wbgetentities", "ids": qid})
    reply = response.json()
    if "entities" not in reply:
        # Dump the unexpected reply before the lookup below fails.
        print(json.dumps(reply, indent=2))
    entities = reply["entities"]
    return entities[qid]
def get_entities(ids):
    """Fetch several entities in one request and yield ``(qid, entity)`` pairs.

    ``ids`` is an iterable of entity ID strings, joined with ``|`` for the
    API. This is a generator, so the request is only sent once iteration
    starts.

    NOTE(review): the API caps how many ids one request may carry - confirm
    that callers batch their ids accordingly.
    """
    response = api_get({"action": "wbgetentities", "ids": "|".join(ids)})
    yield from response.json()["entities"].items()