"""Process Wikidata items to extract coordinates and names."""
import re
import typing
from collections import defaultdict
from typing import Any, cast
from . import wikidata_api
hq_pid = "P159"
coords_pid = "P625"
2023-05-14 21:20:41 +01:00
Claims = dict[str, list[dict[str, Any]]]
class Coords(typing.TypedDict):
"""Coordinates."""
latitude: float
longitude: float
2023-05-14 21:20:41 +01:00
def read_coords(snak: dict[str, Any]) -> Coords | None:
"""Read coordinates from snak."""
try:
v = snak["datavalue"]["value"]
except KeyError:
2023-05-14 21:20:41 +01:00
return None
if v["globe"].rpartition("/")[2] != "Q2":
2023-05-14 21:20:41 +01:00
return None
2023-05-14 21:20:41 +01:00
return cast(Coords, {k: v[k] for k in ("latitude", "longitude")})
2023-05-14 21:20:41 +01:00
def read_hq_coords(claims: Claims) -> list[Coords]:
    """Coordinates of item headquarters.

    Headquarters statements (P159) carry their coordinates as P625
    qualifiers rather than as mainsnak values.
    """
    return [
        coords
        for hq_claim in claims.get(hq_pid, [])
        for qualifier_snak in hq_claim.get("qualifiers", {}).get(coords_pid, [])
        if (coords := read_coords(qualifier_snak))
    ]
def read_location_statement(claims: Claims, pid: str) -> list[Coords]:
    """Get coordinates from given claim."""
    found: list[Coords] = []
    for statement in claims.get(pid, []):
        coords = read_coords(statement["mainsnak"])
        if coords:
            found.append(coords)
    return found
def get_entity_coords(claims: Claims) -> dict[str, Any]:
    """Read entity coordinate locations from claims dict.

    Returns a mapping of property ID (P625 and/or P159) to the list of
    coordinates found; properties that yielded nothing are omitted.
    """
    # A whole entity dict would contain a "claims" key; the claims dict
    # itself never does — guard against being passed the wrong object.
    assert "claims" not in claims
    candidates = {
        coords_pid: read_location_statement(claims, coords_pid),
        hq_pid: read_hq_coords(claims),
    }
    # Keep only the properties that actually produced coordinates.
    return {pid: found for pid, found in candidates.items() if found}
def names_from_entity(
entity: wikidata_api.EntityType, skip_lang: set[str] | None = None
) -> dict[str, Any]:
"""Find all sources of names from Item."""
2021-11-14 08:01:19 +00:00
if skip_lang is None:
skip_lang = set()
ret = defaultdict(list)
2023-05-14 21:20:41 +01:00
cat_start = "Category:"
2021-11-14 08:01:19 +00:00
2023-05-14 21:20:41 +01:00
for k, v in entity["labels"].items():
2021-11-14 08:01:19 +00:00
if k in skip_lang:
continue
2023-05-14 21:20:41 +01:00
ret[v["value"]].append(("label", k))
2021-11-14 08:01:19 +00:00
2023-05-14 21:20:41 +01:00
for k, v in entity["sitelinks"].items():
if k + "wiki" in skip_lang:
2021-11-14 08:01:19 +00:00
continue
2023-05-14 21:20:41 +01:00
title = v["title"]
2021-11-14 08:01:19 +00:00
if title.startswith(cat_start):
2023-05-14 21:20:41 +01:00
title = title[len(cat_start) :]
2021-11-14 08:01:19 +00:00
first_letter = title[0]
if first_letter.isupper():
lc_first_title = first_letter.lower() + title[1:]
if lc_first_title in ret:
title = lc_first_title
2023-05-14 21:20:41 +01:00
ret[title].append(("sitelink", k))
2021-11-14 08:01:19 +00:00
2023-05-14 21:20:41 +01:00
for lang, value_list in entity.get("aliases", {}).items():
2021-11-14 08:01:19 +00:00
if lang in skip_lang or len(value_list) > 3:
continue
for name in value_list:
2023-05-14 21:20:41 +01:00
ret[name["value"]].append(("alias", lang))
2021-11-14 08:01:19 +00:00
2023-05-14 21:20:41 +01:00
commonscats = entity.get("claims", {}).get("P373", [])
2021-11-14 08:01:19 +00:00
for i in commonscats:
2023-05-14 21:20:41 +01:00
if "datavalue" not in i["mainsnak"]:
2021-11-14 08:01:19 +00:00
continue
2023-05-14 21:20:41 +01:00
value = i["mainsnak"]["datavalue"]["value"]
ret[value].append(("commonscat", None))
2021-11-14 08:01:19 +00:00
2023-05-14 21:20:41 +01:00
officialname = entity.get("claims", {}).get("P1448", [])
2021-11-14 08:01:19 +00:00
for i in officialname:
2023-05-14 21:20:41 +01:00
if "datavalue" not in i["mainsnak"]:
2021-11-14 08:01:19 +00:00
continue
2023-05-14 21:20:41 +01:00
value = i["mainsnak"]["datavalue"]["value"]
ret[value["text"]].append(("officialname", value["language"]))
2021-11-14 08:01:19 +00:00
2023-05-14 21:20:41 +01:00
nativelabel = entity.get("claims", {}).get("P1705", [])
2021-11-14 08:01:19 +00:00
for i in nativelabel:
2023-05-14 21:20:41 +01:00
if "datavalue" not in i["mainsnak"]:
2021-11-14 08:01:19 +00:00
continue
2023-05-14 21:20:41 +01:00
value = i["mainsnak"]["datavalue"]["value"]
ret[value["text"]].append(("nativelabel", value["language"]))
2021-11-14 08:01:19 +00:00
2023-05-14 21:20:41 +01:00
image = entity.get("claims", {}).get("P18", [])
2021-11-14 08:01:19 +00:00
for i in image:
2023-05-14 21:20:41 +01:00
if "datavalue" not in i["mainsnak"]:
2021-11-14 08:01:19 +00:00
continue
2023-05-14 21:20:41 +01:00
value = i["mainsnak"]["datavalue"]["value"]
m = re.search(r"\.[a-z]{3,4}$", value)
2021-11-14 08:01:19 +00:00
if m:
2023-05-14 21:20:41 +01:00
value = value[: m.start()]
for pattern in r" - geograph\.org\.uk - \d+$", r"[, -]*0\d{2,}$":
2021-11-14 08:01:19 +00:00
m = re.search(pattern, value)
if m:
2023-05-14 21:20:41 +01:00
value = value[: m.start()]
2021-11-14 08:01:19 +00:00
break
2023-05-14 21:20:41 +01:00
ret[value].append(("image", None))
2021-11-14 08:01:19 +00:00
return ret