owl-map/matcher/wikidata.py

127 lines
3.6 KiB
Python

from collections import defaultdict
import re
# Wikidata property IDs used when looking up an entity's coordinates.
hq_pid = "P159"  # headquarters location; coordinates are read from its P625 qualifiers
coords_pid = "P625"  # coordinate location
def read_coords(snak):
    """Pull a latitude/longitude pair out of a single snak.

    Returns a dict with the keys "latitude" and "longitude", or None when
    the snak carries no datavalue/value or the value's globe is not Q2.
    """
    try:
        value = snak["datavalue"]["value"]
    except KeyError:
        # The snak has no usable value.
        return None

    # The globe is compared by the last path segment of its URI.
    globe_id = value["globe"].rsplit("/", 1)[-1]
    if globe_id != "Q2":
        return None

    return {
        "latitude": value["latitude"],
        "longitude": value["longitude"],
    }
def read_hq_coords(claims):
    """Collect coordinates attached as qualifiers to headquarters claims.

    Looks at every ``hq_pid`` statement in *claims*, and for each one that
    has ``coords_pid`` qualifiers, parses those qualifier snaks with
    ``read_coords``. Returns the list of successfully parsed coordinate
    dicts (empty when there are none).
    """
    if hq_pid not in claims:
        return []

    # Every coordinate qualifier snak across all headquarters statements.
    qualifier_snaks = (
        snak
        for statement in claims[hq_pid]
        if "qualifiers" in statement and coords_pid in statement["qualifiers"]
        for snak in statement["qualifiers"][coords_pid]
    )
    parsed = (read_coords(snak) for snak in qualifier_snaks)
    # Drop snaks that read_coords rejected.
    return [coords for coords in parsed if coords]
def read_location_statement(claims, pid):
    """Return the coordinates found in the main snaks of *pid* statements.

    Each statement under ``claims[pid]`` has its "mainsnak" parsed with
    ``read_coords``; statements that yield no coordinates are left out.
    Returns an empty list when *pid* is not present in *claims*.
    """
    if pid not in claims:
        return []

    parsed = (read_coords(statement["mainsnak"]) for statement in claims[pid])
    return [coords for coords in parsed if coords]
def get_entity_coords(claims):
    """Gather every coordinate available in an entity's claims.

    Returns a dict keyed by property ID: ``coords_pid`` maps to coordinates
    from the entity's own location statements, ``hq_pid`` to coordinates
    taken from headquarters qualifiers. A key is only present when at least
    one coordinate was found for it.
    """
    assert "claims" not in claims  # make sure we weren't passed entity by mistake

    found = {}

    direct = read_location_statement(claims, coords_pid)
    if direct:
        found[coords_pid] = direct

    via_hq = read_hq_coords(claims)
    if via_hq:
        found[hq_pid] = via_hq

    return found
# Trailing file extension of an image file name, e.g. ".jpg" or ".jpeg".
_IMAGE_EXT_RE = re.compile(r'\.[a-z]{3,4}$')

# Trailing noise on image file names, tried in order; only the first pattern
# that matches is removed.
_IMAGE_SUFFIX_RES = (
    re.compile(r' - geograph\.org\.uk - \d+$'),
    re.compile(r'[, -]*0\d{2,}$'),
)


def _mainsnak_values(entity, pid):
    """Yield the main-snak datavalue of every *pid* statement that has one."""
    for statement in entity.get('claims', {}).get(pid, []):
        mainsnak = statement['mainsnak']
        if 'datavalue' in mainsnak:
            yield mainsnak['datavalue']['value']


def _name_from_image_filename(filename):
    """Strip the extension and known trailing noise from an image file name."""
    m = _IMAGE_EXT_RE.search(filename)
    if m:
        filename = filename[:m.start()]
    for pattern in _IMAGE_SUFFIX_RES:
        m = pattern.search(filename)
        if m:
            return filename[:m.start()]
    return filename


def names_from_entity(entity, skip_lang=None):
    """Collect the candidate names of a Wikidata entity and where each came from.

    Args:
        entity: Entity dict. The 'labels', 'sitelinks', 'aliases' and
            'claims' sections are all optional; a missing section simply
            contributes no names.
        skip_lang: Optional collection of language codes to ignore. Labels,
            aliases and the matching ``<lang>wiki`` sitelinks in these
            languages are left out.

    Returns:
        A ``defaultdict(list)`` mapping each name to a list of
        ``(source, detail)`` tuples. ``source`` is one of 'label', 'sitelink',
        'alias', 'commonscat', 'officialname', 'nativelabel' or 'image'.
        ``detail`` is the language code (labels, aliases, official names,
        native labels), the sitelink key (sitelinks) or None (commonscat,
        image).
    """
    if skip_lang is None:
        skip_lang = set()

    names = defaultdict(list)
    cat_start = 'Category:'

    for lang, label in entity.get('labels', {}).items():
        if lang in skip_lang:
            continue
        names[label['value']].append(('label', lang))

    # Sitelinks are keyed by site id ("enwiki"), not by language code, so the
    # skip list has to be translated into site ids before it can be compared.
    skip_sites = {lang + 'wiki' for lang in skip_lang}
    for site, link in entity.get('sitelinks', {}).items():
        if site in skip_sites:
            continue
        title = link['title']
        if title.startswith(cat_start):
            title = title[len(cat_start):]
        if not title:
            # Nothing left to use as a name (empty or bare "Category:" title).
            continue
        first_letter = title[0]
        if first_letter.isupper():
            # Page titles are capitalised; when the same name is already
            # known with a lower-case first letter, file the sitelink under
            # that spelling instead of creating a near-duplicate entry.
            lc_first_title = first_letter.lower() + title[1:]
            if lc_first_title in names:
                title = lc_first_title
        names[title].append(('sitelink', site))

    for lang, value_list in entity.get('aliases', {}).items():
        # Languages with more than three aliases are ignored entirely.
        if lang in skip_lang or len(value_list) > 3:
            continue
        for alias in value_list:
            names[alias['value']].append(('alias', lang))

    # P373: Commons category (plain string value).
    for value in _mainsnak_values(entity, 'P373'):
        names[value].append(('commonscat', None))

    # P1448 (official name) and P1705 (native label) are monolingual text
    # values carrying their own language code.
    for pid, source in (('P1448', 'officialname'), ('P1705', 'nativelabel')):
        for value in _mainsnak_values(entity, pid):
            names[value['text']].append((source, value['language']))

    # P18: image file name, reduced to its descriptive part.
    for value in _mainsnak_values(entity, 'P18'):
        names[_name_from_image_filename(value)].append(('image', None))

    return names