owl-map/matcher/api.py

1187 lines
35 KiB
Python
Raw Normal View History

2023-05-14 22:04:26 +01:00
import collections
2023-05-13 20:57:58 +01:00
import json
import os.path
import re
2023-05-14 21:40:16 +01:00
import typing
2023-05-13 20:57:58 +01:00
2023-05-14 21:42:08 +01:00
import flask
2023-05-15 16:30:06 +01:00
import geoalchemy2
2023-05-17 17:28:44 +01:00
import sqlalchemy
from sqlalchemy import and_, or_
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import Mapped
from sqlalchemy.sql import select
2023-05-13 20:57:58 +01:00
from matcher import database, model, wikidata, wikidata_api
from matcher.planet import line, point, polygon
2021-06-25 13:52:42 +01:00
2023-05-14 22:04:26 +01:00
# OSM tags for one object: tag key -> tag value.
TagsType = dict[str, str]

# SRID handed to the PostGIS point/envelope constructors in this module.
srid = 4326

# Matches the text form of a PostGIS point and captures its two coordinates.
re_point = re.compile(r"^POINT\((.+) (.+)\)$")

# Keys of a Wikidata entity that are kept when an item is stored.
entity_keys = {
    "labels",
    "sitelinks",
    "aliases",
    "claims",
    "descriptions",
    "lastrevid",
}

# Prefixes that may precede an OSM key, as in "<prefix>:<key>".
tag_prefixes = {
    "disused",
    "was",
    "abandoned",
    "demolished",
    "destroyed",
    "ruins",
    "historic",
}

# these tags are too generic, so we ignore them
skip_tags = {
    "Key:addr",
    "Key:addr:street",
    "Key:lit",
    "Key:image",
    "Key:name",
    "Key:symbol",
    "Key:brand",
}
2023-05-13 20:57:58 +01:00
2023-05-14 19:17:16 +01:00
def get_country_iso3166_1(lat: float, lon: float) -> set[str]:
    """Return the set of ISO 3166-1 codes for the countries covering lat/lon.

    The codes are read from the "ISO3166-1" tag of every admin_level 2
    polygon that covers the point; polygons without that tag are ignored.
    The resulting set is also stored on flask.g as ``alpha2_codes``.
    Normally there should be only one country.
    """
    location = sqlalchemy.func.ST_SetSRID(
        sqlalchemy.func.ST_MakePoint(lon, lat), srid
    )
    countries = model.Polygon.query.filter(
        sqlalchemy.func.ST_Covers(model.Polygon.way, location),
        model.Polygon.admin_level == "2",
    )
    tag_values = (country.tags.get("ISO3166-1") for country in countries)
    alpha2_codes: set[str] = {code for code in tag_values if code}

    flask.g.alpha2_codes = alpha2_codes
    return alpha2_codes
2023-05-13 20:57:58 +01:00
def is_street_number_first(lat: float, lon: float) -> bool:
    """Tell whether lat/lon is in a country that writes the street number first.

    Returns True when either coordinate is None. Otherwise the country codes
    for the location are looked up and compared against a fixed list.
    """
    # Incomplete list of countries that put street number first.
    number_first_countries = {
        "GB",  # United Kingdom
        "IE",  # Ireland
        "US",  # United States
        "MX",  # Mexico
        "CA",  # Canada
        "FR",  # France
        "AU",  # Australia
        "NZ",  # New Zealand
        "ZA",  # South Africa
    }

    if lat is None or lon is None:
        return True

    country_codes = get_country_iso3166_1(lat, lon)
    return not number_first_countries.isdisjoint(country_codes)
2023-05-15 16:30:06 +01:00
def make_envelope(bounds: list[float]) -> geoalchemy2.functions.ST_MakeEnvelope:
    """Make an envelope for the given bounds.

    The values of bounds are passed to ST_MakeEnvelope in the order given,
    followed by the module-level SRID.
    """
    envelope_args = (*bounds, srid)
    return sqlalchemy.func.ST_MakeEnvelope(*envelope_args)
2021-06-25 13:52:42 +01:00
2023-05-13 20:57:58 +01:00
def parse_point(point: str) -> tuple[str, str]:
    """Split the text form of a PostGIS point into (lon, lat) strings.

    The coordinates are returned as strings, in the order they appear in
    the text. An AssertionError is raised if the text is not a point.
    """
    matched = re_point.match(point)
    assert matched
    lon = matched.group(1)
    lat = matched.group(2)
    assert lon and lat
    return (lon, lat)
2023-05-15 16:30:06 +01:00
def get_bbox_centroid(bbox: list[float]) -> tuple[str, str]:
    """Get the centroid of a bounding box as a (lat, lon) pair of strings."""
    envelope = make_envelope(bbox)
    centroid_text = sqlalchemy.func.ST_AsText(sqlalchemy.func.ST_Centroid(envelope))
    centroid = database.session.query(centroid_text).scalar()

    # parse_point gives (lon, lat); callers of this function get (lat, lon).
    lon, lat = parse_point(centroid)
    return (lat, lon)
2021-06-25 13:52:42 +01:00
2023-05-13 20:57:58 +01:00
2023-05-15 16:30:06 +01:00
def make_envelope_around_point(
    lat: float, lon: float, distance: float
) -> geoalchemy2.functions.ST_MakeEnvelope:
    """Make an envelope around a point, the distance parameter specifies the size.

    The point is projected by ``distance`` along bearings of 0, 90, 180 and
    270 degrees in a single query; the four projected points supply the
    north, east, south and west edges of the envelope.
    """
    origin = sqlalchemy.sql.expression.cast(
        sqlalchemy.func.ST_MakePoint(lon, lat), geoalchemy2.Geography
    )

    projections = []
    for bearing in (0, 90, 180, 270):
        projected = sqlalchemy.func.ST_Project(
            origin, distance, sqlalchemy.func.radians(bearing)
        )
        projections.append(sqlalchemy.func.ST_AsText(projected))

    row = database.session.connection().execute(select(*projections)).fetchone()
    # Each parsed point is (lon, lat), in the same order as the bearings.
    north_pt, east_pt, south_pt, west_pt = [parse_point(text) for text in row]

    return sqlalchemy.func.ST_MakeEnvelope(
        float(west_pt[0]),
        float(south_pt[1]),
        float(east_pt[0]),
        float(north_pt[1]),
        srid,
    )
2023-05-13 20:57:58 +01:00
2023-05-14 22:04:26 +01:00
def drop_way_area(tags: TagsType) -> TagsType:
    """Delete the way_area entry from tags, if any, and return the same dict."""
    tags.pop("way_area", None)
    return tags
2021-06-25 13:52:42 +01:00
2023-05-13 20:57:58 +01:00
def get_part_of(
    table_name: str, src_id: int, bbox: geoalchemy2.functions.ST_MakeEnvelope
) -> list[dict[str, typing.Any]]:
    """Find named polygons that the given OSM object is part of.

    A polygon qualifies when it intersects bbox, covers the geometry of the
    source object, has a name tag and has a landuse or amenity tag.

    table_name selects the source table ("point", "line" or "polygon") and
    src_id is the osm_id of the source object in that table.

    Each result is a dict with "type" ("way" for a positive osm_id, otherwise
    "relation"), "id" (absolute osm_id), "tags" and "area".
    """
    source = {"point": point, "line": line, "polygon": polygon}[table_name].alias()
    polygon_tags: Mapped[postgresql.HSTORE] = polygon.c.tags
    merged_area = sqlalchemy.func.ST_Area(sqlalchemy.func.ST_Collect(polygon.c.way))

    conditions = and_(
        sqlalchemy.func.ST_Intersects(bbox, polygon.c.way),
        sqlalchemy.func.ST_Covers(polygon.c.way, source.c.way),
        source.c.osm_id == src_id,
        polygon_tags.has_key("name"),
        or_(polygon_tags.has_key("landuse"), polygon_tags.has_key("amenity")),
    )
    query = (
        select(polygon.c.osm_id, polygon.c.tags, merged_area)
        .where(conditions)
        .group_by(polygon.c.osm_id, polygon.c.tags)
    )

    part_of = []
    for osm_id, tags, area in database.session.connection().execute(query):
        part_of.append(
            {
                "type": "way" if osm_id > 0 else "relation",
                "id": abs(osm_id),
                "tags": tags,
                "area": area,
            }
        )
    return part_of
2021-06-25 13:52:42 +01:00
2022-04-18 12:26:04 +01:00
2023-05-13 20:57:58 +01:00
def get_and_save_item(qid: str) -> model.Item | None:
    """Download an item from Wikidata and cache it in the database.

    If Wikidata answers with a different ID than the one requested (a
    redirect), nothing is saved and the stored item for the returned ID is
    looked up instead. Returns None when the entity has no claims.
    """
    entity = wikidata_api.get_entity(qid)

    returned_qid = entity["id"]
    if returned_qid != qid:
        print(f"redirect {qid} -> {returned_qid}")
        return model.Item.query.get(returned_qid[1:])

    if "claims" not in entity:
        return None

    coords = wikidata.get_entity_coords(entity["claims"])
    fields = {key: value for key, value in entity.items() if key in entity_keys}
    try:
        item = model.Item(item_id=int(qid[1:]), **fields)
    except TypeError:
        # Dump enough of the entity to see what could not be stored.
        print(qid)
        print(f'{entity["pageid"]=} {entity["ns"]=} {entity["type"]=}')
        print(entity.keys())
        raise
    assert item

    item.locations = model.location_objects(coords)
    database.session.add(item)
    database.session.commit()
    return item
2023-05-14 21:40:16 +01:00
def get_isa_count(items: list[model.Item]) -> list[tuple[str, int]]:
    """Count the IsA (P31) values across items, most common first.

    Falsy entries in items are ignored. An empty P31 value is reported on
    stdout and not counted.
    """
    isa_count: collections.Counter[str] = collections.Counter()
    for item in filter(None, items):
        for isa in item.get_claim("P31"):
            if isa:
                assert isinstance(isa, dict) and isinstance(isa["id"], str)
                isa_count[isa["id"]] += 1
            else:
                print("missing IsA:", item.qid)
    return isa_count.most_common()
2023-05-15 16:30:06 +01:00
def get_items_in_bbox(bbox: list[float]):
    """Build a query for the items that have a location inside bbox.

    The query is returned without being executed; item locations are set
    to load with selectinload.
    """
    inside_bbox = sqlalchemy.func.ST_Covers(
        make_envelope(bbox), model.ItemLocation.location
    )
    load_locations = sqlalchemy.orm.selectinload(model.Item.locations)
    return (
        model.Item.query.join(model.ItemLocation)
        .filter(inside_bbox)
        .options(load_locations)
    )
2021-07-30 15:02:41 +01:00
def get_osm_with_wikidata_tag(bbox, isa_filter=None):
    """Find OSM objects within bbox that carry a wikidata tag.

    Points, lines and polygons intersecting bbox are searched. Polygons
    whose area is 20 or more times the area of bbox are left out.

    When isa_filter is given, only objects whose wikidata tag names an item
    located in bbox and matching the filter are returned; if no item
    matches, the result is an empty list.

    Each result is a dict with the keys "identifier", "id", "type",
    "geojson", "centroid", "name" and "wikidata".
    """
    # The bounds are interpolated into the SQL below; coercing to float
    # guarantees only numeric literals end up in the statement.
    bbox_str = ",".join(str(float(v)) for v in bbox)

    extra_sql = ""
    if isa_filter:
        q = model.Item.query.join(model.ItemLocation).filter(
            sqlalchemy.func.ST_Covers(make_envelope(bbox), model.ItemLocation.location)
        )
        q = add_isa_filter(q, isa_filter)
        qids = [isa.qid for isa in q]
        if not qids:
            return []

        qid_list = ",".join(f"'{qid}'" for qid in qids)
        extra_sql += f" AND tags -> 'wikidata' in ({qid_list})"

    # easier than building this query with SQLAlchemy
    sql = (
        f"""
SELECT tbl, osm_id, tags, ARRAY[ST_Y(centroid), ST_X(centroid)], geojson
FROM (
SELECT 'point' as tbl, osm_id, tags, ST_AsText(ST_Centroid(way)) as centroid, ST_AsGeoJSON(way) as geojson
FROM planet_osm_point
WHERE ST_Intersects(ST_MakeEnvelope({bbox_str}, {srid}), way)
UNION
SELECT 'line' as tbl, osm_id, tags, ST_AsText(ST_Centroid(ST_Collect(way))) AS centroid, ST_AsGeoJSON(ST_Collect(way)) AS geojson
FROM planet_osm_line
WHERE ST_Intersects(ST_MakeEnvelope({bbox_str}, {srid}), way)
GROUP BY osm_id, tags
UNION
SELECT 'polygon' as tbl, osm_id, tags, ST_AsText(ST_Centroid(ST_Collect(way))) AS centroid, ST_AsGeoJSON(ST_Collect(way)) AS geojson
FROM planet_osm_polygon
WHERE ST_Intersects(ST_MakeEnvelope({bbox_str}, {srid}), way)
GROUP BY osm_id, tags
HAVING st_area(st_collect(way)) < 20 * st_area(ST_MakeEnvelope({bbox_str}, {srid}))
) as anon
WHERE tags ? 'wikidata'
"""
        + extra_sql
    )
    conn = database.session.connection()
    result = conn.execute(sqlalchemy.text(sql))

    tagged = []
    for tbl, osm_id, tags, centroid, geojson in result:
        if tbl == "point":
            osm_type = "node"
        else:
            # Ways have a positive osm_id here, relations a negative one.
            osm_type = "way" if osm_id > 0 else "relation"
        osm_id = abs(osm_id)

        name = tags.get("name") or tags.get("addr:housename") or "[no label]"

        tagged.append(
            {
                "identifier": f"{osm_type}/{osm_id}",
                "id": osm_id,
                "type": osm_type,
                "geojson": json.loads(geojson),
                "centroid": centroid,
                "name": name,
                "wikidata": tags["wikidata"],
            }
        )
    return tagged
2023-05-14 21:29:24 +01:00
def get_items(item_ids: list[int]) -> list[model.Item]:
    """Get the Wikidata items with the given item IDs.

    Items missing from the database are downloaded and saved first, then
    read back from the database. IDs that cannot be downloaded are left
    out of the result.
    """
    found = []
    for item_id in item_ids:
        item = model.Item.query.get(item_id)
        if item:
            found.append(item)
        elif get_and_save_item(f"Q{item_id}"):
            found.append(model.Item.query.get(item_id))
    return found
2023-05-14 21:40:16 +01:00
class IsaPath(typing.TypedDict):
    """Component of an IsA path."""

    # QID of the item at this step of the path.
    qid: str
    # Label of that item.
    label: str
2023-05-14 21:29:24 +01:00
def get_item_tags(item: model.Item) -> dict[str, list[str]]:
    """Collect the OSM tags and keys that apply to an item.

    Starting from the item's IsA values, the item hierarchy is walked via
    the "subclass of", "religion", "sport", "use" and "facet of"
    properties. Every OSM tag or key found on a visited item (its P1282
    claims, minus skip_tags, plus any ItemExtraKeys rows) is recorded.

    Items in the SkipIsA table are never visited. For tram stops, airports
    and aerodromes the "building" item is skipped as well.

    Returns a dict keyed by OSM tag or key. Each value is a list with one
    entry per route by which the tag was reached; an entry is the list of
    {"qid", "label"} steps from the item's IsA value to the item carrying
    the tag.
    NOTE(review): the declared value type is list[str], which does not
    describe these path lists; the annotation is left as-is for callers.
    """
    isa_list: list[int] = [typing.cast(int, v["numeric-id"]) for v in item.get_isa()]
    isa_items: list[tuple[model.Item, list[IsaPath]]] = [
        (isa, []) for isa in get_items(isa_list)
    ]

    osm_list = collections.defaultdict(list)

    skip_isa: set[int] = {
        row[0] for row in database.session.query(model.SkipIsA.item_id)
    }

    # Q41176 is building, so it cannot also be used as the tram stop ID.
    tram_stop_id = 2175765  # tram stop (Q2175765)
    airport_id = 1248784
    aerodrome_id = 62447
    building_id = 41176  # building (Q41176)
    if {tram_stop_id, airport_id, aerodrome_id} & set(isa_list):
        skip_isa.add(building_id)

    seen: set[int] = set(isa_list) | skip_isa
    stop = {
        "Q11799049": "public institution",
        "Q7075": "library",
        "Q329683": "industrial park",
    }
    # Properties followed when walking up the item hierarchy.
    properties = [
        ("P279", "subclass of"),
        ("P140", "religion"),
        ("P641", "sport"),
        ("P366", "use"),
        ("P1269", "facet of"),
        # ("P361", "part of"),
    ]

    while isa_items:
        isa, isa_path = isa_items.pop()
        if not isa:
            continue
        isa_qid: str = typing.cast(str, isa.qid)
        # Build a new list so paths held by other queue entries are not changed.
        isa_path = isa_path + [{"qid": isa_qid, "label": isa.label()}]
        osm: list[str] = [
            typing.cast(str, v) for v in isa.get_claim("P1282") if v not in skip_tags
        ]

        osm += [
            extra.tag_or_key
            for extra in model.ItemExtraKeys.query.filter_by(item_id=isa.item_id)
        ]

        for i in osm:
            osm_list[i].append(isa_path[:])

        if isa_qid in stop:
            # item is specific enough, no need to keep walking the item hierarchy
            continue

        check: set[int] = set()
        for pid, _label in properties:
            check |= {
                typing.cast(dict[str, int], v)["numeric-id"]
                for v in (isa.get_claim(pid) or [])
                if v
            }

        print(isa.qid, isa.label(), check)
        isa_list_set = check - seen
        seen.update(isa_list_set)
        isa_items += [(isa, isa_path) for isa in get_items(isa_list_set)]
    return {key: list(values) for key, values in osm_list.items()}
2021-06-25 13:52:42 +01:00
2021-11-13 10:44:56 +00:00
def get_tags_for_isa_item(item):
    """Collect the OSM tags and keys for an IsA item, plus the items checked.

    The walk starts at the item itself and follows the "subclass of",
    "religion", "sport", "use" and "facet of" properties. Every OSM tag or
    key found on a visited item (its P1282 claims, minus skip_tags, plus any
    ItemExtraKeys rows) is recorded.

    Items in the SkipIsA table are never visited. If the item is tram stop,
    airport or aerodrome, the "building" item is skipped as well.

    Returns a dict with two keys: "tags" maps each OSM tag or key to the
    list of paths ({"qid", "label"} steps) by which it was reached, and
    "checked" lists each visited item once as {"qid", "label"}.
    """
    isa_list = [item.item_id]
    isa_items = [(item, [])]

    osm_list = collections.defaultdict(list)

    skip_isa = {row[0] for row in database.session.query(model.SkipIsA.item_id)}

    # Q41176 is building, so it cannot also be used as the tram stop ID.
    tram_stop_id = 2175765  # tram stop (Q2175765)
    airport_id = 1248784
    aerodrome_id = 62447
    building_id = 41176  # building (Q41176)
    if {tram_stop_id, airport_id, aerodrome_id} & set(isa_list):
        skip_isa.add(building_id)

    seen = set(isa_list) | skip_isa
    stop = {
        "Q11799049": "public institution",
        "Q7075": "library",
        "Q329683": "industrial park",
    }
    # Properties followed when walking up the item hierarchy.
    properties = [
        ("P279", "subclass of"),
        ("P140", "religion"),
        ("P641", "sport"),
        ("P366", "use"),
        ("P1269", "facet of"),
        # ("P361", "part of"),
    ]

    items_checked = []
    items_checked_done = set()

    while isa_items:
        isa, isa_path = isa_items.pop()
        if not isa:
            continue
        # Build a new list so paths held by other queue entries are not changed.
        isa_path = isa_path + [{"qid": isa.qid, "label": isa.label()}]
        if isa.item_id not in items_checked_done:
            items_checked.append({"qid": isa.qid, "label": isa.label()})
            items_checked_done.add(isa.item_id)

        osm = [v for v in isa.get_claim("P1282") if v not in skip_tags]
        osm += [
            extra.tag_or_key
            for extra in model.ItemExtraKeys.query.filter_by(item_id=isa.item_id)
        ]

        for i in osm:
            osm_list[i].append(isa_path[:])

        if isa.qid in stop:
            # item is specific enough, no need to keep walking the item hierarchy
            continue

        check = set()
        for pid, _label in properties:
            check |= {v["numeric-id"] for v in (isa.get_claim(pid) or []) if v}

        print(isa.qid, isa.label(), check)
        unseen = check - seen
        seen.update(unseen)
        isa_items += [(isa, isa_path) for isa in get_items(unseen)]

    return {
        "tags": {key: list(values) for key, values in osm_list.items()},
        "checked": items_checked,
    }
2021-07-30 15:02:41 +01:00
def add_isa_filter(q, isa_qids):
    """Restrict query q to items that are an instance of one of isa_qids.

    Items whose P279 claims name one of isa_qids are looked up first, so
    that instances of those direct subclasses match as well. isa_qids is
    combined with that result using ``|``, so it has to be a set.
    """
    subclass_of = sqlalchemy.func.jsonb_path_query_array(
        model.Item.claims,
        "$.P279[*].mainsnak.datavalue.value.id",
    )
    subclass_rows = (
        database.session.query(model.Item.qid)
        .filter(subclass_of.bool_op("?|")(list(isa_qids)))
        .all()
    )
    wanted_qids = isa_qids | {row[0] for row in subclass_rows}

    instance_of = sqlalchemy.func.jsonb_path_query_array(
        model.Item.claims,
        "$.P31[*].mainsnak.datavalue.value.id",
    )
    return q.filter(instance_of.bool_op("?|")(list(wanted_qids)))
def wikidata_items_count(bounds, isa_filter=None):
    """Count the items with a location inside bounds.

    When isa_filter is given, only items matching it are counted.
    """
    inside_bounds = sqlalchemy.func.ST_Covers(
        make_envelope(bounds), model.ItemLocation.location
    )
    query = model.Item.query.join(model.ItemLocation).filter(inside_bounds)
    if not isa_filter:
        return query.count()
    return add_isa_filter(query, isa_filter).count()
2023-05-13 20:57:58 +01:00
def wikidata_isa_counts(bounds, isa_filter=None):
    """Summarise the IsA values of the items located inside bounds.

    Returns a list of {"qid", "count", "label"} dicts in the order given by
    get_isa_count. IsA items missing from the database are downloaded; if
    that fails the label is "[missing]".
    """
    inside_bounds = sqlalchemy.func.ST_Covers(
        make_envelope(bounds), model.ItemLocation.location
    )
    query = model.Item.query.join(model.ItemLocation).filter(inside_bounds)
    if isa_filter:
        query = add_isa_filter(query, isa_filter)

    counts = get_isa_count(query.all())

    # Fetch all IsA items already in the database with a single query.
    numeric_ids = [qid[1:] for qid, _ in counts]
    known_items = {
        isa.qid: isa
        for isa in model.Item.query.filter(model.Item.item_id.in_(numeric_ids))
    }

    summary = []
    for qid, count in counts:
        item = known_items.get(qid) or get_and_save_item(qid)
        summary.append(
            {
                "qid": qid,
                "count": count,
                "label": item.label() if item else "[missing]",
            }
        )
    return summary
2023-05-13 20:57:58 +01:00
2023-05-17 17:28:44 +01:00
def get_tag_filter(
    tags: sqlalchemy.sql.schema.Column, tag_list: list[str]
) -> list[sqlalchemy.sql.elements.BooleanClauseList]:
    """Build the SQL conditions that match any entry of tag_list.

    A "Key:<key>" entry matches when the key is present with a value other
    than "no". A "Tag:<key>=<value>" entry matches that exact key and value.
    When tag_list has fewer than 10 entries, the same key with each of the
    tag_prefixes in front ("<prefix>:<key>") is matched too.
    """
    prefixes = tag_prefixes if len(tag_list) < 10 else ()

    conditions = []
    for tag_or_key in tag_list:
        marker, spec = tag_or_key[:4], tag_or_key[4:]
        if marker == "Key:":
            conditions.append(and_(tags.has_key(spec), tags[spec] != "no"))
            conditions.extend(tags.has_key(f"{prefix}:{spec}") for prefix in prefixes)
        elif marker == "Tag:":
            k, _, v = spec.partition("=")
            conditions.append(tags[k] == v)
            conditions.extend(tags[f"{prefix}:{k}"] == v for prefix in prefixes)
    return conditions
2023-05-17 17:28:44 +01:00
def get_preset_translations() -> dict[str, typing.Any]:
    """Load translated preset names for the country of the current request.

    The country codes stored in flask.g.alpha2_codes are mapped to a
    language code, and the matching translation file is read from
    "dist/translations" under the ID_TAGGING_SCHEMA_DIR config setting.
    The presets of the first usable translation are returned.

    Returns an empty dict when no country codes are stored, no country has
    a language mapping, or the translation file is missing or lacks the
    expected structure.
    """
    app = flask.current_app
    country_language = {
        "AU": "en-AU",  # Australia
        "GB": "en-GB",  # United Kingdom
        "IE": "en-GB",  # Ireland
        "IN": "en-IN",  # India
        "NZ": "en-NZ",  # New Zealand
    }
    ts_dir = app.config["ID_TAGGING_SCHEMA_DIR"]
    translation_dir = os.path.join(ts_dir, "dist", "translations")

    # alpha2_codes is only set once a country lookup has run in this context.
    for code in flask.g.get("alpha2_codes", ()):
        lang_code = country_language.get(code)
        if not lang_code:
            continue
        filename = os.path.join(translation_dir, lang_code + ".json")
        try:
            with open(filename, encoding="utf-8") as translation_file:
                json_data = json.load(translation_file)
        except FileNotFoundError:
            # No translation available; fall back to the next country.
            continue

        try:
            return typing.cast(
                dict[str, typing.Any], json_data[lang_code]["presets"]["presets"]
            )
        except KeyError:
            continue

    return {}
2023-05-13 20:57:58 +01:00
2023-05-14 22:04:26 +01:00
def get_presets_from_tags(ending: str, tags: TagsType) -> list[dict[str, typing.Any]]:
    """Find the presets that match the given OSM tags.

    ending is passed to find_preset_file, which uses it as a filename
    suffix ("<value>_<ending>.json") when looking for a preset.

    Each result is a dict with "tag_or_key" and "name"; entries that come
    from a preset file also have "preset". The name is taken from the
    translations when the preset has one, otherwise from the preset file.
    A clock with display=sundial is reported as "Sundial" without a preset
    lookup.
    """
    translations = get_preset_translations()

    found: list[dict[str, typing.Any]] = []
    for k, v in tags.items():
        if k == "amenity" and v == "clock" and tags.get("display") == "sundial":
            tag_or_key = f"Tag:{k}={v}"
            found.append({"tag_or_key": tag_or_key, "name": "Sundial"})
            continue

        preset_match = find_preset_file(k, v, ending)
        if not preset_match:
            continue

        preset = preset_match["preset"]
        if preset in translations:
            preset_match["name"] = translations[preset]["name"]
        else:
            # Close the preset file as soon as it has been read.
            with open(preset_match["filename"], encoding="utf-8") as preset_file:
                preset_match["name"] = json.load(preset_file)["name"]

        del preset_match["filename"]

        found.append(preset_match)

    return found
2023-05-14 22:04:26 +01:00
def find_preset_file(k: str, v: str, ending: str) -> dict[str, str] | None:
    """Find the preset file for an OSM key and value.

    Files are looked for under "data/presets" in ID_TAGGING_SCHEMA_DIR, in
    this order: "<k>/<v>.json", "<k>/<v>_<ending>.json", "<k>/_<v>.json"
    and finally "<k>.json". The first one that exists wins.

    Returns a dict with "tag_or_key", "preset" and "filename", or None when
    no file exists.
    """
    ts_dir = flask.current_app.config["ID_TAGGING_SCHEMA_DIR"]
    preset_dir = os.path.join(ts_dir, "data", "presets")

    # Files specific to this key and value.
    for basename in (v + ".json", f"{v}_{ending}.json", "_" + v + ".json"):
        filename = os.path.join(preset_dir, k, basename)
        if os.path.exists(filename):
            return {
                "tag_or_key": f"Tag:{k}={v}",
                "preset": f"{k}/{v}",
                "filename": filename,
            }

    # File for the key alone.
    filename = os.path.join(preset_dir, k + ".json")
    if not os.path.exists(filename):
        return None
    return {
        "tag_or_key": f"Key:{k}",
        "preset": k,
        "filename": filename,
    }
2021-06-25 13:52:42 +01:00
2023-05-14 22:04:26 +01:00
def address_from_tags(tags: TagsType) -> str | None:
    """Build a street address from the addr:street and addr:housenumber tags.

    Returns None unless both tags are present. The house number comes first
    when flask.g.street_number_first is true, otherwise the street does.
    """
    if "addr:street" not in tags or "addr:housenumber" not in tags:
        return None

    if flask.g.street_number_first:
        parts = (tags["addr:housenumber"], tags["addr:street"])
    else:
        parts = (tags["addr:street"], tags["addr:housenumber"])
    return " ".join(parts)
2023-05-14 22:04:26 +01:00
def address_node_label(tags: TagsType) -> str | None:
    """Label for an OSM node: its address, preceded by its name if it has one."""
    address = address_from_tags(tags)
    if "name" not in tags:
        return address
    return f"{tags['name']} ({address})"
2021-06-25 13:52:42 +01:00
def get_address_nodes_within_building(osm_id, bbox_list):
    """Return the tags of the address nodes inside a building polygon.

    A node qualifies when it is covered by the polygon with the given
    osm_id, intersects at least one envelope in bbox_list and has both the
    addr:street and addr:housenumber tags.
    """
    in_any_bbox = or_(
        *(sqlalchemy.func.ST_Intersects(bbox, model.Point.way) for bbox in bbox_list)
    )
    inside_building = sqlalchemy.func.ST_Covers(polygon.c.way, model.Point.way)

    nodes = model.Point.query.filter(
        polygon.c.osm_id == osm_id,
        in_any_bbox,
        inside_building,
        model.Point.tags.has_key("addr:street"),
        model.Point.tags.has_key("addr:housenumber"),
    )
    return [node.tags for node in nodes]
2023-05-14 22:05:07 +01:00
def osm_display_name(tags: TagsType) -> str | None:
    """Get the name to display for an OSM object from its tags.

    The value of the first of these tags that is present is returned, or
    None when none of them is.
    """
    preferred_keys = (
        "bridge:name",
        "tunnel:name",
        "lock_name",
        "name",
        "addr:housename",
        "inscription",
    )
    for key in preferred_keys:
        if key in tags:
            return tags[key]
    return None
def street_address_in_tags(tags):
    """Tell whether tags has both addr:housenumber and addr:street."""
    return all(key in tags for key in ("addr:housenumber", "addr:street"))
2023-05-13 20:57:58 +01:00
2021-10-15 09:49:08 +01:00
def find_osm_candidates(item, limit=80, max_distance=450, names=None):
    """Find OSM objects near a Wikidata item that could represent it.

    An envelope of size max_distance is built around each location of the
    item. Points, lines and polygons intersecting an envelope and matching
    the OSM tags of the item (see get_item_tags) are candidates. Points are
    not searched for linear features, and polygons of 20 or more times the
    area of the first envelope are left out.

    Candidates are ordered by distance and restricted to those closer than
    max_distance (presumably metres, as computed by ST_DistanceSphere -
    TODO confirm). When names is given only objects whose name or old_name
    is in names are kept. limit caps the number of rows; a falsy limit
    means no cap.

    Returns a list of dicts, one per candidate, nearest first. An item
    without locations has no candidates and gives an empty list.
    """
    if not item.locations:
        # Nothing to search around; the lookups below need a first location.
        return []

    item_id = item.item_id
    item_is_linear_feature = item.is_linear_feature()
    item_is_street = item.is_street()
    item_names_dict = item.names()
    if item_names_dict:
        item_names = {n.lower() for n in item_names_dict.keys()}
    else:
        item_names = set()

    check_is_street_number_first(item.locations[0].get_lat_lon())

    bbox_list = [
        make_envelope_around_point(*loc.get_lat_lon(), max_distance)
        for loc in item.locations
    ]

    null_area = sqlalchemy.sql.expression.cast(None, sqlalchemy.types.Float)
    dist = sqlalchemy.sql.expression.column("dist")
    tags = sqlalchemy.sql.expression.column(
        "tags", sqlalchemy.dialects.postgresql.HSTORE
    )

    tag_list = get_item_tags(item)

    s_point = (
        select(
            sqlalchemy.sql.expression.literal("point").label("t"),
            point.c.osm_id,
            point.c.tags.label("tags"),
            sqlalchemy.func.min(
                sqlalchemy.func.ST_DistanceSphere(
                    model.ItemLocation.location, point.c.way
                )
            ).label("dist"),
            sqlalchemy.func.ST_AsText(point.c.way),
            sqlalchemy.func.ST_AsGeoJSON(point.c.way),
            null_area,
        )
        .where(
            and_(
                or_(
                    *[
                        sqlalchemy.func.ST_Intersects(bbox, point.c.way)
                        for bbox in bbox_list
                    ]
                ),
                model.ItemLocation.item_id == item_id,
                or_(*get_tag_filter(point.c.tags, tag_list)),
            )
        )
        .group_by(point.c.osm_id, point.c.tags, point.c.way)
    )

    s_line = (
        select(
            sqlalchemy.sql.expression.literal("line").label("t"),
            line.c.osm_id,
            line.c.tags.label("tags"),
            sqlalchemy.func.min(
                sqlalchemy.func.ST_DistanceSphere(
                    model.ItemLocation.location, line.c.way
                )
            ).label("dist"),
            sqlalchemy.func.ST_AsText(
                sqlalchemy.func.ST_Centroid(sqlalchemy.func.ST_Collect(line.c.way))
            ),
            sqlalchemy.func.ST_AsGeoJSON(sqlalchemy.func.ST_Collect(line.c.way)),
            null_area,
        )
        .where(
            and_(
                or_(
                    *[
                        sqlalchemy.func.ST_Intersects(bbox, line.c.way)
                        for bbox in bbox_list
                    ]
                ),
                model.ItemLocation.item_id == item_id,
                or_(*get_tag_filter(line.c.tags, tag_list)),
            )
        )
        .group_by(line.c.osm_id, line.c.tags)
    )

    s_polygon = (
        select(
            sqlalchemy.sql.expression.literal("polygon").label("t"),
            polygon.c.osm_id,
            polygon.c.tags.label("tags"),
            sqlalchemy.func.min(
                sqlalchemy.func.ST_DistanceSphere(
                    model.ItemLocation.location, polygon.c.way
                )
            ).label("dist"),
            sqlalchemy.func.ST_AsText(
                sqlalchemy.func.ST_Centroid(sqlalchemy.func.ST_Collect(polygon.c.way))
            ),
            sqlalchemy.func.ST_AsGeoJSON(sqlalchemy.func.ST_Collect(polygon.c.way)),
            sqlalchemy.func.ST_Area(sqlalchemy.func.ST_Collect(polygon.c.way)),
        )
        .where(
            and_(
                or_(
                    *[
                        sqlalchemy.func.ST_Intersects(bbox, polygon.c.way)
                        for bbox in bbox_list
                    ]
                ),
                model.ItemLocation.item_id == item_id,
                or_(*get_tag_filter(polygon.c.tags, tag_list)),
            )
        )
        .group_by(polygon.c.osm_id, polygon.c.tags)
        .having(
            sqlalchemy.func.ST_Area(sqlalchemy.func.ST_Collect(polygon.c.way))
            < 20 * sqlalchemy.func.ST_Area(bbox_list[0])
        )
    )

    tables = ([] if item_is_linear_feature else [s_point]) + [s_line, s_polygon]
    s = (
        select(sqlalchemy.sql.expression.union(*tables).alias())
        .where(dist < max_distance)
        .order_by(dist)
    )

    if names:
        s = s.where(or_(tags["name"].in_(names), tags["old_name"].in_(names)))

    if item_is_street:
        s = s.where(tags["highway"] != "bus_stop")
        if not names:
            s = s.where(tags.has_key("name"))

    if "Key:amenity" in tag_list:
        s = s.where(
            and_(
                tags["amenity"] != "bicycle_parking",
                tags["amenity"] != "bicycle_repair_station",
                tags["amenity"] != "atm",
                tags["amenity"] != "recycling",
            )
        )

    if limit:
        s = s.limit(limit)

    conn = database.session.connection()
    nearby = []
    for table, src_id, osm_tags, distance, centroid, geojson, area in conn.execute(s):
        # Nodes come from the point table; otherwise a positive ID is a way
        # and a negative ID is a relation.
        osm_id = src_id
        if table == "point":
            osm_type = "node"
        elif osm_id > 0:
            osm_type = "way"
        else:
            osm_type = "relation"
            osm_id = -osm_id

        osm_tags.pop("way_area", None)
        name = osm_display_name(osm_tags)
        if not name and street_address_in_tags(osm_tags):
            name = address_from_tags(osm_tags)

        if table == "polygon" and "building" in osm_tags:
            address_nodes = get_address_nodes_within_building(src_id, bbox_list)
            address_list = [address_node_label(addr) for addr in address_nodes]
        else:
            address_list = []

        shape = "area" if table == "polygon" else table

        cur = {
            "identifier": f"{osm_type}/{osm_id}",
            "type": osm_type,
            "id": osm_id,
            "distance": distance,
            "name": name,
            "name_match": (name and name.lower() in item_names),
            "tags": osm_tags,
            "geojson": json.loads(geojson),
            "presets": get_presets_from_tags(shape, osm_tags),
            "address_list": address_list,
            # parse_point gives (lon, lat); the centroid is stored as [lat, lon].
            "centroid": list(reversed(parse_point(centroid))),
        }
        if area is not None:
            cur["area"] = area

        part_of = []
        for bbox in bbox_list:
            part_of += [
                i for i in get_part_of(table, src_id, bbox) if i["tags"]["name"] != name
            ]
        if part_of:
            cur["part_of"] = part_of

        if address := address_from_tags(typing.cast(TagsType, osm_tags)):
            cur["address"] = address

        nearby.append(cur)

    return nearby
2022-04-18 12:26:04 +01:00
2023-05-14 22:04:26 +01:00
def get_item(item_id: int) -> model.Item | None:
    """Get a Wikidata item from the database, downloading it when not stored."""
    stored = model.Item.query.get(item_id)
    if stored:
        return stored
    return get_and_save_item(f"Q{item_id}")
2021-06-25 13:52:42 +01:00
2023-05-14 22:04:26 +01:00
def get_item_street_addresses(item: model.Item) -> list[str]:
    """Hunt for street addresses for the given item.

    The text of the item's P6375 claims is used when there is any.
    Otherwise an address is built from each P669 claim (the street item)
    and the values of its P670 qualifiers (the numbers), ordered according
    to flask.g.street_number_first.

    P670 qualifiers without a datavalue are ignored; a P669 claim with no
    usable P670 value is reported on stdout and skipped.
    """
    street_address = [addr["text"] for addr in item.get_claim("P6375") if addr]
    if street_address or "P669" not in item.claims:
        return street_address

    assert isinstance(item.claims, dict)
    claims: wikidata.Claims = item.claims

    for claim in claims["P669"]:
        qualifiers = claim.get("qualifiers")
        if not qualifiers or "P670" not in qualifiers:
            continue

        # qualifiers["P670"] is a list, so datavalue has to be checked on
        # each qualifier rather than on the list itself.
        numbers = [
            q["datavalue"]["value"] for q in qualifiers["P670"] if "datavalue" in q
        ]
        if not numbers:
            print(f"datavalue missing in P670 for {item.qid}")
            continue

        street_item = get_item(claim["mainsnak"]["datavalue"]["value"]["numeric-id"])
        assert street_item
        street = street_item.label()
        for number in numbers:
            address = (
                f"{number} {street}"
                if flask.g.street_number_first
                else f"{street} {number}"
            )
            street_address.append(address)

    return street_address
2023-05-13 20:57:58 +01:00
def check_is_street_number_first(latlng):
    """Store on flask.g whether addresses at latlng put the street number first.

    latlng is a (lat, lon) pair; the result is saved as
    flask.g.street_number_first.
    """
    number_first = is_street_number_first(*latlng)
    flask.g.street_number_first = number_first
2021-06-25 13:52:42 +01:00
2023-05-13 20:57:58 +01:00
class ItemDetailType(typing.TypedDict, total=False):
2023-10-31 14:41:59 +00:00
"""Details of an item as a dict."""
qid: str
label: str
description: str | None
2023-10-31 14:41:59 +00:00
markers: list[dict[str, float]]
image_list: list[str]
street_address: list[str]
isa_list: list[dict[str, str]]
closed: list[str]
2023-10-31 14:41:59 +00:00
inception: str
p1619: str
p576: str
heritage_designation: str
wikipedia: list[dict[str, str]]
identifiers: dict[str, list[str]]
2023-10-31 14:41:59 +00:00
def item_detail(item: model.Item) -> ItemDetailType:
    """Get detail for an item, returns a dict.

    If ``flask.g.street_number_first`` has not been set yet it is derived from
    the first location of the item, so in that case the item needs at least
    one location.

    The keys ``aliases``, ``commons`` and ``unsupported_relation_types`` are
    only added when there is something to report for them.
    """
    unsupported_relation_types = {
        "Q194356",  # wind farm
        "Q2175765",  # tram stop
    }

    locations = [list(i.get_lat_lon()) for i in item.locations]
    if not hasattr(flask.g, "street_number_first"):
        flask.g.street_number_first = is_street_number_first(*locations[0])

    image_filenames = item.get_claim("P18")
    street_address = get_item_street_addresses(item)

    heritage_designation = []
    for v in item.get_claim("P1435"):
        if not v:
            print("heritage designation missing:", item.qid)
            continue
        heritage_designation_item = get_item(v["numeric-id"])
        # The lookup may come back empty; use the same placeholder label as
        # the IsA counts rather than failing on ``None.label()``.
        heritage_designation.append(
            {
                "qid": v["id"],
                "label": (
                    heritage_designation_item.label()
                    if heritage_designation_item
                    else "[missing]"
                ),
            }
        )

    isa_items = [get_item(isa["numeric-id"]) for isa in item.get_isa()]
    isa_lookup = {isa.qid: isa for isa in isa_items if isa}

    # Keep sitelinks whose site name ends in "wiki" and is shorter than eight
    # characters; the language code is the site name without that suffix.
    wikipedia_links = [
        {"lang": site[:-4], "title": link["title"]}
        for site, link in sorted(item.sitelinks.items())
        if site.endswith("wiki") and len(site) < 8
    ]

    d: ItemDetailType = {
        "qid": item.qid,
        "label": item.label(),
        "description": item.description(),
        "markers": locations,
        "image_list": image_filenames,
        "street_address": street_address,
        "isa_list": [
            {"qid": isa.qid, "label": isa.label()} for isa in isa_items if isa
        ],
        "closed": item.closed(),
        "inception": item.time_claim("P571"),
        "p1619": item.time_claim("P1619"),
        "p576": item.time_claim("P576"),
        "heritage_designation": heritage_designation,
        "wikipedia": wikipedia_links,
        "identifiers": item.get_identifiers(),
    }

    if aliases := item.get_aliases():
        d["aliases"] = aliases

    if "commonswiki" in item.sitelinks:
        d["commons"] = item.sitelinks["commonswiki"]["title"]

    unsupported = isa_lookup.keys() & unsupported_relation_types
    if unsupported:
        # Report only the IsA entries that are actually unsupported, not every
        # IsA entry of the item.
        d["unsupported_relation_types"] = [
            isa for isa in d["isa_list"] if isa["qid"] in unsupported
        ]

    return d
2021-06-25 13:52:42 +01:00
def get_markers(all_items):
    """Return the ``item_detail`` dict of every truthy entry in all_items."""
    markers = []
    for candidate in all_items:
        if not candidate:
            # Falsy entries (e.g. None for an item that was not found) are
            # left out.
            continue
        markers.append(item_detail(candidate))
    return markers
2021-06-25 13:52:42 +01:00
2021-07-30 15:02:41 +01:00
def _isa_count_details(db_items):
    """Return one ``{"qid", "count", "label"}`` dict per IsA count of db_items.

    IsA items are looked up in the database with a single query; any that are
    not found are fetched with ``get_and_save_item``.  The label is
    ``"[missing]"`` when the item cannot be obtained either way.
    """
    counts = get_isa_count(db_items)
    # item_id is the QID without its leading letter.
    isa_ids = [qid[1:] for qid, _ in counts]
    isa_items = {
        isa.qid: isa for isa in model.Item.query.filter(model.Item.item_id.in_(isa_ids))
    }

    isa_count = []
    for qid, count in counts:
        item = isa_items.get(qid) or get_and_save_item(qid)
        isa_count.append(
            {
                "qid": qid,
                "count": count,
                "label": item.label() if item else "[missing]",
            }
        )
    return isa_count


def wikidata_items(bounds, isa_filter=None):
    """Return item details and IsA counts for the items within bounds.

    The street number order is determined from the centroid of bounds.  When
    isa_filter is given the query is narrowed with ``add_isa_filter``.
    """
    check_is_street_number_first(get_bbox_centroid(bounds))
    q = get_items_in_bbox(bounds)

    if isa_filter:
        q = add_isa_filter(q, isa_filter)

    db_items = q.all()
    items = get_markers(db_items)
    isa_count = _isa_count_details(db_items)

    return {"items": items, "isa_count": isa_count}


def missing_wikidata_items(qids, lat, lon):
    """Return item details and IsA counts for the items named by qids.

    Items that are not in the database are fetched with ``get_and_save_item``.
    The street number order is determined from lat and lon.
    """
    flask.g.street_number_first = is_street_number_first(lat, lon)

    db_items = []
    for qid in qids:
        # The primary key is the QID without its leading letter.
        item = model.Item.query.get(qid[1:])
        if not item:
            item = get_and_save_item(qid)
        db_items.append(item)

    items = get_markers(db_items)
    isa_count = _isa_count_details(db_items)

    return {"items": items, "isa_count": isa_count}
2021-07-30 15:02:41 +01:00
2023-05-13 20:57:58 +01:00
2023-05-17 17:28:44 +01:00
def isa_incremental_search(search_terms: str) -> list[dict[str, str]]:
    """Incremental search.

    Find items that have a P1282 claim and whose English label contains
    search_terms (case-insensitive) and is shorter than 20 characters.

    Returns a list of dicts with the keys ``qid`` and ``label``.
    """
    en_label = sqlalchemy.func.jsonb_extract_path_text(model.Item.labels, "en", "value")

    # Escape the LIKE wildcards so that "%" and "_" typed by the user are
    # matched literally instead of matching arbitrary characters.
    escape = "/"
    literal_terms = (
        search_terms.replace(escape, escape * 2)
        .replace("%", escape + "%")
        .replace("_", escape + "_")
    )

    q = model.Item.query.filter(
        model.Item.claims.has_key("P1282"),
        en_label.ilike(f"%{literal_terms}%", escape=escape),
        sqlalchemy.func.length(en_label) < 20,
    )

    # Debug aid:
    # print(q.statement.compile(compile_kwargs={"literal_binds": True}))

    return [
        {
            "qid": item.qid,
            "label": item.label(),
        }
        for item in q
    ]
2022-04-18 12:24:16 +01:00
2023-05-13 20:57:58 +01:00
2023-05-14 22:04:26 +01:00
class PlaceItems(typing.TypedDict):
    """Items found within a place, together with how many there are."""

    # Number of items.
    count: int
    # One dict of item attributes per item.
    items: list[dict[str, typing.Any]]
2023-05-14 22:04:26 +01:00
2023-05-15 16:30:06 +01:00
def get_place_items(osm_type: str, osm_id: int) -> PlaceItems:
    """Return place items for given osm_type and osm_id.

    Selects the items that have a location covered by the polygon whose
    ``src_id`` matches.  The ``src_id`` is ``osm_id`` for a way and ``-osm_id``
    for a relation.

    Raises:
        KeyError: if osm_type is neither ``"way"`` nor ``"relation"``.
    """
    src_id = osm_id * {"way": 1, "relation": -1}[osm_type]

    q = (
        model.Item.query.join(model.ItemLocation)
        .join(
            model.Polygon,
            sqlalchemy.func.ST_Covers(model.Polygon.way, model.ItemLocation.location),
        )
        .filter(model.Polygon.src_id == src_id)
    )
    # Debug aid:
    # sql = q.statement.compile(compile_kwargs={"literal_binds": True})

    keys = ("item_id", "labels", "descriptions", "aliases", "sitelinks", "claims")
    items = [{key: getattr(item, key) for key in keys} for item in q]

    # Count the items actually returned instead of issuing a second COUNT
    # query: that query counts joined rows, which can exceed the number of
    # distinct items when an item has several locations inside the polygon.
    return {"count": len(items), "items": items}