diff --git a/config/default.py b/config/default.py new file mode 100644 index 0000000..28f5052 --- /dev/null +++ b/config/default.py @@ -0,0 +1,25 @@ +"""Sample config.""" + + +ID_TAGGING_SCHEMA_DIR = "/var/lib/data/id-tagging-schema" +ID_PRESET_DIR = "/var/lib/data/id-tagging-schema/data/presets/" +GEOIP_DATA = "/var/lib/data/GeoIP/GeoIPCity.dat" +GEOLITE2 = "/var/lib/data/GeoLite2/GeoLite2-City.mmdb" + +CLIENT_KEY = "" +CLIENT_SECRET = "" + +SECRET_KEY = "" + +DEFAULT_COMMENT = "+wikidata" + +ADMIN_NAME = "" +ADMIN_EMAIL = "" +ADMINS = [ADMIN_EMAIL] + +SMTP_HOST = "localhost" +MAIL_FROM = "osm-wikidata@localhost" + +ERROR_MAIL = True + +PROPAGATE_EXCEPTIONS = False diff --git a/matcher/__init__.py b/matcher/__init__.py new file mode 100644 index 0000000..5e9706c --- /dev/null +++ b/matcher/__init__.py @@ -0,0 +1,13 @@ +"""Match OSM and Wikidata items.""" + + +CallParams = dict[str, str | int] + +user_agent = ( + "osm-wikidata/0.1 (https://github.com/EdwardBetts/osm-wikidata; edward@4angle.com)" +) + + +def user_agent_headers() -> dict[str, str]: + """User-Agent headers.""" + return {"User-Agent": user_agent} diff --git a/matcher/api.py b/matcher/api.py index e2beb7c..2f8adda 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -1,19 +1,21 @@ -from sqlalchemy import func, or_, and_, text +import json +import os.path +import re +from collections import Counter, defaultdict + +from flask import current_app, g +from sqlalchemy import and_, func, or_, text +from sqlalchemy.dialects import postgresql from sqlalchemy.orm import selectinload from sqlalchemy.sql import select -from sqlalchemy.sql.expression import literal, union, cast, column +from sqlalchemy.sql.expression import cast, column, literal, union from sqlalchemy.types import Float -from sqlalchemy.dialects import postgresql -from matcher.planet import point, line, polygon -from matcher import model, database, wikidata_api, wikidata -from collections import Counter, defaultdict -from flask import g, current_app -import re -import os.path -import json + +from matcher import database, model, wikidata, wikidata_api +from matcher.planet import line, point, polygon srid = 4326 -re_point = re.compile(r'^POINT\((.+) (.+)\)$') +re_point = re.compile(r"^POINT\((.+) (.+)\)$") entity_keys = {"labels", "sitelinks", "aliases", "claims", "descriptions", "lastrevid"} tag_prefixes = { @@ -37,9 +39,9 @@ skip_tags = { "Key:brand", } + def get_country_iso3166_1(lat, lon): - """ - For a given lat/lon return a set of ISO country codes. + """For a given lat/lon return a set of ISO country codes. Also cache the country code in the global object. @@ -48,8 +50,9 @@ def get_country_iso3166_1(lat, lon): point = func.ST_SetSRID(func.ST_MakePoint(lon, lat), srid) alpha2_codes = set() - q = model.Polygon.query.filter(func.ST_Covers(model.Polygon.way, point), - model.Polygon.admin_level == "2") + q = model.Polygon.query.filter( + func.ST_Covers(model.Polygon.way, point), model.Polygon.admin_level == "2" + ) for country in q: alpha2 = country.tags.get("ISO3166-1") if not alpha2: @@ -60,22 +63,23 @@ def get_country_iso3166_1(lat, lon): return alpha2_codes -def is_street_number_first(lat, lon): +def is_street_number_first(lat: float, lon: float) -> bool: + """Is lat/lon within a country that puts number first in a street address.""" if lat is None or lon is None: return True alpha2 = get_country_iso3166_1(lat, lon) # Incomplete list of countries that put street number first. alpha2_number_first = { - 'GB', # United Kingdom - 'IE', # Ireland - 'US', # United States - 'MX', # Mexico - 'CA', # Canada - 'FR', # France - 'AU', # Australia - 'NZ', # New Zealand - 'ZA', # South Africa + "GB", # United Kingdom + "IE", # Ireland + "US", # United States + "MX", # Mexico + "CA", # Canada + "FR", # France + "AU", # Australia + "NZ", # New Zealand + "ZA", # South Africa } return bool(alpha2_number_first & alpha2) @@ -84,22 +88,26 @@ def is_street_number_first(lat, lon): def make_envelope(bounds): return func.ST_MakeEnvelope(*bounds, srid) + def get_bbox_centroid(bbox): bbox = make_envelope(bbox) centroid = database.session.query(func.ST_AsText(func.ST_Centroid(bbox))).scalar() return reversed(re_point.match(centroid).groups()) + def make_envelope_around_point(lat, lon, distance): conn = database.session.connection() p = func.ST_MakePoint(lon, lat) - s = select([ - func.ST_AsText(func.ST_Project(p, distance, func.radians(0))), - func.ST_AsText(func.ST_Project(p, distance, func.radians(90))), - func.ST_AsText(func.ST_Project(p, distance, func.radians(180))), - func.ST_AsText(func.ST_Project(p, distance, func.radians(270))), - ]) + s = select( + [ + func.ST_AsText(func.ST_Project(p, distance, func.radians(0))), + func.ST_AsText(func.ST_Project(p, distance, func.radians(90))), + func.ST_AsText(func.ST_Project(p, distance, func.radians(180))), + func.ST_AsText(func.ST_Project(p, distance, func.radians(270))), + ] + ) row = conn.execute(s).fetchone() coords = [[float(v) for v in re_point.match(i).groups()] for i in row] @@ -110,49 +118,64 @@ def make_envelope_around_point(lat, lon, distance): return func.ST_MakeEnvelope(west, south, east, north, srid) -def drop_way_area(tags): - """ Remove the way_area field from a tags dict. """ + +def drop_way_area(tags: dict[str, str]) -> dict[str, str]: + """Remove the way_area field from a tags dict.""" if "way_area" in tags: del tags["way_area"] return tags + def get_part_of(table_name, src_id, bbox): - table_map = {'point': point, 'line': line, 'polygon': polygon} + table_map = {"point": point, "line": line, "polygon": polygon} table_alias = table_map[table_name].alias() - s = (select([polygon.c.osm_id, - polygon.c.tags, - func.ST_Area(func.ST_Collect(polygon.c.way))]). - where(and_(func.ST_Intersects(bbox, polygon.c.way), - func.ST_Covers(polygon.c.way, table_alias.c.way), - table_alias.c.osm_id == src_id, - polygon.c.tags.has_key("name"), - or_( - polygon.c.tags.has_key("landuse"), - polygon.c.tags.has_key("amenity"), - ))). - group_by(polygon.c.osm_id, polygon.c.tags)) + s = ( + select( + [ + polygon.c.osm_id, + polygon.c.tags, + func.ST_Area(func.ST_Collect(polygon.c.way)), + ] + ) + .where( + and_( + func.ST_Intersects(bbox, polygon.c.way), + func.ST_Covers(polygon.c.way, table_alias.c.way), + table_alias.c.osm_id == src_id, + polygon.c.tags.has_key("name"), + or_( + polygon.c.tags.has_key("landuse"), + polygon.c.tags.has_key("amenity"), + ), + ) + ) + .group_by(polygon.c.osm_id, polygon.c.tags) + ) conn = database.session.connection() - return [{ - "type": "way" if osm_id > 0 else "relation", - "id": abs(osm_id), - "tags": tags, - "area": area, - } for osm_id, tags, area in conn.execute(s)] + return [ + { + "type": "way" if osm_id > 0 else "relation", + "id": abs(osm_id), + "tags": tags, + "area": area, + } + for osm_id, tags, area in conn.execute(s) + ] -def get_and_save_item(qid): - """ Download an item from Wikidata and cache it in the database. """ +def get_and_save_item(qid: str) -> model.Item | None: + """Download an item from Wikidata and cache it in the database.""" entity = wikidata_api.get_entity(qid) entity_qid = entity["id"] if entity_qid != qid: - print(f'redirect {qid} -> {entity_qid}') + print(f"redirect {qid} -> {entity_qid}") item = model.Item.query.get(entity_qid[1:]) return item if "claims" not in entity: - return + return None coords = wikidata.get_entity_coords(entity["claims"]) item_id = int(qid[1:]) @@ -171,8 +194,9 @@ def get_and_save_item(qid): return item -def get_isa_count(items): - isa_count = Counter() +def get_isa_count(items: list[model.Item]) -> list[tuple[int, int]]: + """List of IsA counts.""" + isa_count: Counter[int] = Counter() for item in items: if not item: continue @@ -199,13 +223,11 @@ def get_items_in_bbox(bbox): def get_osm_with_wikidata_tag(bbox, isa_filter=None): - bbox_str = ','.join(str(v) for v in bbox) + bbox_str = ",".join(str(v) for v in bbox) extra_sql = "" if isa_filter: - q = ( - model.Item.query.join(model.ItemLocation) - .filter(func.ST_Covers(make_envelope(bbox), - model.ItemLocation.location)) + q = model.Item.query.join(model.ItemLocation).filter( + func.ST_Covers(make_envelope(bbox), model.ItemLocation.location) ) q = add_isa_filter(q, isa_filter) qids = [isa.qid for isa in q] @@ -216,7 +238,8 @@ def get_osm_with_wikidata_tag(bbox, isa_filter=None): extra_sql += f" AND tags -> 'wikidata' in ({qid_list})" # easier than building this query with SQLAlchemy - sql = f''' + sql = ( + f""" SELECT tbl, osm_id, tags, ARRAY[ST_Y(centroid), ST_X(centroid)], geojson FROM ( SELECT 'point' as tbl, osm_id, tags, ST_AsText(ST_Centroid(way)) as centroid, ST_AsGeoJSON(way) as geojson @@ -235,24 +258,29 @@ UNION HAVING st_area(st_collect(way)) < 20 * st_area(ST_MakeEnvelope({bbox_str}, {srid})) ) as anon WHERE tags ? 'wikidata' -''' + extra_sql +""" + + extra_sql + ) conn = database.session.connection() result = conn.execute(text(sql)) print(sql) - point_sql = f''' + point_sql = ( + f""" SELECT 'point' as tbl, osm_id, tags, ST_AsText(ST_Centroid(way)) as centroid, ST_AsGeoJSON(way) as geojson FROM planet_osm_point WHERE ST_Intersects(ST_MakeEnvelope({bbox_str}, {srid}), way) and tags ? 'wikidata' -''' + extra_sql +""" + + extra_sql + ) print("point") print(point_sql) tagged = [] for tbl, osm_id, tags, centroid, geojson in result: - if tbl == 'point': + if tbl == "point": osm_type = "node" else: osm_type = "way" if osm_id > 0 else "relation" @@ -260,15 +288,17 @@ WHERE tags ? 'wikidata' name = tags.get("name") or tags.get("addr:housename") or "[no label]" - tagged.append({ - "identifier": f"{osm_type}/{osm_id}", - "id": osm_id, - "type": osm_type, - "geojson": json.loads(geojson), - "centroid": centroid, - "name": name, - "wikidata": tags["wikidata"], - }) + tagged.append( + { + "identifier": f"{osm_type}/{osm_id}", + "id": osm_id, + "type": osm_type, + "geojson": json.loads(geojson), + "centroid": centroid, + "name": name, + "wikidata": tags["wikidata"], + } + ) return tagged @@ -310,11 +340,13 @@ def get_item_tags(item): isa, isa_path = isa_items.pop() if not isa: continue - isa_path = isa_path + [{'qid': isa.qid, 'label': isa.label()}] + isa_path = isa_path + [{"qid": isa.qid, "label": isa.label()}] osm = [v for v in isa.get_claim("P1282") if v not in skip_tags] - osm += [extra.tag_or_key - for extra in model.ItemExtraKeys.query.filter_by(item_id=isa.item_id)] + osm += [ + extra.tag_or_key + for extra in model.ItemExtraKeys.query.filter_by(item_id=isa.item_id) + ] for i in osm: osm_list[i].append(isa_path[:]) @@ -369,14 +401,16 @@ def get_tags_for_isa_item(item): isa, isa_path = isa_items.pop() if not isa: continue - isa_path = isa_path + [{'qid': isa.qid, 'label': isa.label()}] + isa_path = isa_path + [{"qid": isa.qid, "label": isa.label()}] if isa.item_id not in items_checked_done: - items_checked.append({'qid': isa.qid, 'label': isa.label()}) + items_checked.append({"qid": isa.qid, "label": isa.label()}) items_checked_done.add(isa.item_id) osm = [v for v in isa.get_claim("P1282") if v not in skip_tags] - osm += [extra.tag_or_key - for extra in model.ItemExtraKeys.query.filter_by(item_id=isa.item_id)] + osm += [ + extra.tag_or_key + for extra in model.ItemExtraKeys.query.filter_by(item_id=isa.item_id) + ] for i in osm: osm_list[i].append(isa_path[:]) @@ -403,34 +437,31 @@ def get_tags_for_isa_item(item): seen.update(isa_list) isa_items += [(isa, isa_path) for isa in get_items(isa_list)] return { - 'tags': {key: list(values) for key, values in osm_list.items()}, - 'checked': items_checked, + "tags": {key: list(values) for key, values in osm_list.items()}, + "checked": items_checked, } def add_isa_filter(q, isa_qids): - q_subclass = database.session.query(model.Item.qid).filter( func.jsonb_path_query_array( model.Item.claims, - '$.P279[*].mainsnak.datavalue.value.id', - ).bool_op('?|')(list(isa_qids)) + "$.P279[*].mainsnak.datavalue.value.id", + ).bool_op("?|")(list(isa_qids)) ) subclass_qid = {qid for qid, in q_subclass.all()} isa = func.jsonb_path_query_array( model.Item.claims, - '$.P31[*].mainsnak.datavalue.value.id', - ).bool_op('?|') + "$.P31[*].mainsnak.datavalue.value.id", + ).bool_op("?|") return q.filter(isa(list(isa_qids | subclass_qid))) def wikidata_items_count(bounds, isa_filter=None): - - q = ( - model.Item.query.join(model.ItemLocation) - .filter(func.ST_Covers(make_envelope(bounds), model.ItemLocation.location)) + q = model.Item.query.join(model.ItemLocation).filter( + func.ST_Covers(make_envelope(bounds), model.ItemLocation.location) ) if isa_filter: @@ -440,12 +471,12 @@ def wikidata_items_count(bounds, isa_filter=None): return q.count() + def wikidata_isa_counts(bounds, isa_filter=None): db_bbox = make_envelope(bounds) - q = ( - model.Item.query.join(model.ItemLocation) - .filter(func.ST_Covers(db_bbox, model.ItemLocation.location)) + q = model.Item.query.join(model.ItemLocation).filter( + func.ST_Covers(db_bbox, model.ItemLocation.location) ) if isa_filter: @@ -474,12 +505,13 @@ def wikidata_isa_counts(bounds, isa_filter=None): return isa_count + def get_tag_filter(tags, tag_list): tag_filter = [] for tag_or_key in tag_list: if tag_or_key.startswith("Key:"): key = tag_or_key[4:] - tag_filter.append(and_(tags.has_key(key), tags[key] != 'no')) + tag_filter.append(and_(tags.has_key(key), tags[key] != "no")) for prefix in tag_prefixes: tag_filter.append(tags.has_key(f"{prefix}:{key}")) @@ -495,11 +527,11 @@ def get_tag_filter(tags, tag_list): def get_preset_translations(): app = current_app country_language = { - 'AU': 'en-AU', # Australia - 'GB': 'en-GB', # United Kingdom - 'IE': 'en-GB', # Ireland - 'IN': 'en-IN', # India - 'NZ': 'en-NZ', # New Zealand + "AU": "en-AU", # Australia + "GB": "en-GB", # United Kingdom + "IE": "en-GB", # Ireland + "IN": "en-IN", # India + "NZ": "en-NZ", # New Zealand } ts_dir = app.config["ID_TAGGING_SCHEMA_DIR"] translation_dir = os.path.join(ts_dir, "dist", "translations") @@ -520,13 +552,14 @@ def get_preset_translations(): return {} + def get_presets_from_tags(ending, tags): translations = get_preset_translations() found = [] for k, v in tags.items(): - if k == 'amenity' and v == 'clock' and tags.get('display') == 'sundial': + if k == "amenity" and v == "clock" and tags.get("display") == "sundial": tag_or_key = f"Tag:{k}={v}" found.append({"tag_or_key": tag_or_key, "name": "Sundial"}) continue @@ -604,8 +637,7 @@ def address_node_label(tags): def get_address_nodes_within_building(osm_id, bbox_list): q = model.Point.query.filter( polygon.c.osm_id == osm_id, - or_(*[func.ST_Intersects(bbox, model.Point.way) - for bbox in bbox_list]), + or_(*[func.ST_Intersects(bbox, model.Point.way) for bbox in bbox_list]), func.ST_Covers(polygon.c.way, model.Point.way), model.Point.tags.has_key("addr:street"), model.Point.tags.has_key("addr:housenumber"), @@ -615,8 +647,14 @@ def get_address_nodes_within_building(osm_id, bbox_list): def osm_display_name(tags): - keys = ("bridge:name", "tunnel:name", "lock_name", "name", "addr:housename", - "inscription") + keys = ( + "bridge:name", + "tunnel:name", + "lock_name", + "name", + "addr:housename", + "inscription", + ) for key in keys: if key in tags: return tags[key] @@ -625,6 +663,7 @@ def osm_display_name(tags): def street_address_in_tags(tags): return "addr:housenumber" in tags and "addr:street" in tags + def find_osm_candidates(item, limit=80, max_distance=450, names=None): item_id = item.item_id item_is_linear_feature = item.is_linear_feature() @@ -637,51 +676,94 @@ def find_osm_candidates(item, limit=80, max_distance=450, names=None): check_is_street_number_first(item.locations[0].get_lat_lon()) - bbox_list = [make_envelope_around_point(*loc.get_lat_lon(), max_distance) - for loc in item.locations] + bbox_list = [ + make_envelope_around_point(*loc.get_lat_lon(), max_distance) + for loc in item.locations + ] null_area = cast(None, Float) - dist = column('dist') - tags = column('tags', postgresql.HSTORE) + dist = column("dist") + tags = column("tags", postgresql.HSTORE) tag_list = get_item_tags(item) # tag_filters = get_tag_filter(point.c.tags, tag_list) # print(tag_filters) - s_point = (select([literal('point').label('t'), point.c.osm_id, point.c.tags.label('tags'), - func.min(func.ST_DistanceSphere(model.ItemLocation.location, point.c.way)).label('dist'), - func.ST_AsText(point.c.way), - func.ST_AsGeoJSON(point.c.way), - null_area]). - where(and_( - or_(*[func.ST_Intersects(bbox, point.c.way) - for bbox in bbox_list]), - model.ItemLocation.item_id == item_id, - or_(*get_tag_filter(point.c.tags, tag_list)))). - group_by(point.c.osm_id, point.c.tags, point.c.way)) + s_point = ( + select( + [ + literal("point").label("t"), + point.c.osm_id, + point.c.tags.label("tags"), + func.min( + func.ST_DistanceSphere(model.ItemLocation.location, point.c.way) + ).label("dist"), + func.ST_AsText(point.c.way), + func.ST_AsGeoJSON(point.c.way), + null_area, + ] + ) + .where( + and_( + or_(*[func.ST_Intersects(bbox, point.c.way) for bbox in bbox_list]), + model.ItemLocation.item_id == item_id, + or_(*get_tag_filter(point.c.tags, tag_list)), + ) + ) + .group_by(point.c.osm_id, point.c.tags, point.c.way) + ) - s_line = (select([literal('line').label('t'), line.c.osm_id, line.c.tags.label('tags'), - func.min(func.ST_DistanceSphere(model.ItemLocation.location, line.c.way)).label('dist'), - func.ST_AsText(func.ST_Centroid(func.ST_Collect(line.c.way))), - func.ST_AsGeoJSON(func.ST_Collect(line.c.way)), - null_area]). - where(and_( - or_(*[func.ST_Intersects(bbox, line.c.way) for bbox in bbox_list]), - model.ItemLocation.item_id == item_id, - or_(*get_tag_filter(line.c.tags, tag_list)))). - group_by(line.c.osm_id, line.c.tags)) + s_line = ( + select( + [ + literal("line").label("t"), + line.c.osm_id, + line.c.tags.label("tags"), + func.min( + func.ST_DistanceSphere(model.ItemLocation.location, line.c.way) + ).label("dist"), + func.ST_AsText(func.ST_Centroid(func.ST_Collect(line.c.way))), + func.ST_AsGeoJSON(func.ST_Collect(line.c.way)), + null_area, + ] + ) + .where( + and_( + or_(*[func.ST_Intersects(bbox, line.c.way) for bbox in bbox_list]), + model.ItemLocation.item_id == item_id, + or_(*get_tag_filter(line.c.tags, tag_list)), + ) + ) + .group_by(line.c.osm_id, line.c.tags) + ) - s_polygon = (select([literal('polygon').label('t'), polygon.c.osm_id, polygon.c.tags.label('tags'), - func.min(func.ST_DistanceSphere(model.ItemLocation.location, polygon.c.way)).label('dist'), - func.ST_AsText(func.ST_Centroid(func.ST_Collect(polygon.c.way))), - func.ST_AsGeoJSON(func.ST_Collect(polygon.c.way)), - func.ST_Area(func.ST_Collect(polygon.c.way))]). - where(and_( - or_(*[func.ST_Intersects(bbox, polygon.c.way) for bbox in bbox_list]), - model.ItemLocation.item_id == item_id, - or_(*get_tag_filter(polygon.c.tags, tag_list)))). - group_by(polygon.c.osm_id, polygon.c.tags). - having(func.ST_Area(func.ST_Collect(polygon.c.way)) < 20 * func.ST_Area(bbox_list[0]))) + s_polygon = ( + select( + [ + literal("polygon").label("t"), + polygon.c.osm_id, + polygon.c.tags.label("tags"), + func.min( + func.ST_DistanceSphere(model.ItemLocation.location, polygon.c.way) + ).label("dist"), + func.ST_AsText(func.ST_Centroid(func.ST_Collect(polygon.c.way))), + func.ST_AsGeoJSON(func.ST_Collect(polygon.c.way)), + func.ST_Area(func.ST_Collect(polygon.c.way)), + ] + ) + .where( + and_( + or_(*[func.ST_Intersects(bbox, polygon.c.way) for bbox in bbox_list]), + model.ItemLocation.item_id == item_id, + or_(*get_tag_filter(polygon.c.tags, tag_list)), + ) + ) + .group_by(polygon.c.osm_id, polygon.c.tags) + .having( + func.ST_Area(func.ST_Collect(polygon.c.way)) + < 20 * func.ST_Area(bbox_list[0]) + ) + ) tables = ([] if item_is_linear_feature else [s_point]) + [s_line, s_polygon] s = select([union(*tables).alias()]).where(dist < max_distance).order_by(dist) @@ -695,10 +777,14 @@ def find_osm_candidates(item, limit=80, max_distance=450, names=None): s = s.where(tags.has_key("name")) if "Key:amenity" in tag_list: - s = s.where(and_(tags["amenity"] != "bicycle_parking", - tags["amenity"] != "bicycle_repair_station", - tags["amenity"] != "atm", - tags["amenity"] != "recycling")) + s = s.where( + and_( + tags["amenity"] != "bicycle_parking", + tags["amenity"] != "bicycle_repair_station", + tags["amenity"] != "atm", + tags["amenity"] != "recycling", + ) + ) if limit: s = s.limit(limit) @@ -730,6 +816,8 @@ def find_osm_candidates(item, limit=80, max_distance=450, names=None): shape = "area" if table == "polygon" else table + item_identifier_tags = item.get_identifiers_tags() + cur = { "identifier": f"{osm_type}/{osm_id}", "type": osm_type, @@ -748,8 +836,9 @@ def find_osm_candidates(item, limit=80, max_distance=450, names=None): part_of = [] for bbox in bbox_list: - part_of += [i for i in get_part_of(table, src_id, bbox) - if i["tags"]["name"] != name] + part_of += [ + i for i in get_part_of(table, src_id, bbox) if i["tags"]["name"] != name + ] if part_of: cur["part_of"] = part_of @@ -760,9 +849,9 @@ def find_osm_candidates(item, limit=80, max_distance=450, names=None): return nearby -def get_item(item_id): - """ Retrieve a Wikidata item, either from the database or from Wikidata. """ +def get_item(item_id): + """Retrieve a Wikidata item, either from the database or from Wikidata.""" item = model.Item.query.get(item_id) return item or get_and_save_item(f"Q{item_id}") @@ -774,7 +863,7 @@ def get_item_street_addresses(item): for claim in item.claims["P669"]: qualifiers = claim.get("qualifiers") - if not qualifiers or 'P670' not in qualifiers: + if not qualifiers or "P670" not in qualifiers: continue number = qualifiers["P670"][0]["datavalue"]["value"] @@ -782,24 +871,26 @@ def get_item_street_addresses(item): street = street_item.label() for q in qualifiers["P670"]: number = q["datavalue"]["value"] - address = (f"{number} {street}" - if g.street_number_first - else f"{street} {number}") + address = ( + f"{number} {street}" if g.street_number_first else f"{street} {number}" + ) street_address.append(address) return street_address + def check_is_street_number_first(latlng): g.street_number_first = is_street_number_first(*latlng) + def item_detail(item): unsupported_relation_types = { - 'Q194356', # wind farm - 'Q2175765', # tram stop + "Q194356", # wind farm + "Q2175765", # tram stop } locations = [list(i.get_lat_lon()) for i in item.locations] - if not hasattr(g, 'street_number_first'): + if not hasattr(g, "street_number_first"): g.street_number_first = is_street_number_first(*locations[0]) image_filenames = item.get_claim("P18") @@ -809,20 +900,24 @@ def item_detail(item): heritage_designation = [] for v in item.get_claim("P1435"): if not v: - print('heritage designation missing:', item.qid) + print("heritage designation missing:", item.qid) continue heritage_designation_item = get_item(v["numeric-id"]) - heritage_designation.append({ - "qid": v["id"], - "label": heritage_designation_item.label(), - }) + heritage_designation.append( + { + "qid": v["id"], + "label": heritage_designation_item.label(), + } + ) isa_items = [get_item(isa["numeric-id"]) for isa in item.get_isa()] isa_lookup = {isa.qid: isa for isa in isa_items} - wikipedia_links = [{"lang": site[:-4], "title": link["title"]} - for site, link in sorted(item.sitelinks.items()) - if site.endswith("wiki") and len(site) < 8] + wikipedia_links = [ + {"lang": site[:-4], "title": link["title"]} + for site, link in sorted(item.sitelinks.items()) + if site.endswith("wiki") and len(site) < 8 + ] d = { "qid": item.qid, @@ -831,7 +926,9 @@ def item_detail(item): "markers": locations, "image_list": image_filenames, "street_address": street_address, - "isa_list": [{"qid": isa.qid, "label": isa.label()} for isa in isa_items if isa], + "isa_list": [ + {"qid": isa.qid, "label": isa.label()} for isa in isa_items if isa + ], "closed": item.closed(), "inception": item.time_claim("P571"), "p1619": item.time_claim("P1619"), @@ -849,8 +946,9 @@ def item_detail(item): unsupported = isa_lookup.keys() & unsupported_relation_types if unsupported: - d["unsupported_relation_types"] = [isa for isa in d["isa_list"] - if isa["qid"] in isa_lookup] + d["unsupported_relation_types"] = [ + isa for isa in d["isa_list"] if isa["qid"] in isa_lookup + ] return d @@ -889,7 +987,7 @@ def wikidata_items(bounds, isa_filter=None): } isa_count.append(isa) - return {'items': items, 'isa_count': isa_count} + return {"items": items, "isa_count": isa_count} def missing_wikidata_items(qids, lat, lon): @@ -924,12 +1022,13 @@ def missing_wikidata_items(qids, lat, lon): return dict(items=items, isa_count=isa_count) + def isa_incremental_search(search_terms): en_label = func.jsonb_extract_path_text(model.Item.labels, "en", "value") q = model.Item.query.filter( - model.Item.claims.has_key("P1282"), - en_label.ilike(f"%{search_terms}%"), - func.length(en_label) < 20, + model.Item.claims.has_key("P1282"), + en_label.ilike(f"%{search_terms}%"), + func.length(en_label) < 20, ) print(q.statement.compile(compile_kwargs={"literal_binds": True})) @@ -943,13 +1042,18 @@ def isa_incremental_search(search_terms): ret.append(cur) return ret -def get_place_items(osm_type, osm_id): - src_id = osm_id * {'way': 1, 'relation': -1}[osm_type] - q = (model.Item.query - .join(model.ItemLocation) - .join(model.Polygon, func.ST_Covers(model.Polygon.way, model.ItemLocation.location)) - .filter(model.Polygon.src_id == src_id)) +def get_place_items(osm_type, osm_id): + src_id = osm_id * {"way": 1, "relation": -1}[osm_type] + + q = ( + model.Item.query.join(model.ItemLocation) + .join( + model.Polygon, + func.ST_Covers(model.Polygon.way, model.ItemLocation.location), + ) + .filter(model.Polygon.src_id == src_id) + ) # sql = q.statement.compile(compile_kwargs={"literal_binds": True}) item_count = q.count() diff --git a/matcher/commons.py b/matcher/commons.py index b65658c..8c537e9 100644 --- a/matcher/commons.py +++ b/matcher/commons.py @@ -1,18 +1,25 @@ -import requests +"""Use mediawiki API to look up images on Wikimedia Commons.""" + import urllib.parse -from . import utils +from typing import Any + +import requests + +from . import CallParams, utils commons_start = "http://commons.wikimedia.org/wiki/Special:FilePath/" commons_url = "https://www.wikidata.org/w/api.php" page_size = 50 -def commons_uri_to_filename(uri): +def commons_uri_to_filename(uri: str) -> str: + """Given the URI for a file on commons return the filename of the file.""" return urllib.parse.unquote(utils.drop_start(uri, commons_start)) -def api_call(params): - call_params = { +def api_call(params: CallParams) -> requests.Response: + """Call the Commons API.""" + call_params: CallParams = { "format": "json", "formatversion": 2, **params, @@ -21,8 +28,11 @@ def api_call(params): return requests.get(commons_url, params=call_params, timeout=5) -def image_detail(filenames, thumbheight=None, thumbwidth=None): - params = { +def image_detail( + filenames: list[str], thumbheight: int | None = None, thumbwidth: int | None = None +) -> dict[str, Any]: + """Detail for multiple images.""" + params: CallParams = { "action": "query", "prop": "imageinfo", "iiprop": "url", @@ -32,7 +42,7 @@ def image_detail(filenames, thumbheight=None, thumbwidth=None): if thumbwidth is not None: params["iiurlwidth"] = thumbwidth - images = {} + images: dict[str, Any] = {} for cur in utils.chunk(filenames, page_size): call_params = params.copy() diff --git a/matcher/database.py b/matcher/database.py new file mode 100644 index 0000000..e2e7398 --- /dev/null +++ b/matcher/database.py @@ -0,0 +1,39 @@ +"""Database functions.""" + +import flask +import sqlalchemy +from sqlalchemy import create_engine, func +from sqlalchemy.engine import reflection +from sqlalchemy.orm import scoped_session, sessionmaker + +session: sqlalchemy.orm.scoping.scoped_session = scoped_session(sessionmaker()) + + +def init_db(db_url: str, echo: bool = False) -> None: + """Initialise database.""" + session.configure(bind=get_engine(db_url, echo=echo)) + + +def get_engine(db_url: str, echo: bool = False) -> sqlalchemy.engine.base.Engine: + """Create an engine objcet.""" + return create_engine(db_url, pool_recycle=3600, echo=echo) + + +def get_tables() -> list[str]: + """Get a list of table names.""" + tables: list[str] = reflection.Inspector.from_engine(session.bind).get_table_names() + return tables + + +def init_app(app: flask.app.Flask, echo: bool = False) -> None: + """Initialise database connection within flask app.""" + db_url = app.config["DB_URL"] + session.configure(bind=get_engine(db_url, echo=echo)) + + @app.teardown_appcontext + def shutdown_session(exception: Exception | None = None) -> None: + session.remove() + + +def now_utc(): + return func.timezone("utc", func.now()) diff --git a/matcher/edit.py b/matcher/edit.py new file mode 100644 index 0000000..664b408 --- /dev/null +++ b/matcher/edit.py @@ -0,0 +1,71 @@ +from flask import g +from . import user_agent_headers, database, osm_oauth, mail +from .model import Changeset +import requests +import html + +really_save = True +osm_api_base = "https://api.openstreetmap.org/api/0.6" + + +def new_changeset(comment): + return f""" + + + + + +""" + + +def osm_request(path, **kwargs): + return osm_oauth.api_put_request(path, **kwargs) + + +def create_changeset(changeset): + try: + return osm_request("/changeset/create", data=changeset.encode("utf-8")) + except requests.exceptions.HTTPError as r: + print(changeset) + print(r.response.text) + raise + + +def close_changeset(changeset_id): + return osm_request(f"/changeset/{changeset_id}/close") + + +def save_element(osm_type, osm_id, element_data): + osm_path = f"/{osm_type}/{osm_id}" + r = osm_request(osm_path, data=element_data) + reply = r.text.strip() + if reply.isdigit(): + return r + + subject = f"matcher error saving element: {osm_path}" + username = g.user.username + body = f""" +https://www.openstreetmap.org{osm_path} + +user: {username} +message user: https://www.openstreetmap.org/message/new/{username} + +error: +{reply} +""" + + mail.send_mail(subject, body) + + +def record_changeset(**kwargs): + change = Changeset(created=database.now_utc(), **kwargs) + + database.session.add(change) + database.session.commit() + + return change + + +def get_existing(osm_type, osm_id): + url = f"{osm_api_base}/{osm_type}/{osm_id}" + return requests.get(url, headers=user_agent_headers()) diff --git a/matcher/error_mail.py b/matcher/error_mail.py index a7a6563..6e8c888 100644 --- a/matcher/error_mail.py +++ b/matcher/error_mail.py @@ -1,13 +1,19 @@ +"""Send mail to admins when there is an error.""" + import logging -from logging.handlers import SMTPHandler from logging import Formatter -from flask import request +from logging.handlers import SMTPHandler + +import flask PROJECT = "osm-wikidata" class MatcherSMTPHandler(SMTPHandler): - def getSubject(self, record): # noqa: N802 + """Custom SMTP handler to change subject line.""" + + def getSubject(self, record: logging.LogRecord) -> str: # noqa: N802 + """Return subject line for error mail.""" return ( f"{PROJECT} error: {record.exc_info[0].__name__}" if (record.exc_info and record.exc_info[0]) @@ -16,12 +22,16 @@ class MatcherSMTPHandler(SMTPHandler): class RequestFormatter(Formatter): - def format(self, record): - record.request = request + """Custom request formatter.""" + + def format(self, record: logging.LogRecord) -> str: + """Add request to log record.""" + record.request = flask.request return super().format(record) -def setup_error_mail(app): +def setup_error_mail(app: flask.Flask) -> None: + """Configure logging to catch errors and email them.""" if not app.config.get("ERROR_MAIL"): return formatter = RequestFormatter( diff --git a/matcher/planet.py b/matcher/planet.py index 57f7b44..15b8016 100644 --- a/matcher/planet.py +++ b/matcher/planet.py @@ -1,24 +1,32 @@ -from sqlalchemy import Table, Column, Integer, String, Float, MetaData -from sqlalchemy.dialects import postgresql +"""Planet tables.""" + from geoalchemy2 import Geometry +from sqlalchemy import Column, Float, Integer, MetaData, String, Table +from sqlalchemy.dialects import postgresql metadata = MetaData() -point = Table("planet_osm_point", metadata, +point = Table( + "planet_osm_point", + metadata, Column("osm_id", Integer), Column("name", String), Column("tags", postgresql.HSTORE), Column("way", Geometry("GEOMETRY", srid=4326, spatial_index=True), nullable=False), ) -line = Table("planet_osm_line", metadata, +line = Table( + "planet_osm_line", + metadata, Column("osm_id", Integer), Column("name", String), Column("tags", postgresql.HSTORE), Column("way", Geometry("GEOMETRY", srid=4326, spatial_index=True), nullable=False), ) -polygon = Table("planet_osm_polygon", metadata, +polygon = Table( + "planet_osm_polygon", + metadata, Column("osm_id", Integer), Column("name", String), Column("tags", postgresql.HSTORE), diff --git a/matcher/utils.py b/matcher/utils.py index 0cace13..36bbe99 100644 --- a/matcher/utils.py +++ b/matcher/utils.py @@ -1,113 +1,139 @@ -from flask import current_app, request -from itertools import islice -import os.path +"""Utility functions.""" + import json import math -import user_agents +import os.path import re +import typing from datetime import date +from itertools import islice +from typing import Any, cast + +import flask +import user_agents from num2words import num2words metres_per_mile = 1609.344 feet_per_metre = 3.28084 feet_per_mile = 5280 +T = typing.TypeVar("T") -def chunk(it, size): + +def chunk(it: typing.Iterable[T], size: int) -> typing.Iterator[tuple[T, ...]]: + """Split an iterable into chunks of the given size.""" it = iter(it) return iter(lambda: tuple(islice(it, size)), ()) -def flatten(l): - return [item for sublist in l for item in sublist] +def flatten(top_list: list[list[T]]) -> list[T]: + """Flatten a list.""" + return [item for sub_list in top_list for item in sub_list] -def drop_start(s, start): +def drop_start(s: str, start: str) -> str: + """Remove string prefix, otherwise throw an error.""" assert s.startswith(start) return s[len(start) :] -def remove_start(s, start): +def remove_start(s: str, start: str) -> str: + """Remove a string prefix, if present.""" return s[len(start) :] if s.startswith(start) else s -def normalize_url(url): +def normalize_url(url: str) -> str: + """Standardize URLs to help in comparison.""" for start in "http://", "https://", "www.": url = remove_start(url, start) return url.rstrip("/") -def contains_digit(s): +def contains_digit(s: str) -> bool: + """Check if string contains a digit.""" return any(c.isdigit() for c in s) -def cache_dir(): - return current_app.config["CACHE_DIR"] +def cache_dir() -> str: + """Get cache dir location.""" + d: str = flask.current_app.config["CACHE_DIR"] + return d -def cache_filename(filename): +def cache_filename(filename: str) -> str: + """Get absolute path for cache file.""" return os.path.join(cache_dir(), filename) -def load_from_cache(filename): +def load_from_cache(filename: str) -> Any: + """Load JSON data from cache.""" return json.load(open(cache_filename(filename))) -def get_radius(default=1000): - arg_radius = request.args.get("radius") +def get_radius(default: int = 1000) -> int | None: + """Get radius request argument with default.""" + arg_radius = flask.request.args.get("radius") return int(arg_radius) if arg_radius and arg_radius.isdigit() else default -def get_int_arg(name): - if name in request.args and request.args[name].isdigit(): - return int(request.args[name]) +def get_int_arg(name: str) -> int | None: + """Get an request arg and convert to integer.""" + v = flask.request.args.get(name) + return int(v) if v and v.isdigit() else None -def calc_chunk_size(area_in_sq_km, size=22): +def calc_chunk_size(area_in_sq_km: float, size: int = 22) -> int: + """Work out the size of a chunk.""" side = math.sqrt(area_in_sq_km) return max(1, math.ceil(side / size)) -def file_missing_or_empty(filename): +def file_missing_or_empty(filename: str) -> bool: + """Check if a file is missing or empty.""" return os.path.exists(filename) or os.stat(filename).st_size == 0 -def is_bot(): - """ Is the current request from a web robot? """ - ua = request.headers.get("User-Agent") - return ua and user_agents.parse(ua).is_bot +def is_bot() -> bool: + """Is the current request from a web robot.""" + ua = flask.request.headers.get("User-Agent") + return bool(ua and user_agents.parse(ua).is_bot) -def log_location(): - return current_app.config["LOG_DIR"] +def log_location() -> str: + """Get log location from Flask config.""" + return cast(str, flask.current_app.config["LOG_DIR"]) -def good_location(): - return os.path.join(log_location(), "complete") - - -def capfirst(value): - """ Uppercase first letter of string, leave rest as is. """ +def capfirst(value: str) -> str: + """Uppercase first letter of string, leave rest as is.""" return value[0].upper() + value[1:] if value else value -def any_upper(value): +def any_upper(value: str) -> bool: + """Check if string contains any uppercase characters.""" return any(c.isupper() for c in value) -def find_log_file(place): - start = f"{place.place_id}_" - for f in os.scandir(good_location()): - if f.name.startswith(start): - return f.path - - -def get_free_space(config): +def get_free_space(config: flask.config.Config) -> int: + """Return the amount of available free space.""" s = os.statvfs(config["FREE_SPACE_PATH"]) return s.f_bsize * s.f_bavail -def display_distance(units, dist): +def metric_display_distance(units: str, dist: float) -> str | None: + """Convert distance from metres to the specified metric units.""" + if units == "km_and_metres": + units = "km" if dist > 500 else "metres" + if units == "metres": + return f"{dist:,.0f} m" + if units == "km": + return f"{dist / 1000:,.2f} km" + + return None + + +def display_distance(units: str, dist: float) -> str | None: + """Convert distance from metres to the specified units.""" if units in ("miles_and_feet", "miles_and_yards"): total_feet = dist * feet_per_metre miles = total_feet / feet_per_mile @@ -124,20 +150,15 @@ def display_distance(units, dist): miles = dist / metres_per_mile return f"{miles:,.2f} miles" if miles > 0.5 else f"{dist:,.0f} metres" - if units == "km_and_metres": - units = "km" if dist > 500 else "metres" - if units == "metres": - return f"{dist:,.0f} m" - if units == "km": - return f"{dist / 1000:,.2f} km" + return metric_display_distance(units, dist) -re_range = re.compile(r"\b(\d+) ?(?:to|-) ?(\d+)\b", re.I) -re_number_list = re.compile(r"\b([\d, ]+) (?:and|&) (\d+)\b", re.I) -re_number = re.compile(r"^(?:No\.?|Number)? ?(\d+)\b") +def is_in_range(address_range: str, address: str) -> bool: + """Check if an address is within a range.""" + re_range = re.compile(r"\b(\d+) ?(?:to|-) ?(\d+)\b", re.I) + re_number_list = re.compile(r"\b([\d, ]+) (?:and|&) (\d+)\b", re.I) + re_number = re.compile(r"^(?:No\.?|Number)? ?(\d+)\b") - -def is_in_range(address_range, address): m_number = re_number.match(address) if not m_number: return False @@ -159,20 +180,27 @@ def is_in_range(address_range, address): return False -def format_wikibase_time(v): - p = v["precision"] +class WikibaseTime(typing.TypedDict): + """Wikibase Time dict.""" + + precision: int + time: str + + +def format_wikibase_time(v: WikibaseTime) -> str | None: + """Format wikibase time value into human readable string.""" t = v["time"] - # TODO: handle dates with century precision (7) - # example: https://www.wikidata.org/wiki/Q108266998 - - if p == 11: - return date.fromisoformat(t[1:11]).strftime("%-d %B %Y") - if p == 10: - return date.fromisoformat(t[1:8] + "-01").strftime("%B %Y") - if p == 9: - return t[1:5] - if p == 7: - century = ((int(t[:5]) - 1) // 100) + 1 - end = " BC" if century < 0 else "" - return num2words(abs(century), to="ordinal_num") + " century" + end + match v["precision"]: + case 11: # year, month and day + return date.fromisoformat(t[1:11]).strftime("%-d %B %Y") + case 10: # year and month + return date.fromisoformat(t[1:8] + "-01").strftime("%B %Y") + case 9: # year + return t[1:5] + case 7: # century + century = ((int(t[:5]) - 1) // 100) + 1 + ordinal_num: str = num2words(abs(century), to="ordinal_num") + return f"{ordinal_num} {century}{' BC' if century < 0 else ''}" + case _: # not handled + return None diff --git a/package.json b/package.json index 4de4ebc..2cb6641 100644 --- a/package.json +++ b/package.json @@ -6,12 +6,12 @@ "test": "echo \"This template does not include a test runner by default.\" && exit 1" }, "dependencies": { - "bootstrap": "^5.1.3", + "@popperjs/core": "^2.11.0", "fork-awesome": "^1.2.0", - "leaflet": "^1.7.1", + "leaflet": "^1.8.0", "leaflet-extra-markers": "^1.2.1", "redaxios": "^0.4.1", - "vue": "^3.1.15" + "vue": "^3.2.26" }, "devDependencies": { "@snowpack/plugin-dotenv": "^2.1.0", diff --git a/snowpack.config.mjs b/snowpack.config.mjs index f42ac69..167d5bb 100644 --- a/snowpack.config.mjs +++ b/snowpack.config.mjs @@ -1,9 +1,7 @@ -import pkg from './package.json'; - /** @type {import("snowpack").SnowpackUserConfig } */ export default { mount: { - // public: {url: '/', static: true}, + public: {url: '/', static: true}, frontend: {url: '/dist'}, }, plugins: [ diff --git a/templates/isa.html b/templates/isa.html index d198d9f..edb2950 100644 --- a/templates/isa.html +++ b/templates/isa.html @@ -14,7 +14,7 @@
-
+
diff --git a/templates/map.html b/templates/map.html index a7fde40..1c4a85c 100644 --- a/templates/map.html +++ b/templates/map.html @@ -4,10 +4,13 @@ Wikidata items linked to OSM + @@ -16,7 +19,7 @@ {% block nav %}{{ navbar() }}{% endblock %}
- +