From 867e77ab7b0225a1ab13bf2e801cb0a76046e89f Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Thu, 22 Jul 2021 14:47:38 +0200 Subject: [PATCH] Switch to SQLAlchemy Core for OSM objects osm2pgsql creates tables without primary keys. Some OSM objects are split into multiple table rows to help with PostGIS index performance. Adjust the code to be aware of this. Also add name matching for streets. --- matcher/api.py | 359 +++++++++++++++++++++++++++++----------------- matcher/planet.py | 27 ++++ web_view.py | 41 +++++- 3 files changed, 292 insertions(+), 135 deletions(-) create mode 100644 matcher/planet.py diff --git a/matcher/api.py b/matcher/api.py index 9f32802..e2b4e27 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -1,5 +1,10 @@ -from sqlalchemy import func, or_, and_ +from sqlalchemy import func, or_, and_, text from sqlalchemy.orm import selectinload +from sqlalchemy.sql import select +from sqlalchemy.sql.expression import literal, union, cast, column +from sqlalchemy.types import Float +from sqlalchemy.dialects import postgresql +from matcher.planet import point, line, polygon from matcher import model, database, wikidata_api, wikidata from matcher.data import extra_keys from collections import Counter, defaultdict @@ -41,37 +46,61 @@ def is_street_number_first(lat, lon): def make_envelope(bounds): return func.ST_MakeEnvelope(*bounds, srid) - def get_bbox_centroid(bbox): bbox = make_envelope(bbox) centroid = database.session.query(func.ST_AsText(func.ST_Centroid(bbox))).scalar() return reversed(re_point.match(centroid).groups()) +def make_envelope_around_point(lat, lon, distance): + conn = database.session.connection() + + p = func.ST_MakePoint(lon, lat) + + s = select([ + func.ST_AsText(func.ST_Project(p, distance, func.radians(0))), + func.ST_AsText(func.ST_Project(p, distance, func.radians(90))), + func.ST_AsText(func.ST_Project(p, distance, func.radians(180))), + func.ST_AsText(func.ST_Project(p, distance, func.radians(270))), + ]) + row = conn.execute(s).fetchone() + coords = [[float(v) for v in re_point.match(i).groups()] for i in row] + + north = coords[0][1] + east = coords[1][0] + south = coords[2][1] + west = coords[3][0] + + return func.ST_MakeEnvelope(west, south, east, north, srid) + def drop_way_area(tags): if "way_area" in tags: del tags["way_area"] return tags -def get_part_of(thing, bbox): - db_bbox = make_envelope(bbox) - ewkt = thing.as_EWKT - q = model.Polygon.query.filter( - func.ST_Intersects(db_bbox, model.Polygon.way), - func.ST_Covers(model.Polygon.way, func.ST_GeomFromEWKT(ewkt)), - or_( - model.Polygon.tags.has_key("landuse"), - model.Polygon.tags.has_key("amenity"), - ), - model.Polygon.tags.has_key("name"), - ) +def get_part_of(table_name, src_id, bbox): + table_map = {'point': point, 'line': line, 'polygon': polygon} + table_alias = table_map[table_name].alias() + s = (select([polygon.c.osm_id, + polygon.c.tags, + func.ST_Area(func.ST_Collect(polygon.c.way))]). + where(and_(func.ST_Intersects(bbox, polygon.c.way), + func.ST_Covers(polygon.c.way, table_alias.c.way), + table_alias.c.osm_id == src_id, + polygon.c.tags.has_key("name"), + or_( + polygon.c.tags.has_key("landuse"), + polygon.c.tags.has_key("amenity"), + ))). + group_by(polygon.c.osm_id, polygon.c.tags)) + + conn = database.session.connection() return [{ - "type": polygon.type, - "id": polygon.id, - "tags": drop_way_area(polygon.tags), - "area": polygon.area, - } for polygon in q] - + "type": "way" if osm_id > 0 else "relation", + "id": abs(osm_id), + "tags": tags, + "area": area, + } for osm_id, tags, area in conn.execute(s)] def get_and_save_item(qid): entity = wikidata_api.get_entity(qid) @@ -127,34 +156,51 @@ def get_items_in_bbox(bbox): def get_osm_with_wikidata_tag(bbox): - db_bbox = make_envelope(bbox) + bbox_str = ','.join(str(v) for v in bbox) + + # easier than building this query with SQLAlchemy + sql = f''' +SELECT tbl, osm_id, tags, ARRAY[ST_Y(centroid), ST_X(centroid)], geojson +FROM ( + SELECT 'point' as tbl, osm_id, tags, ST_AsText(ST_Centroid(way)) as centroid, ST_AsGeoJSON(way) as geojson + FROM planet_osm_point + WHERE ST_Intersects(ST_MakeEnvelope({bbox_str}, {srid}), way) +UNION + SELECT 'line' as tbl, osm_id, tags, ST_AsText(ST_Centroid(ST_Collect(way))) AS centroid, ST_AsGeoJSON(ST_Collect(way)) AS geojson + FROM planet_osm_line + WHERE ST_Intersects(ST_MakeEnvelope({bbox_str}, {srid}), way) + GROUP BY osm_id, tags +UNION + SELECT 'polygon' as tbl, osm_id, tags, ST_AsText(ST_Centroid(ST_Collect(way))) AS centroid, ST_AsGeoJSON(ST_Collect(way)) AS geojson + FROM planet_osm_polygon + WHERE ST_Intersects(ST_MakeEnvelope({bbox_str}, {srid}), way) + GROUP BY osm_id, tags + HAVING st_area(st_collect(way)) < 20 * st_area(ST_MakeEnvelope({bbox_str}, {srid})) +) as anon +WHERE tags ? 'wikidata' +''' + conn = database.session.connection() + result = conn.execute(text(sql)) tagged = [] + for tbl, osm_id, tags, centroid, geojson in result: + if tbl == 'point': + osm_type = "node" + else: + osm_type = "way" if osm_id > 0 else "relation" + osm_id = abs(osm_id) - seen = set() - for cls in (model.Point, model.Polygon, model.Line): - q = cls.query.filter( - cls.tags.has_key("wikidata"), - func.ST_Intersects(db_bbox, cls.way), - func.ST_Area(cls.way) < 20 * func.ST_Area(db_bbox), - ) - for osm in q: - if osm.identifier in seen: - continue - seen.add(osm.identifier) - name = osm.name or osm.tags.get("addr:housename") or "[no label]" + name = tags.get("name") or tags.get("addr:housename") or "[no label]" - tagged.append( - { - "identifier": osm.identifier, - "id": osm.id, - "type": osm.type, - "geojson": osm.geojson(), - "centroid": list(osm.get_centroid()), - "name": name, - "wikidata": osm.tags["wikidata"], - } - ) + tagged.append({ + "identifier": f"{osm_type}/{osm_id}", + "id": osm_id, + "type": osm_type, + "geojson": json.loads(geojson), + "centroid": centroid, + "name": name, + "wikidata": tags["wikidata"], + }) return tagged @@ -250,73 +296,18 @@ def wikidata_isa_counts(bounds): return isa_count -def get_tag_filter(cls, tag_list): +def get_tag_filter(tags, tag_list): tag_filter = [] for tag_or_key in tag_list: if tag_or_key.startswith("Key:"): - tag_filter.append(and_(cls.tags.has_key(tag_or_key[4:]), - cls.tags[tag_or_key[4:]] != 'no')) + tag_filter.append(and_(tags.has_key(tag_or_key[4:]), + tags[tag_or_key[4:]] != 'no')) if tag_or_key.startswith("Tag:"): k, _, v = tag_or_key.partition("=") - tag_filter.append(cls.tags[k[4:]] == v) + tag_filter.append(tags[k[4:]] == v) return tag_filter -def get_nearby(bbox, item, limit=60, max_distance=400): - db_bbox = make_envelope(bbox) - - osm_objects = {} - distances = {} - tag_list = get_item_tags(item) - if not tag_list: - return [] - - item_is_street = item.is_street() - - for loc in item.locations: - lat, lon = loc.get_lat_lon() - point = func.ST_SetSRID(func.ST_MakePoint(lon, lat), 4326) - for cls in model.Point, model.Line, model.Polygon: - if item_is_street and cls == model.Point: - continue - - tag_filter = get_tag_filter(cls, tag_list) - dist = func.ST_DistanceSphere(point, cls.way) - - q = (cls.query.add_columns(dist.label('distance')) - .filter( - func.ST_Intersects(db_bbox, cls.way), - func.ST_Area(cls.way) < 20 * func.ST_Area(db_bbox), - or_(*tag_filter)) - .order_by(point.distance_centroid(cls.way))) - - if item_is_street: - q = q.filter(cls.tags.has_key("name"), - cls.tags["highway"] != 'bus_stop') - - if "Key:amenity" in tag_list: - q = q.filter(cls.tags["amenity"] != "bicycle_parking", - cls.tags["amenity"] != "bicycle_repair_station", - cls.tags["amenity"] != "atm", - cls.tags["amenity"] != "recycling") - - q = q.limit(limit) - - # print(q.statement.compile(compile_kwargs={"literal_binds": True})) - - for i, dist in q: - if dist > max_distance: - continue - osm_objects.setdefault(i.identifier, i) - if i.identifier not in distances or dist < distances[i.identifier]: - distances[i.identifier] = dist - - nearby = [(osm_objects[identifier], dist) - for identifier, dist - in sorted(distances.items(), key=lambda i:i[1])] - - return nearby[:limit] - def get_preset_translations(): app = current_app @@ -338,15 +329,13 @@ def get_preset_translations(): return {} -def get_presets_from_tags(osm): +def get_presets_from_tags(ending, tags): translations = get_preset_translations() found = [] - endings = {model.Point: "point", model.Line: "line", model.Polygon: "area"} - ending = endings[type(osm)] - for k, v in osm.tags.items(): - if k == 'amenity' and v == 'clock' and osm.tags.get('display') == 'sundial': + for k, v in tags.items(): + if k == 'amenity' and v == 'clock' and tags.get('display') == 'sundial': tag_or_key = f"Tag:{k}={v}" found.append({"tag_or_key": tag_or_key, "name": "Sundial"}) continue @@ -416,7 +405,7 @@ def address_from_tags(tags): return " ".join(tags["addr:" + k] for k in keys) -def get_address_nodes_within_building(building, bbox): +def old_get_address_nodes_within_building(building, bbox): db_bbox = make_envelope(bbox) ewkt = building.as_EWKT q = model.Point.query.filter( @@ -428,45 +417,149 @@ def get_address_nodes_within_building(building, bbox): return [node.tags for node in q] +def get_address_nodes_within_building(osm_id, bbox): + q = model.Point.query.filter( + polygon.c.osm_id == osm_id, + func.ST_Intersects(bbox, model.Point.way), + func.ST_Covers(polygon.c.way, model.Point.way), + model.Point.tags.has_key("addr:street"), + model.Point.tags.has_key("addr:housenumber"), + ) -def find_osm_candidates(item, bounds): - check_is_street_number_first(bounds) + return [node.tags for node in q] + +def osm_display_name(tags): + keys = ("bridge:name", "tunnel:name", "lock_name", "name", "addr:housename", + "inscription") + for key in keys: + if key in tags: + return tags[key] + + +def street_address_in_tags(tags): + return "addr:housenumber" in tags and "addr:street" in tags + +def find_osm_candidates(item, limit=60, max_distance=400, names=None): + item_id = item.item_id + item_is_street = item.is_street() + print("is street:", item_is_street) + + check_is_street_number_first(item.locations[0].get_lat_lon()) + + db_bbox = or_(*[make_envelope_around_point(*loc.get_lat_lon(), max_distance) + for loc in item.locations]) + + null_area = cast(None, Float) + dist = column('dist') + tags = column('tags', postgresql.HSTORE) + + tag_list = get_item_tags(item) + + s_point = (select([literal('point'), point.c.osm_id, point.c.tags.label('tags'), + func.min(func.ST_DistanceSphere(model.ItemLocation.location, point.c.way)).label('dist'), + func.ST_AsText(point.c.way), + func.ST_AsGeoJSON(point.c.way), + null_area]). + where(and_( + func.ST_Intersects(db_bbox, point.c.way), + model.ItemLocation.item_id == item_id, + or_(*get_tag_filter(point.c.tags, tag_list)))). + group_by(point.c.osm_id, point.c.tags, point.c.way)) + + s_line = (select([literal('line'), line.c.osm_id, line.c.tags.label('tags'), + func.min(func.ST_DistanceSphere(model.ItemLocation.location, line.c.way)).label('dist'), + func.ST_AsText(func.ST_Centroid(func.ST_Collect(line.c.way))), + func.ST_AsGeoJSON(func.ST_Collect(line.c.way)), + null_area]). + where(and_( + func.ST_Intersects(db_bbox, line.c.way), + model.ItemLocation.item_id == item_id, + or_(*get_tag_filter(line.c.tags, tag_list)))). + group_by(line.c.osm_id, line.c.tags)) + + s_polygon = (select([literal('polygon'), polygon.c.osm_id, polygon.c.tags.label('tags'), + func.min(func.ST_DistanceSphere(model.ItemLocation.location, polygon.c.way)).label('dist'), + func.ST_AsText(func.ST_Centroid(func.ST_Collect(polygon.c.way))), + func.ST_AsGeoJSON(func.ST_Collect(polygon.c.way)), + func.ST_Area(func.ST_Collect(polygon.c.way))]). + where(and_( + func.ST_Intersects(db_bbox, polygon.c.way), + model.ItemLocation.item_id == item_id, + or_(*get_tag_filter(polygon.c.tags, tag_list)))). + group_by(polygon.c.osm_id, polygon.c.tags). + having(func.ST_Area(func.ST_Collect(polygon.c.way)) < 20 * func.ST_Area(db_bbox))) + + tables = ([] if item_is_street else [s_point]) + [s_line, s_polygon] + s = select([union(*tables).alias()]).where(dist < max_distance).order_by(dist) + + if names: + s = s.where(tags["name"].in_(names)) + + if item_is_street: + s = s.where(tags["highway"] != "bus_stop") + if not names: + s = s.where(tags.has_key("name")) + + if "Key:amenity" in tag_list: + s = s.where(and_(tags["amenity"] != "bicycle_parking", + tags["amenity"] != "bicycle_repair_station", + tags["amenity"] != "atm", + tags["amenity"] != "recycling")) + + if limit: + s = s.limit(limit) + + print(s.compile(compile_kwargs={"literal_binds": True})) + + conn = database.session.connection() nearby = [] - for osm, dist in get_nearby(bounds, item): - tags = osm.tags + for table, src_id, tags, distance, centroid, geojson, area in conn.execute(s): + osm_id = src_id + if table == "point": + osm_type = "node" + elif osm_id > 0: + osm_type = "way" + else: + osm_type = "relation" + osm_id = -osm_id + tags.pop("way_area", None) - name = osm.display_name() - if not name and osm.has_street_address: + name = osm_display_name(tags) + if not name and street_address_in_tags(tags): name = address_from_tags(tags) - if isinstance(osm, model.Polygon) and "building" in osm.tags: - address_nodes = get_address_nodes_within_building(osm, bounds) + if table == "polygon" and "building" in tags: + address_nodes = get_address_nodes_within_building(src_id, db_bbox) address_list = [address_from_tags(addr) for addr in address_nodes] else: address_list = [] + + shape = "area" if table == "polygon" else table + cur = { - "identifier": osm.identifier, - "type": osm.type, - "id": osm.id, - "distance": dist, + "identifier": f"{osm_type}/{osm_id}", + "type": osm_type, + "id": osm_id, + "distance": distance, "name": name, "tags": tags, - "geojson": osm.geojson(), - "presets": get_presets_from_tags(osm), + "geojson": json.loads(geojson), + "presets": get_presets_from_tags(shape, tags), "address_list": address_list, - "centroid": list(osm.get_centroid()), + "centroid": list(reversed(re_point.match(centroid).groups())), } - if hasattr(osm, 'area'): - cur["area"] = osm.area + if area is not None: + cur["area"] = area + + part_of = [i for i in get_part_of(table, src_id, db_bbox) + if i["tags"]["name"] != name] + if part_of: + cur["part_of"] = part_of if address := address_from_tags(tags): cur["address"] = address - part_of = [i for i in get_part_of(osm, bounds) if i["tags"]["name"] != name] - if part_of: - cur["part_of"] = part_of - nearby.append(cur) return nearby @@ -503,8 +596,8 @@ def get_item_street_addresses(item): return street_address -def check_is_street_number_first(bounds): - g.street_number_first = is_street_number_first(*get_bbox_centroid(bounds)) +def check_is_street_number_first(latlng): + g.street_number_first = is_street_number_first(*latlng) def item_detail(item): locations = [list(i.get_lat_lon()) for i in item.locations] @@ -537,7 +630,7 @@ def get_markers(all_items): def wikidata_items(bounds): - check_is_street_number_first(bounds) + check_is_street_number_first(get_bbox_centroid(bounds)) q = get_items_in_bbox(bounds) db_items = q.all() items = get_markers(db_items) diff --git a/matcher/planet.py b/matcher/planet.py new file mode 100644 index 0000000..57f7b44 --- /dev/null +++ b/matcher/planet.py @@ -0,0 +1,27 @@ +from sqlalchemy import Table, Column, Integer, String, Float, MetaData +from sqlalchemy.dialects import postgresql +from geoalchemy2 import Geometry + +metadata = MetaData() + +point = Table("planet_osm_point", metadata, + Column("osm_id", Integer), + Column("name", String), + Column("tags", postgresql.HSTORE), + Column("way", Geometry("GEOMETRY", srid=4326, spatial_index=True), nullable=False), +) + +line = Table("planet_osm_line", metadata, + Column("osm_id", Integer), + Column("name", String), + Column("tags", postgresql.HSTORE), + Column("way", Geometry("GEOMETRY", srid=4326, spatial_index=True), nullable=False), +) + +polygon = Table("planet_osm_polygon", metadata, + Column("osm_id", Integer), + Column("name", String), + Column("tags", postgresql.HSTORE), + Column("way", Geometry("GEOMETRY", srid=4326, spatial_index=True), nullable=False), + Column("way_area", Float), +) diff --git a/web_view.py b/web_view.py index 2120104..7ff7f61 100755 --- a/web_view.py +++ b/web_view.py @@ -343,17 +343,54 @@ def api_get_item_tags(item_id): duration=t1) +def expand_street_name(name): + ret = {name} + if any(name.startswith(st) for st in ('St ', 'St. ')): + first_space = name.find(' ') + ret.add("Saint" + name[first_space:]) + + if ', ' in name: + for n in set(ret): + comma = n.find(", ") + ret.add(name[:comma]) + elif '/' in name: + for n in set(ret): + ret.extend(part.strip() for part in n.split("/")) + + return ret + + @app.route("/api/1/item/Q/candidates") def api_find_osm_candidates(item_id): t0 = time() - bounds = read_bounds_param() item = model.Item.query.get(item_id) if not item: return cors_jsonify(success=True, qid=f'Q{item_id}', error="item doesn't exist") - nearby = api.find_osm_candidates(item, bounds) + label = item.label() + item_is_street = item.is_street() + + if item_is_street: + max_distance = 2_000 + limit = None + names = expand_street_name(label) + else: + max_distance = 400 + limit = 60 + names = None + nearby = api.find_osm_candidates(item, + limit=limit, + max_distance=max_distance, + names=names) + + if item_is_street and not nearby: + # nearby = [osm for osm in nearby if street_name_match(label, osm)] + + # try again without name filter + nearby = api.find_osm_candidates(item, limit=100, + max_distance=1_000) t1 = time() - t0 return cors_jsonify(success=True, qid=item.qid, nearby=nearby, duration=t1)