diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6a7802f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+.mypy_cache/
diff --git a/geocode/database.py b/geocode/database.py
new file mode 100644
index 0000000..dc667bf
--- /dev/null
+++ b/geocode/database.py
@@ -0,0 +1,31 @@
+"""SQLAlchemy session shared by the geocode package."""
+
+from sqlalchemy import create_engine, func
+from sqlalchemy.orm import scoped_session, sessionmaker
+
+session = scoped_session(sessionmaker())
+
+
+def init_db(db_url):
+    """Bind the shared session to a new engine for db_url."""
+    session.configure(bind=get_engine(db_url))
+
+
+def get_engine(db_url, echo=False):
+    """Create an engine for db_url with pool_recycle=3600."""
+    return create_engine(db_url, pool_recycle=3600, echo=echo)
+
+
+def init_app(app, echo=False):
+    """Bind the session to app.config["DB_URL"]; remove it on app-context teardown."""
+    db_url = app.config["DB_URL"]
+    session.configure(bind=get_engine(db_url, echo=echo))
+
+    @app.teardown_appcontext
+    def shutdown_session(exception=None):
+        session.remove()
+
+
+def now_utc():
+    """Return the SQL expression timezone('utc', now())."""
+    return func.timezone("utc", func.now())
diff --git a/geocode/model.py b/geocode/model.py
new file mode 100644
index 0000000..8230b0b
--- /dev/null
+++ b/geocode/model.py
@@ -0,0 +1,76 @@
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.schema import Column
+from sqlalchemy.types import Integer, Float, Numeric, String
+from sqlalchemy.dialects import postgresql
+from sqlalchemy.orm import column_property
+from sqlalchemy.ext.hybrid import hybrid_property
+from sqlalchemy import func, cast
+
+from geoalchemy2 import Geometry
+from .database import session
+
+Base = declarative_base()
+Base.query = session.query_property()
+
+
+class Polygon(Base):
+    """Row of the planet_osm_polygon table."""
+
+    __tablename__ = "planet_osm_polygon"
+
+    osm_id = Column(Integer, primary_key=True, autoincrement=False)
+    admin_level = Column(String)
+
+    way_area = Column(Float)
+    tags = Column(postgresql.HSTORE)
+    way = Column(Geometry("GEOMETRY", srid=4326, spatial_index=True), nullable=False)
+    # SQL expression ST_Area(way, False) mapped as an attribute.
+    area = column_property(func.ST_Area(way, False))
+
+    @property
+    def osm_url(self):
+        """URL of this element on openstreetmap.org.
+
+        A positive osm_id links to a way; otherwise the absolute value
+        of the id links to a relation.
+        """
+        osm_type = "way" if self.osm_id > 0 else "relation"
+        return f"https://www.openstreetmap.org/{osm_type}/{abs(self.osm_id)}"
+
+    @hybrid_property
+    def area_in_sq_km(self):
+        """area / 1,000,000 (presumably square metres to square km; confirm units)."""
+        return self.area / (1000 * 1000)
+
+    @classmethod
+    def coords_within(cls, lat, lon):
+        """Query the polygons that contain the point (lat, lon).
+
+        Only rows whose admin_level consists of digits are kept.  Results
+        are ordered by area, then by integer admin_level descending.
+        """
+        point = func.ST_SetSRID(func.ST_MakePoint(lon, lat), 4326)
+        # Raw string: "\d" in a plain literal is an invalid escape sequence.
+        return (cls.query.filter(cls.admin_level.isnot(None),
+                                 cls.admin_level.regexp_match(r"^\d+$"),
+                                 func.ST_Within(point, cls.way))
+                         .order_by(cls.area, cast(cls.admin_level, Integer).desc()))
+
+
+class Scotland(Base):
+    """Row of the scotland table; geom uses SRID 27700."""
+
+    __tablename__ = "scotland"
+
+    gid = Column(Integer, primary_key=True)
+    shape_leng = Column(Numeric)
+    shape_area = Column(Numeric)
+    code = Column(String(3))
+    c91code1 = Column(String(5))
+    c91code2 = Column(String(5))
+    c91code3 = Column(String(5))
+    c91code4 = Column(String(5))
+    name = Column(String(50))
+
+    geom = Column(Geometry("MULTIPOLYGON", srid=27700))
+
diff --git a/geocode/overpass.py b/geocode/overpass.py
deleted file mode 100644
index 860ea2a..0000000
--- a/geocode/overpass.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from flask import current_app
-from . 
import headers
-import os
-import json
-import requests
-
-OVERPASS_URL = "https://lz4.overpass-api.de"
-
-
-def run_query(oql):
-    return requests.post(
-        OVERPASS_URL + "/api/interpreter", data=oql.encode("utf-8"), headers=headers
-    )
-
-
-def is_in_lat_lon(lat, lon):
-    oql = f"""
-[out:json][timeout:25];
-is_in({lat},{lon})->.a;
-(way(pivot.a); rel(pivot.a););
-out bb tags qt;"""
-
-    return run_query(oql)
-
-
-def get_osm_elements(lat, lon):
-    filename = f"cache/{lat}_{lon}.json"
-    use_cache = current_app.config["USE_CACHE"]
-
-    if use_cache and os.path.exists(filename):
-        return json.load(open(filename))["elements"]
-
-    r = is_in_lat_lon(lat, lon)
-    if use_cache:
-        open(filename, "wb").write(r.content)
-    return r.json()["elements"]
diff --git a/geocode/scotland.py b/geocode/scotland.py
new file mode 100644
index 0000000..60c9b43
--- /dev/null
+++ b/geocode/scotland.py
@@ -0,0 +1,24 @@
+from flask import current_app
+import psycopg2
+
+
+def get_scotland_code(lat, lon):
+    """Return the code of the scotland row whose geom contains (lat, lon).
+
+    The point is built with SRID 4326 and transformed to SRID 27700 before
+    the st_contains test.  Returns None when no row matches.
+    """
+    # Bind the coordinates as query parameters instead of formatting
+    # them into the SQL text.
+    point = "ST_Transform(ST_SetSRID(ST_MakePoint(%s, %s), 4326), 27700)"
+    sql = f"select code, name from scotland where st_contains(geom, {point});"
+
+    conn = psycopg2.connect(**current_app.config["DB_PARAMS"])
+    try:
+        cur = conn.cursor()
+        cur.execute(sql, (lon, lat))
+        row = cur.fetchone()
+    finally:
+        # Close the connection even when the query raises.
+        conn.close()
+    return row[0] if row else None
diff --git a/geocode/wikidata.py b/geocode/wikidata.py
index 33ea0d2..7588816 100644
--- a/geocode/wikidata.py
+++ b/geocode/wikidata.py
@@ -1,8 +1,12 @@
+from flask import render_template
 import requests
 import simplejson
 from . 
import headers
+import urllib.parse
 
 wikidata_query_api_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
+wd_entity = "http://www.wikidata.org/entity/Q"
+commons_cat_start = "https://commons.wikimedia.org/wiki/Category:"
 
 
 class QueryError(Exception):
@@ -49,3 +53,154 @@ def wdqs(query):
         return r.json()["results"]["bindings"]
     except simplejson.errors.JSONDecodeError:
         raise QueryError(query, r)
+
+
+def wd_to_qid(wd):
+    """Return the QID for a SPARQL binding, or None when it is not a URI.
+
+    Expects a binding such as
+    {"type": "uri", "value": "http://www.wikidata.org/entity/Q30"}.
+    """
+    if wd["type"] == "uri":
+        return wd_uri_to_qid(wd["value"])
+
+
+def wd_uri_to_qid(value):
+    """Turn "http://www.wikidata.org/entity/Q30" into "Q30".
+
+    Raises ValueError when value does not start with wd_entity.
+    """
+    # Explicit check rather than assert, which is skipped under python -O.
+    if not value.startswith(wd_entity):
+        raise ValueError(f"not a Wikidata entity URI: {value!r}")
+    # wd_entity already ends in "Q", hence the -1 to keep that letter.
+    return value[len(wd_entity) - 1 :]
+
+
+def geosearch_query(lat, lon):
+    """Render sparql/geosearch.sparql for (lat, lon) and run it via wdqs().
+
+    Float coordinates are formatted with the "f" presentation type; any
+    other type is handed to the template unchanged.
+    """
+    if isinstance(lat, float):
+        lat = f"{lat:f}"
+    if isinstance(lon, float):
+        lon = f"{lon:f}"
+
+    query = render_template("sparql/geosearch.sparql", lat=lat, lon=lon)
+    return wdqs(query)
+
+
+def geosearch(lat, lon):
+    """Return the first usable row from geosearch_query(), or None.
+
+    A row with neither commonsCat nor commonsSiteLink is skipped unless its
+    "isa" QID is listed in max_dist.  A row further away than the limit for
+    its "isa" (default_max_dist when unlisted) is skipped.  If the first row
+    that survives both checks has no Commons link the search stops and
+    None is returned.
+    """
+    default_max_dist = 1
+    rows = geosearch_query(lat, lon)
+    max_dist = {
+        "Q188509": 1,  # suburb
+        "Q3957": 2,  # town
+        "Q532": 1,  # village
+        "Q5084": 1,  # hamlet
+        "Q515": 2,  # city
+        "Q1549591": 3,  # big city
+    }
+    for row in rows:
+        isa = wd_uri_to_qid(row["isa"]["value"])
+
+        if (
+            "commonsCat" not in row
+            and "commonsSiteLink" not in row
+            and isa not in max_dist
+        ):
+            continue
+
+        distance = float(row["distance"]["value"])
+        if distance > max_dist.get(isa, default_max_dist):
+            continue
+
+        if "commonsCat" not in row and "commonsSiteLink" not in row:
+            break
+
+        return row
+
+
+def lookup_scottish_parish_in_wikidata(code):
+    """Run sparql/scottish_parish.sparql for code and return the bindings."""
+    query = render_template("sparql/scottish_parish.sparql", code=code)
+    return wdqs(query)
+
+
+def lookup_gss_in_wikidata(gss):
+    """Run sparql/lookup_gss.sparql for gss and return the bindings."""
+    query = render_template("sparql/lookup_gss.sparql", gss=gss)
+    return wdqs(query)
+
+
+def lookup_wikidata_by_name(name, lat, lon):
+    """Run sparql/lookup_by_name.sparql and return the bindings.
+
+    name is passed to the template as repr(name); lat and lon as str().
+    """
+    query = render_template(
+        "sparql/lookup_by_name.sparql", name=repr(name), lat=str(lat), lon=str(lon)
+    )
+    return wdqs(query)
+
+
+def unescape_title(t):
+    """Replace underscores with spaces, then percent-decode the title."""
+    return urllib.parse.unquote(t.replace("_", " "))
+
+
+def commons_from_rows(rows):
+    """Return {"wikidata", "commons_cat"} for the first row with a Commons link.
+
+    commonsCat is used as-is; a commonsSiteLink has the commons_cat_start
+    prefix removed and is unescaped.  Returns None if no row has either.
+    """
+    for row in rows:
+        if "commonsCat" in row:
+            qid = wd_to_qid(row["item"])
+            return {"wikidata": qid, "commons_cat": row["commonsCat"]["value"]}
+        if "commonsSiteLink" in row:
+            site_link = row["commonsSiteLink"]["value"]
+            qid = wd_to_qid(row["item"])
+            cat = unescape_title(site_link[len(commons_cat_start) :])
+            return {"wikidata": qid, "commons_cat": cat}
+
+
+def get_commons_cat_from_gss(gss):
+    """Look gss up in Wikidata and return its commons_from_rows() result."""
+    return commons_from_rows(lookup_gss_in_wikidata(gss))
+
+
+def build_dict(hit, lat, lon):
+    """Build the result dict for a lookup hit.
+
+    With hit None the result is marked missing.  Otherwise it carries
+    coords, admin_level and wikidata, plus a commons_cat entry with title
+    and URL only when the hit has a non-empty commons_cat.
+    """
+    coords = {"lat": lat, "lon": lon}
+    if hit is None:
+        return dict(commons_cat=None, missing=True, coords=coords)
+    commons_cat = hit["commons_cat"]
+    ret = dict(
+        coords=coords,
+        admin_level=hit.get("admin_level"),
+        wikidata=hit["wikidata"],
+    )
+    if not commons_cat:
+        return ret
+
+    url = commons_cat_start + urllib.parse.quote(commons_cat.replace(" ", "_"))
+    ret["commons_cat"] = {"title": commons_cat, "url": url}
+
+    return ret
diff --git a/lookup.py b/lookup.py
index e255e56..c75d1d0 100755
--- a/lookup.py
+++ b/lookup.py
@@ -1,24 +1,14 @@
 #!/usr/bin/python3
 
 from flask import Flask, render_template, request, jsonify, redirect, url_for
+from geocode import wikidata, scotland, database, model
 import geocode
-import geocode.wikidata
-import geocode.overpass
-import urllib.parse
 import random
-import psycopg2
-from geopy.distance import distance
 
-# select gid, code, name from scotland where st_contains(geom, ST_Transform(ST_SetSRID(ST_MakePoint(-4.177, 55.7644), 4326), 27700));
-
-commons_cat_start = "https://commons.wikimedia.org/wiki/Category:"
-
-wd_entity = "http://www.wikidata.org/entity/Q"
 city_of_london_qid = "Q23311"
-
-
 app = Flask(__name__)
 app.config.from_object("config.default")
+database.init_app(app)
 
 
 def get_random_lat_lon():
@@ -33,191 +23,54 @@ def get_random_lat_lon():
     return lat, lon
 
 
-def bounding_box_area(element):
-    bbox = element["bounds"]
-
-    x = distance((bbox["maxlat"], 
bbox["minlon"]), (bbox["maxlat"], bbox["maxlon"])) - y = distance((bbox["minlat"], bbox["maxlon"]), (bbox["maxlat"], bbox["minlon"])) - - return x.km * y.km - - -def wd_to_qid(wd): - # expecting {"type": "url", "value": "https://www.wikidata.org/wiki/Q30"} - if wd["type"] == "uri": - return wd_uri_to_qid(wd["value"]) - - -def wd_uri_to_qid(value): - assert value.startswith(wd_entity) - return value[len(wd_entity) - 1 :] - - -def build_dict(hit, lat, lon): - coords = {"lat": lat, "lon": lon} - if hit is None: - return dict(commons_cat=None, missing=True, coords=coords) - commons_cat = hit["commons_cat"] - url = commons_cat_start + urllib.parse.quote(commons_cat.replace(" ", "_")) - return dict( - commons_cat={"title": commons_cat, "url": url}, - coords=coords, - admin_level=hit.get("admin_level"), - wikidata=hit["wikidata"], - ) - - def do_lookup(elements, lat, lon): try: hit = osm_lookup(elements, lat, lon) - except geocode.wikidata.QueryError as e: + except wikidata.QueryError as e: return { "query": e.query, "error": e.r.text, "query_url": "https://query.wikidata.org/#" + e.query, } - return build_dict(hit, lat, lon) - - -def get_scotland_code(lat, lon): - conn = psycopg2.connect(**app.config["DB_PARAMS"]) - cur = conn.cursor() - - point = f"ST_Transform(ST_SetSRID(ST_MakePoint({lon}, {lat}), 4326), 27700)" - cur.execute(f"select code, name from scotland where st_contains(geom, {point});") - row = cur.fetchone() - - # expand search, disabled for now 2020-04-20 - if not row: - cur.execute( - f"select code, name from scotland where ST_DWithin(geom, {point}, 100);" - ) - row = cur.fetchone() - - conn.close() - if row: - return row[0] - - -def wdqs_geosearch_query(lat, lon): - if isinstance(lat, float): - lat = f"{lat:f}" - if isinstance(lon, float): - lon = f"{lon:f}" - - query = render_template("sparql/geosearch.sparql", lat=lat, lon=lon) - return geocode.wikidata.wdqs(query) - - -def wdqs_geosearch(lat, lon): - default_max_dist = 1 - rows = 
wdqs_geosearch_query(lat, lon) - max_dist = { - "Q188509": 1, # suburb - "Q3957": 2, # town - "Q532": 1, # village - "Q5084": 1, # hamlet - "Q515": 2, # city - "Q1549591": 3, # big city - } - for row in rows: - isa = wd_uri_to_qid(row["isa"]["value"]) - - if ( - "commonsCat" not in row - and "commonsSiteLink" not in row - and isa not in max_dist - ): - continue - - distance = float(row["distance"]["value"]) - if distance > max_dist.get(isa, default_max_dist): - continue - - if "commonsCat" not in row and "commonsSiteLink" not in row: - break - - return row + return wikidata.build_dict(hit, lat, lon) def lat_lon_to_wikidata(lat, lon): - scotland_code = get_scotland_code(lat, lon) + scotland_code = scotland.get_scotland_code(lat, lon) if scotland_code: - rows = lookup_scottish_parish_in_wikidata(scotland_code) - hit = commons_from_rows(rows) + rows = wikidata.lookup_scottish_parish_in_wikidata(scotland_code) + hit = wikidata.commons_from_rows(rows) elements = [] - result = build_dict(hit, lat, lon) + result = wikidata.build_dict(hit, lat, lon) return {"elements": elements, "result": result} - elements = geocode.overpass.get_osm_elements(lat, lon) + elements = model.Polygon.coords_within(lat, lon) result = do_lookup(elements, lat, lon) # special case because the City of London is admin_level=6 in OSM - if result["wikidata"] == city_of_london_qid: + if result.get("wikidata") == city_of_london_qid: return {"elements": elements, "result": result} - admin_level = result["admin_level"] + admin_level = result.get("admin_level") if not admin_level or admin_level >= 7: return {"elements": elements, "result": result} - row = wdqs_geosearch(lat, lon) + row = wikidata.geosearch(lat, lon) if row: - hit = commons_from_rows([row]) + hit = wikidata.commons_from_rows([row]) elements = [] - result = build_dict(hit, lat, lon) + result = wikidata.build_dict(hit, lat, lon) return {"elements": elements, "result": result} - -def lookup_scottish_parish_in_wikidata(code): - query = 
render_template("sparql/scottish_parish.sparql", code=code) - return geocode.wikidata.wdqs(query) - - -def lookup_gss_in_wikidata(gss): - query = render_template("sparql/lookup_gss.sparql", gss=gss) - return geocode.wikidata.wdqs(query) - - -def lookup_wikidata_by_name(name, lat, lon): - query = render_template( - "sparql/lookup_by_name.sparql", name=repr(name), lat=str(lat), lon=str(lon) - ) - return geocode.wikidata.wdqs(query) - - -def unescape_title(t): - return urllib.parse.unquote(t.replace("_", " ")) - - -def commons_from_rows(rows): - for row in rows: - if "commonsCat" in row: - qid = wd_to_qid(row["item"]) - return {"wikidata": qid, "commons_cat": row["commonsCat"]["value"]} - if "commonsSiteLink" in row: - site_link = row["commonsSiteLink"]["value"] - qid = wd_to_qid(row["item"]) - cat = unescape_title(site_link[len(commons_cat_start) :]) - return {"wikidata": qid, "commons_cat": cat} - - -def get_commons_cat_from_gss(gss): - return commons_from_rows(lookup_gss_in_wikidata(gss)) - - def osm_lookup(elements, lat, lon): - elements.sort(key=lambda e: bounding_box_area(e)) - for e in elements: - if "tags" not in e: - continue - tags = e["tags"] + tags = e.tags admin_level_tag = tags.get("admin_level") admin_level = ( int(admin_level_tag) @@ -228,7 +81,7 @@ def osm_lookup(elements, lat, lon): continue if "wikidata" in tags: qid = tags["wikidata"] - commons = geocode.wikidata.qid_to_commons_category(qid) + commons = wikidata.qid_to_commons_category(qid) if commons: return { "wikidata": qid, @@ -237,7 +90,7 @@ def osm_lookup(elements, lat, lon): } gss = tags.get("ref:gss") if gss: - ret = get_commons_cat_from_gss(gss) + ret = wikidata.get_commons_cat_from_gss(gss) if ret: ret["admin_level"] = admin_level return ret @@ -247,22 +100,22 @@ def osm_lookup(elements, lat, lon): continue if name.endswith(" CP"): name = name[:-3] - rows = lookup_wikidata_by_name(name, lat, lon) + rows = wikidata.lookup_wikidata_by_name(name, lat, lon) if len(rows) == 1: - ret = 
commons_from_rows(rows) + ret = wikidata.commons_from_rows(rows) if ret: ret["admin_level"] = admin_level return ret - has_wikidata_tag = [e["tags"] for e in elements if "wikidata" in e["tags"]] + has_wikidata_tag = [e.tags for e in elements if e.tags.get("wikidata")] if len(has_wikidata_tag) != 1: return qid = has_wikidata_tag[0]["wikidata"] return { "wikidata": qid, - "commons_cat": geocode.qid_to_commons_category(qid), + "commons_cat": wikidata.qid_to_commons_category(qid), "admin_level": admin_level, } @@ -287,7 +140,7 @@ def index(): def random_location(): lat, lon = get_random_lat_lon() - elements = geocode.overpass.get_osm_elements(lat, lon) + elements = model.Polygon.coords_within(lat, lon) result = do_lookup(elements, lat, lon) return render_template( @@ -300,15 +153,15 @@ def wikidata_tag(): lat = float(request.args.get("lat")) lon = float(request.args.get("lon")) - scotland_code = get_scotland_code(lat, lon) + scotland_code = scotland.get_scotland_code(lat, lon) if scotland_code: - rows = lookup_scottish_parish_in_wikidata(scotland_code) - hit = commons_from_rows(rows) + rows = wikidata.lookup_scottish_parish_in_wikidata(scotland_code) + hit = wikidata.commons_from_rows(rows) elements = [] - result = build_dict(hit, lat, lon) + result = wikidata.build_dict(hit, lat, lon) else: - elements = geocode.overpass.get_osm_elements(lat, lon) + elements = model.Polygon.coords_within(lat, lon) result = do_lookup(elements, lat, lon) return render_template( @@ -322,7 +175,18 @@ def detail_page(): lat, lon = [float(request.args.get(param)) for param in ("lat", "lon")] except TypeError: return redirect(url_for("index")) - reply = lat_lon_to_wikidata(lat, lon) + try: + reply = lat_lon_to_wikidata(lat, lon) + except wikidata.QueryError as e: + query, r = e.args + return render_template( + "query_error.html", + lat=lat, + lon=lon, + query=query, + r=r + ) + return render_template("detail.html", lat=lat, lon=lon, **reply) diff --git a/templates/detail.html 
b/templates/detail.html index b372a5c..941d8bf 100644 --- a/templates/detail.html +++ b/templates/detail.html @@ -32,7 +32,7 @@ {% for element in elements %} {% set tags = element.tags %} -
{{ element | pprint }}
+ {{ element.tags | pprint }}
{% endfor %}
diff --git a/templates/query_error.html b/templates/query_error.html
new file mode 100644
index 0000000..9c83655
--- /dev/null
+++ b/templates/query_error.html
@@ -0,0 +1,28 @@
+
+
+
+
+ visit endpoint + +| view in OSM + +| # + +
+ +{{ query }}
+
+{{ r.text }}
+
+
+
diff --git a/templates/sparql/geosearch.sparql b/templates/sparql/geosearch.sparql
index 433e072..55ea42f 100644
--- a/templates/sparql/geosearch.sparql
+++ b/templates/sparql/geosearch.sparql
@@ -12,8 +12,9 @@ SELECT DISTINCT ?item ?distance ?itemLabel ?isa ?isaLabel ?commonsCat ?commonsSi
}
}
MINUS { ?item wdt:P582 ?endTime . }
+ MINUS { ?item wdt:P31 wd:Q1497375 . }
OPTIONAL { ?item wdt:P373 ?commonsCat. }
OPTIONAL { ?commonsSiteLink schema:about ?item;
schema:isPartOf visit endpoint + +| view in OSM + +{% if result.commons_cat %} +| Commons category +{% endif %} + +{% if result.wikidata %} + | {{ result.wikidata }} +{% endif %} + +| # + +
+ +{{ result | pprint }}
+
+{% if result.commons_cat %}
+({{ lat }}, {{ lon }}, {{result.commons_cat.title | pprint }}),
+{% endif %} + +| {{ tags.name }} | +{{ tags.admin_level }} | +{{ tags.boundary }} | +{{ tags.designation }} | +{{ '{:,.0f}'.format(e.area_in_sq_km) }} km² | +{{ 'wikidata' in tags }} | +