diff --git a/lookup.py b/lookup.py
index dc08a7b..1e98913 100755
--- a/lookup.py
+++ b/lookup.py
@@ -1,11 +1,16 @@
#!/usr/bin/python3
-from flask import Flask, render_template, request, jsonify
+from flask import Flask, render_template, request, jsonify, redirect, url_for
import requests
import os
import json
import urllib.parse
import random
+import simplejson
+import psycopg2
+from geopy.distance import distance
+
+# select gid, code, name from scotland where st_contains(geom, ST_Transform(ST_SetSRID(ST_MakePoint(-4.177, 55.7644), 4326), 27700));
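+# Assumption: `scotland` is a local PostGIS table of Scottish civil parish polygons,
+# roughly scotland(gid, code, name, geom) with geometries stored in EPSG:27700
+# (British National Grid), hence the ST_Transform from EPSG:4326 in the queries below.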
commons_cat_start = 'https://commons.wikimedia.org/wiki/Category:'
use_cache = False
@@ -23,6 +28,8 @@ headers = {
OVERPASS_URL = 'https://lz4.overpass-api.de'
wikidata_query_api_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
wikidata_url = 'https://www.wikidata.org/w/api.php'
+wd_entity = 'http://www.wikidata.org/entity/Q'
+city_of_london_qid = 'Q23311'
samples = [
(50.8326, -0.2689, 'Adur'),
@@ -82,24 +89,194 @@ def random_location():
return render_template('random.html', lat=lat, lon=lon, result=result, elements=elements)
-def do_lookup(elements, lat, lon):
- commons_cat = osm_lookup(elements)
+@app.route("/wikidata_tag")
+def wikidata_tag():
+ lat = float(request.args.get('lat'))
+ lon = float(request.args.get('lon'))
+
+ scotland_code = get_scotland_code(lat, lon)
+
+ if scotland_code:
+ rows = lookup_scottish_parish_in_wikidata(scotland_code)
+ hit = commons_from_rows(rows)
+ elements = []
+ result = build_dict(hit, lat, lon)
+ else:
+ elements = get_osm_elements(lat, lon)
+ result = do_lookup(elements, lat, lon)
+
+ return render_template('wikidata_tag.html', lat=lat, lon=lon, result=result, elements=elements)
+
+@app.route("/detail")
+def detail_page():
+ try:
+ lat, lon = [float(request.args.get(param)) for param in ('lat', 'lon')]
+    except (TypeError, ValueError):
+ return redirect(url_for('index'))
+ reply = lat_lon_to_wikidata(lat, lon)
+ return render_template('random.html', lat=lat, lon=lon, **reply)
+
+def bounding_box_area(element):
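+    """Return the approximate area of the element's bounding box in km² (used for sorting)."""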
+ bbox = element['bounds']
+
+    # east-west extent measured along the northern edge
+    x = distance((bbox['maxlat'], bbox['minlon']), (bbox['maxlat'], bbox['maxlon']))
+    # north-south extent measured along the western edge
+    y = distance((bbox['minlat'], bbox['minlon']), (bbox['maxlat'], bbox['minlon']))
+
+ return x.km * y.km
+
+def wd_to_qid(wd):
+    # expecting {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q30'}
+ if wd['type'] == 'uri':
+ return wd_uri_to_qid(wd['value'])
+
+def wd_uri_to_qid(value):
+ assert value.startswith(wd_entity)
+    # "- 1" keeps the leading 'Q': 'http://www.wikidata.org/entity/Q30' -> 'Q30'
+    return value[len(wd_entity) - 1:]
+
+def build_dict(hit, lat, lon):
coords = {'lat': lat, 'lon': lon}
- if commons_cat is None:
+ if hit is None:
return dict(commons_cat=None, missing=True, coords=coords)
+ commons_cat = hit['commons_cat']
url = commons_cat_start + urllib.parse.quote(commons_cat.replace(' ', '_'))
- return dict(commons_cat={'title': commons_cat, 'url': url}, coords=coords)
+ return dict(commons_cat={'title': commons_cat, 'url': url},
+ coords=coords,
+ admin_level=hit.get('admin_level'),
+ wikidata=hit['wikidata'])
+
+
+def do_lookup(elements, lat, lon):
+ try:
+ hit = osm_lookup(elements, lat, lon)
+ except QueryError as e:
+ return {
+ 'query': e.query,
+ 'error': e.r.text,
+ 'query_url': 'https://query.wikidata.org/#' + e.query,
+ }
+
+ return build_dict(hit, lat, lon)
+
+def get_scotland_code(lat, lon):
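+    """Return the code of the Scottish parish containing (lat, lon), or None
+    if the point is not inside (or within 100m of) any parish polygon."""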
+ conn = psycopg2.connect(dbname='geocode', user='geocode', password='ooK3ohgh', host='localhost')
+ cur = conn.cursor()
+
+    # lat/lon are passed as bound parameters rather than interpolated into the SQL
+    point = 'ST_Transform(ST_SetSRID(ST_MakePoint(%s, %s), 4326), 27700)'
+    cur.execute(f'select code, name from scotland where st_contains(geom, {point});',
+                (float(lon), float(lat)))
+    row = cur.fetchone()
+
+    # no exact match: widen the search to within 100m of the point
+    if not row:
+        cur.execute(f'select code, name from scotland where ST_DWithin(geom, {point}, 100);',
+                    (float(lon), float(lat)))
+        row = cur.fetchone()
+
+ conn.close()
+ if row:
+ return row[0]
+
+def wdqs_geosearch_query(lat, lon):
+ if isinstance(lat, float):
+ lat = f'{lat:f}'
+ if isinstance(lon, float):
+ lon = f'{lon:f}'
+
+ query_template = '''
+
+SELECT DISTINCT ?item ?distance ?itemLabel ?isa ?isaLabel ?commonsCat ?commonsSiteLink WHERE {
+ {
+ SELECT DISTINCT ?item ?location ?distance ?isa WHERE {
+ ?item wdt:P31/wdt:P279* wd:Q486972.
+ ?item wdt:P31 ?isa .
+ SERVICE wikibase:around {
+ ?item wdt:P625 ?location.
+ bd:serviceParam wikibase:center "Point(LON LAT)"^^geo:wktLiteral;
+ wikibase:radius 5;
+ wikibase:distance ?distance.
+ }
+ }
+ }
+ MINUS { ?item wdt:P582 ?endTime . }
+ OPTIONAL { ?item wdt:P373 ?commonsCat. }
+ OPTIONAL { ?commonsSiteLink schema:about ?item;
+             schema:isPartOf <https://commons.wikimedia.org/> . }
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
+} ORDER BY (?distance)'''
+
+ query = query_template.replace('LAT', lat).replace('LON', lon)
+ reply = wdqs(query)
+ return reply['results']['bindings']
+
+def wdqs_geosearch(lat, lon):
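+    """Pick a nearby Wikidata place with a Commons category or sitelink,
+    limiting the search radius by place type (larger radius for towns and cities)."""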
+ default_max_dist = 1
+ rows = wdqs_geosearch_query(lat, lon)
+ max_dist = {
+ 'Q188509': 1, # suburb
+ 'Q3957': 2, # town
+ 'Q532': 1, # village
+ 'Q5084': 1, # hamlet
+ 'Q515': 2, # city
+ 'Q1549591': 3, # big city
+ }
+ for row in rows:
+ isa = wd_uri_to_qid(row['isa']['value'])
+
+ if ('commonsCat' not in row and 'commonsSiteLink' not in row and isa not in max_dist):
+ continue
+
+ distance = float(row['distance']['value'])
+ if distance > max_dist.get(isa, default_max_dist):
+ continue
+
+ if 'commonsCat' not in row and 'commonsSiteLink' not in row:
+ break
+
+ return row
+
+def lat_lon_to_wikidata(lat, lon):
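+    """Resolve (lat, lon) to a Wikidata item and Commons category.
+
+    Scottish parishes are checked first via the local database, then OSM
+    boundary elements; for large admin areas (admin_level < 7, except the
+    City of London) a Wikidata geosearch is tried to find a more specific place.
+    """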
+ scotland_code = get_scotland_code(lat, lon)
+
+ if scotland_code:
+ rows = lookup_scottish_parish_in_wikidata(scotland_code)
+ hit = commons_from_rows(rows)
+ elements = []
+ result = build_dict(hit, lat, lon)
+
+ return {'elements': elements, 'result': result}
+
+ elements = get_osm_elements(lat, lon)
+ result = do_lookup(elements, lat, lon)
+
+ # special case because the City of London is admin_level=6 in OSM
+    if result.get('wikidata') == city_of_london_qid:
+ return {'elements': elements, 'result': result}
+
+    admin_level = result.get('admin_level')
+
+ if not admin_level or admin_level >= 7:
+ return {'elements': elements, 'result': result}
+
+ row = wdqs_geosearch(lat, lon)
+ if row:
+ hit = commons_from_rows([row])
+ elements = []
+ result = build_dict(hit, lat, lon)
+
+ return {'elements': elements, 'result': result}
@app.route("/")
def index():
+ q = request.args.get('q')
+    if q and ',' in q:
+        lat, lon = [v.strip() for v in q.split(',', 1)]
+ return redirect(url_for('detail_page', lat=lat, lon=lon))
+
lat = request.args.get('lat')
lon = request.args.get('lon')
if lat is None or lon is None:
+ samples.sort(key=lambda row: row[2])
return render_template('index.html', samples=samples)
- elements = get_osm_elements(lat, lon)
- ret = do_lookup(elements, lat, lon)
- return jsonify(ret)
+    return jsonify(lat_lon_to_wikidata(float(lat), float(lon))['result'])
def wikidata_api_call(params):
call_params = {
@@ -160,30 +337,72 @@ out bb tags qt;'''
return run_query(oql)
-def lookup_gss_in_wikidata(gss):
+def lookup_scottish_parish_in_wikidata(code):
query = '''
SELECT ?item ?itemLabel ?commonsSiteLink ?commonsCat WHERE {
- ?item wdt:P836 "GSS" .
+ ?item wdt:P528 "CODE" .
+ ?item wdt:P31 wd:Q5124673 .
OPTIONAL { ?commonsSiteLink schema:about ?item ;
               schema:isPartOf <https://commons.wikimedia.org/> }
OPTIONAL { ?item wdt:P373 ?commonsCat }
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
-'''.replace('GSS', gss)
+'''.replace('CODE', code)
+ reply = wdqs(query)
+ return reply['results']['bindings']
+
+def lookup_gss_in_wikidata(gss):
+ query = '''
+SELECT ?item ?itemLabel ?commonsSiteLink ?commonsCat WHERE {
+ ?item wdt:P836 GSS .
+ OPTIONAL { ?commonsSiteLink schema:about ?item ;
+               schema:isPartOf <https://commons.wikimedia.org/> }
+ OPTIONAL { ?item wdt:P373 ?commonsCat }
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
+}
+'''.replace('GSS', repr(gss))
+ reply = wdqs(query)
+ return reply['results']['bindings']
+
+def lookup_wikidata_by_name(name, lat, lon):
+ query = '''
+SELECT DISTINCT ?item ?itemLabel ?commonsSiteLink ?commonsCat WHERE {
+ ?item rdfs:label LABEL@en .
+    FILTER NOT EXISTS { ?item wdt:P31 wd:Q17362920 } . # ignore Wikimedia duplicated page
+ OPTIONAL { ?commonsSiteLink schema:about ?item ;
+               schema:isPartOf <https://commons.wikimedia.org/> }
+ OPTIONAL { ?item wdt:P373 ?commonsCat }
+ ?item wdt:P625 ?coords .
+
+ FILTER(geof:distance(?coords, "Point(LON LAT)"^^geo:wktLiteral) < 10)
+    FILTER(BOUND(?commonsCat) || BOUND(?commonsSiteLink))
+
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
+}
+'''.replace('LAT', str(lat)).replace('LON', str(lon)).replace('LABEL', repr(name))
+
reply = wdqs(query)
return reply['results']['bindings']
def unescape_title(t):
return urllib.parse.unquote(t.replace('_', ' '))
-def get_commons_cat_from_gss(gss):
- rows = lookup_gss_in_wikidata(gss)
+def commons_from_rows(rows):
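+    """Return {'wikidata': ..., 'commons_cat': ...} from the first WDQS row
+    with a Commons category or Commons sitelink, or None if there is none."""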
for row in rows:
if 'commonsCat' in row:
- return row['commonsCat']['value']
+ qid = wd_to_qid(row['item'])
+ return {'wikidata': qid,
+ 'commons_cat': row['commonsCat']['value']}
if 'commonsSiteLink' in row:
site_link = row['commonsSiteLink']['value']
- return unescape_title(site_link[len(commons_cat_start):])
+ qid = wd_to_qid(row['item'])
+ cat = unescape_title(site_link[len(commons_cat_start):])
+ return {'wikidata': qid, 'commons_cat': cat}
+
+def get_commons_cat_from_gss(gss):
+ print('GSS:', gss)
+ rows = lookup_gss_in_wikidata(gss)
+ return commons_from_rows(rows)
def get_osm_elements(lat, lon):
filename = f'cache/{lat}_{lon}.json'
@@ -198,24 +417,74 @@ def get_osm_elements(lat, lon):
return elements
-def osm_lookup(elements):
+def osm_lookup(elements, lat, lon):
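+    """Resolve OSM elements to a Wikidata QID and Commons category, smallest
+    boundary first, trying wikidata tags, then GSS codes, then name matches;
+    falls back to a single wikidata-tagged element if nothing else matched."""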
is_in = []
+    elements.sort(key=bounding_box_area)
+
+ if False:
+ for e in sorted(elements, key=lambda e: e['area']):
+ try:
+ admin_level = int(e['tags']['admin_level'])
+ except (ValueError, KeyError):
+ admin_level = None
+
+ if admin_level is None:
+ if e['id'] == 6038068: # Great Britain
+ continue
+ if 'place' in e['tags'] or e['tags'].get('type') == 'boundary':
+ is_in.append((99, e['tags']))
+ continue
+
+ is_in.append((admin_level, e['tags']))
+
+ # for _, tags in sorted(is_in, key=lambda i: i[0], reverse=True):
for e in elements:
- try:
- admin_level = int(e['tags']['admin_level'])
- except (ValueError, KeyError):
+ if 'tags' not in e:
+ continue
+ tags = e['tags']
+ admin_level_tag = tags.get('admin_level')
+ admin_level = int(admin_level_tag) if admin_level_tag and admin_level_tag.isdigit() else None
+ if not admin_level and tags.get('boundary') != 'political':
continue
-
- is_in.append((admin_level, e['tags']))
-
- for _, tags in sorted(is_in, key=lambda i: i[0], reverse=True):
if 'wikidata' in tags:
qid = tags['wikidata']
- return qid_to_commons_category(qid)
+ commons = qid_to_commons_category(qid)
+ if commons:
+ return {
+ 'wikidata': qid,
+ 'commons_cat': commons,
+ 'admin_level': admin_level,
+ }
gss = tags.get('ref:gss')
- if not gss:
+ if gss:
+ ret = get_commons_cat_from_gss(gss)
+ if ret:
+ ret['admin_level'] = admin_level
+ return ret
+
+ name = tags.get('name')
+ if not name:
continue
- return get_commons_cat_from_gss(gss)
+ if name.endswith(' CP'):
+ name = name[:-3]
+ rows = lookup_wikidata_by_name(name, lat, lon)
+
+ if len(rows) == 1:
+ ret = commons_from_rows(rows)
+ if ret:
+ ret['admin_level'] = admin_level
+ return ret
+
+    has_wikidata_tag = [e['tags'] for e in elements if 'wikidata' in e.get('tags', {})]
+ if len(has_wikidata_tag) != 1:
+ return
+
+    tags = has_wikidata_tag[0]
+    qid = tags['wikidata']
+    admin_level_tag = tags.get('admin_level')
+    return {
+        'wikidata': qid,
+        'commons_cat': qid_to_commons_category(qid),
+        'admin_level': int(admin_level_tag) if admin_level_tag and admin_level_tag.isdigit() else None,
+    }
if __name__ == '__main__':