From d4625fafa1781e7001aaa84c0d966ef79ad96642 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 18 Dec 2021 15:43:58 +0000 Subject: [PATCH 01/30] Update some versions --- templates/map.html | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/templates/map.html b/templates/map.html index 5e03c0f..39fcc3d 100644 --- a/templates/map.html +++ b/templates/map.html @@ -4,9 +4,9 @@ Wikidata items linked to OSM - + - + {% from "navbar.html" import navbar with context %} @@ -14,16 +14,18 @@ {% block nav %}{{ navbar() }}{% endblock %}
- + From 82b0405ab75ca5bd0cdcbb041b41092d4b26fa64 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Fri, 8 Apr 2022 10:46:59 +0100 Subject: [PATCH 12/30] Add missing templates --- templates/flash_msg.html | 12 ++++++++++++ templates/show_error.html | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 templates/flash_msg.html create mode 100644 templates/show_error.html diff --git a/templates/flash_msg.html b/templates/flash_msg.html new file mode 100644 index 0000000..76038fa --- /dev/null +++ b/templates/flash_msg.html @@ -0,0 +1,12 @@ +{% with messages = get_flashed_messages() %} + {% if messages %} + {% for message in messages %} + + {% endfor %} + {% endif %} +{% endwith %} diff --git a/templates/show_error.html b/templates/show_error.html new file mode 100644 index 0000000..13447e7 --- /dev/null +++ b/templates/show_error.html @@ -0,0 +1,36 @@ +{% extends "base.html" %} + +{% block style %} + +{% endblock %} + +{% block content %} + +
+
+
+ +

Software error: {{ tb.exception_type }}

+
+
{{ tb.exception }}
+
+ +{% set body %} +URL: {{ request.url }} + +{{ tb.plaintext | safe }} +{% endset %} + +

Submit as an issue on GitHub (requires an account with GitHub)

+ +

Traceback (most recent call last)

+{{ tb.render_summary(include_title=False) | safe }} + +

Error in function "{{ tb.frames[-1].function_name }}": {{ last_frame_args | pprint }}

+
{{ last_frame.locals | pprint }}
+ +
+
+
+ +{% endblock %} From a81cedaae0db131d3766229528f2f42478dc37f3 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 17 Apr 2022 17:06:46 +0100 Subject: [PATCH 13/30] Sometimes the address->country field is missing --- matcher/nominatim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matcher/nominatim.py b/matcher/nominatim.py index 5e70793..cfee85f 100644 --- a/matcher/nominatim.py +++ b/matcher/nominatim.py @@ -83,8 +83,8 @@ def get_hit_name(hit): if len(address) == 1: return n1 - country = address.pop("country") - country_code = address.pop("country_code") + country = address.pop("country", None) + country_code = address.pop("country_code", None) if country_code: country_code == country_code.lower() From 87005dea18c4288e7051b1048c88c92d01b67132 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 18 Apr 2022 12:24:16 +0100 Subject: [PATCH 14/30] New API call to get items in a place --- matcher/api.py | 18 ++++++++++++++++++ web_view.py | 9 +++++++++ 2 files changed, 27 insertions(+) diff --git a/matcher/api.py b/matcher/api.py index a2461a4..5de289e 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -914,3 +914,21 @@ def isa_incremental_search(search_terms): } ret.append(cur) return ret + +def get_place_items(osm_type, osm_id): + src_id = osm_id * {'way': 1, 'relation': -1}[osm_type] + + q = (model.Item.query + .join(model.ItemLocation) + .join(model.Polygon, func.ST_Covers(model.Polygon.way, model.ItemLocation.location)) + .filter(model.Polygon.src_id == src_id)) + # sql = q.statement.compile(compile_kwargs={"literal_binds": True}) + + item_count = q.count() + items = [] + for item in q: + keys = ["item_id", "labels", "descriptions", "aliases", "sitelinks", "claims"] + item_dict = {key: getattr(item, key) for key in keys} + items.append(item_dict) + + return {"count": item_count, "items": items} diff --git a/web_view.py b/web_view.py index c8032ad..cadd0ab 100755 --- a/web_view.py +++ b/web_view.py @@ -450,6 +450,15 @@ def 
api_wikidata_items(): t1 = time() - t0 return cors_jsonify(success=True, duration=t1, **ret) +@app.route("/api/1/place//") +def api_place_items(osm_type, osm_id): + t0 = time() + + ret = api.get_place_items(osm_type, osm_id) + + t1 = time() - t0 + return cors_jsonify(success=True, duration=t1, **ret) + @app.route("/api/1/osm") def api_osm_objects(): From 302da85e33b8e6ce1e54606916d3323667512b5c Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 18 Apr 2022 12:25:38 +0100 Subject: [PATCH 15/30] Optional isa_filter for wikidata_isa_counts --- matcher/api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/matcher/api.py b/matcher/api.py index 5de289e..8b6aee5 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -418,7 +418,7 @@ def wikidata_items_count(bounds, isa_filter=None): return q.count() -def wikidata_isa_counts(bounds): +def wikidata_isa_counts(bounds, isa_filter=None): db_bbox = make_envelope(bounds) q = ( @@ -426,6 +426,9 @@ def wikidata_isa_counts(bounds): .filter(func.ST_Covers(db_bbox, model.ItemLocation.location)) ) + if isa_filter: + q = add_isa_filter(q, isa_filter) + db_items = q.all() counts = get_isa_count(db_items) From 46db0b7401dda0b3b4e259ba6e5f409ad0b11725 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 18 Apr 2022 12:25:50 +0100 Subject: [PATCH 16/30] Bug fix --- matcher/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matcher/api.py b/matcher/api.py index 8b6aee5..203ccd3 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -825,7 +825,7 @@ def item_detail(item): unsupported = isa_lookup.keys() & unsupported_relation_types if unsupported: d["unsupported_relation_types"] = [isa for isa in d["isa_list"] - if isa.qid in isa_lookup] + if isa["qid"] in isa_lookup] return d From 74788872a361c850a055689c5da28125cdec63fd Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 18 Apr 2022 12:26:04 +0100 Subject: [PATCH 17/30] Add docstrings --- matcher/api.py | 26 +++++++++++++++++++++++++- 1 file 
changed, 25 insertions(+), 1 deletion(-) diff --git a/matcher/api.py b/matcher/api.py index 203ccd3..f99228c 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -38,6 +38,14 @@ skip_tags = { } def get_country_iso3166_1(lat, lon): + """ + For a given lat/lon return a set of ISO country codes. + + Also cache the country code in the global object. + + Normally there should be only one country. + """ + point = func.ST_SetSRID(func.ST_MakePoint(lon, lat), srid) alpha2_codes = set() q = model.Polygon.query.filter(func.ST_Covers(model.Polygon.way, point), @@ -57,7 +65,18 @@ def is_street_number_first(lat, lon): return True alpha2 = get_country_iso3166_1(lat, lon) - alpha2_number_first = {'GB', 'IE', 'US', 'MX', 'CA', 'FR', 'AU', 'NZ', 'ZA'} + # Incomplete list of countries that put street number first. + alpha2_number_first = { + 'GB', # United Kingdom + 'IE', # Ireland + 'US', # United States + 'MX', # Mexico + 'CA', # Canada + 'FR', # France + 'AU', # Australia + 'NZ', # New Zealand + 'ZA', # South Africa + } return bool(alpha2_number_first & alpha2) @@ -92,6 +111,7 @@ def make_envelope_around_point(lat, lon, distance): return func.ST_MakeEnvelope(west, south, east, north, srid) def drop_way_area(tags): + """ Remove the way_area field from a tags dict. """ if "way_area" in tags: del tags["way_area"] return tags @@ -122,6 +142,8 @@ def get_part_of(table_name, src_id, bbox): } for osm_id, tags, area in conn.execute(s)] def get_and_save_item(qid): + """ Download an item from Wikidata and cache it in the database. """ + entity = wikidata_api.get_entity(qid) entity_qid = entity["id"] if entity_qid != qid: @@ -739,6 +761,8 @@ def find_osm_candidates(item, limit=80, max_distance=450, names=None): return nearby def get_item(item_id): + """ Retrieve a Wikidata item, either from the database or from Wikidata. 
""" + item = model.Item.query.get(item_id) return item or get_and_save_item(f"Q{item_id}") From c95b58fde9d961aa467890ce9828b83285a0fbfa Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 18 Apr 2022 12:51:02 +0100 Subject: [PATCH 18/30] Reformat with black --- matcher/model.py | 193 ++++++++++++++++++++++++++--------------------- 1 file changed, 106 insertions(+), 87 deletions(-) diff --git a/matcher/model.py b/matcher/model.py index f039a59..ade00d5 100644 --- a/matcher/model.py +++ b/matcher/model.py @@ -20,11 +20,12 @@ import re Base = declarative_base() Base.query = session.query_property() -re_point = re.compile(r'^POINT\((.+) (.+)\)$') +re_point = re.compile(r"^POINT\((.+) (.+)\)$") + +osm_type_enum = postgresql.ENUM( + "node", "way", "relation", name="osm_type_enum", metadata=Base.metadata +) -osm_type_enum = postgresql.ENUM('node', 'way', 'relation', - name='osm_type_enum', - metadata=Base.metadata) class Item(Base): __tablename__ = "item" @@ -35,7 +36,9 @@ class Item(Base): sitelinks = Column(postgresql.JSONB) claims = Column(postgresql.JSONB) lastrevid = Column(Integer, nullable=False, unique=True) - locations = relationship("ItemLocation", cascade="all, delete-orphan", backref="item") + locations = relationship( + "ItemLocation", cascade="all, delete-orphan", backref="item" + ) qid = column_property("Q" + cast(item_id, String)) wiki_extracts = relationship( @@ -56,44 +59,48 @@ class Item(Base): return f"https://www.wikidata.org/wiki/{self.qid}" def get_claim(self, pid): - return [i["mainsnak"]["datavalue"]["value"] if "datavalue" in i["mainsnak"] else None - for i in self.claims.get(pid, [])] + return [ + i["mainsnak"]["datavalue"]["value"] + if "datavalue" in i["mainsnak"] + else None + for i in self.claims.get(pid, []) + ] - def label(self, lang='en'): + def label(self, lang="en"): if lang in self.labels: - return self.labels[lang]['value'] - elif 'en' in self.labels: - return self.labels['en']['value'] + return self.labels[lang]["value"] + elif 
"en" in self.labels: + return self.labels["en"]["value"] label_list = list(self.labels.values()) - return label_list[0]['value'] if label_list else '[no label]' + return label_list[0]["value"] if label_list else "[no label]" - def description(self, lang='en'): + def description(self, lang="en"): if lang in self.descriptions: - return self.descriptions[lang]['value'] - elif 'en' in self.descriptions: - return self.descriptions['en']['value'] + return self.descriptions[lang]["value"] + elif "en" in self.descriptions: + return self.descriptions["en"]["value"] return d_list = list(self.descriptions.values()) if d_list: - return d_list[0]['value'] + return d_list[0]["value"] - def get_aliases(self, lang='en'): + def get_aliases(self, lang="en"): if lang not in self.aliases: - if 'en' not in self.aliases: + if "en" not in self.aliases: return [] - lang = 'en' - return [a['value'] for a in self.aliases[lang]] + lang = "en" + return [a["value"] for a in self.aliases[lang]] def get_part_of_names(self): if not self.claims: return set() part_of_names = set() - for p361 in self.claims.get('P361', []): + for p361 in self.claims.get("P361", []): try: - part_of_id = p361['mainsnak']['datavalue']['value']['numeric-id'] + part_of_id = p361["mainsnak"]["datavalue"]["value"]["numeric-id"] except KeyError: continue if part_of_id == self.item_id: @@ -108,7 +115,7 @@ class Item(Base): @property def entity(self): - keys = ['labels', 'aliases', 'descriptions', 'sitelinks', 'claims'] + keys = ["labels", "aliases", "descriptions", "sitelinks", "claims"] return {key: getattr(self, key) for key in keys} def names(self, check_part_of=True): @@ -117,22 +124,24 @@ class Item(Base): d = wikidata.names_from_entity(self.entity) or defaultdict(list) for name, sources in list(d.items()): - if len(sources) == 1 and sources[0][0] == 'image': + if len(sources) == 1 and sources[0][0] == "image": continue for part_of_name in part_of_names: if not name.startswith(part_of_name): continue - prefix_removed = 
name[len(part_of_name):].strip() + prefix_removed = name[len(part_of_name) :].strip() if prefix_removed not in d: d[prefix_removed] = sources if self.claims: - for p6375 in self.claims.get('P6375', []): + for p6375 in self.claims.get("P6375", []): try: - street_address = p6375['mainsnak']['datavalue']['value'] + street_address = p6375["mainsnak"]["datavalue"]["value"] except KeyError: continue - d[street_address['text']].append(('P6375', street_address.get('language'))) + d[street_address["text"]].append( + ("P6375", street_address.get("language")) + ) # A terrace of buildings can be illustrated with a photo of a single building. # We try to determine if this is the case and avoid using the filename of the @@ -141,8 +150,11 @@ class Item(Base): def has_digit(s): return any(c.isdigit() for c in s) - image_names = {name for name, sources in d.items() - if len(sources) == 1 and sources[0][0] == 'image' and has_digit(name)} + image_names = { + name + for name, sources in d.items() + if len(sources) == 1 and sources[0][0] == "image" and has_digit(name) + } if not image_names: return dict(d) or None @@ -176,10 +188,10 @@ class Item(Base): isa_qids = self.get_isa_qids() matching_types = { - "Q12731", # dead end street - "Q34442", # road - "Q79007", # street - "Q83620", # thoroughfare + "Q12731", # dead end street + "Q34442", # road + "Q79007", # street + "Q83620", # thoroughfare "Q21000333", # shopping street "Q62685721", # pedestrian street } @@ -189,14 +201,13 @@ class Item(Base): if isa_qids is None: isa_qids = self.get_isa_qids() matching_types = { - "Q355304", # watercourse - "Q4022", # river - "Q47521", # stream - "Q1437299", # creek + "Q355304", # watercourse + "Q4022", # river + "Q47521", # stream + "Q1437299", # creek "Q63565252", # brook - "Q12284", # canal + "Q12284", # canal "Q55659167", # natural watercourse - } return bool(matching_types & set(isa_qids)) @@ -205,11 +216,16 @@ class Item(Base): return self.is_street(isa_qids) or self.is_watercourse(isa_qids) 
def is_tram_stop(self): - return 'Q2175765' in self.get_isa_qids() + return "Q2175765" in self.get_isa_qids() def alert_admin_about_bad_time(self, v): - body = ("Wikidata item has an unsupported time precision\n\n" - + self.wd_url + "\n\n" + "Value:\n\n" + json.dumps(v, indent=2)) + body = ( + "Wikidata item has an unsupported time precision\n\n" + + self.wd_url + + "\n\n" + + "Value:\n\n" + + json.dumps(v, indent=2) + ) mail.send_mail(f"OWL Map: bad time value in {self.qid}", body) def time_claim(self, pid): @@ -271,6 +287,7 @@ class Item(Base): return text[: first_end_p_tag + len(close_tag)] + # class Claim(Base): # __tablename__ = "claim" # item_id = Column(Integer, primary_key=True) @@ -278,13 +295,14 @@ class Item(Base): # position = Column(Integer, primary_key=True) # mainsnak = Column(postgresql.JSONB) -class ItemIsA(Base): - __tablename__ = 'item_isa' - item_id = Column(Integer, ForeignKey('item.item_id'), primary_key=True) - isa_id = Column(Integer, ForeignKey('item.item_id'), primary_key=True) - item = relationship('Item', foreign_keys=[item_id]) - isa = relationship('Item', foreign_keys=[isa_id]) +class ItemIsA(Base): + __tablename__ = "item_isa" + item_id = Column(Integer, ForeignKey("item.item_id"), primary_key=True) + isa_id = Column(Integer, ForeignKey("item.item_id"), primary_key=True) + + item = relationship("Item", foreign_keys=[item_id]) + isa = relationship("Item", foreign_keys=[isa_id]) class ItemLocation(Base): @@ -298,17 +316,17 @@ class ItemLocation(Base): pid = column_property("P" + cast(item_id, String)) def get_lat_lon(self): - return session.query(func.ST_Y(self.location), - func.ST_X(self.location)).one() + return session.query(func.ST_Y(self.location), func.ST_X(self.location)).one() + def location_objects(coords): locations = [] for pid, coord_list in coords.items(): for num, coords in enumerate(coord_list): point = f"POINT({coords['longitude']} {coords['latitude']})" - loc = ItemLocation(property_id=int(pid[1:]), - 
statement_order=num, - location=point) + loc = ItemLocation( + property_id=int(pid[1:]), statement_order=num, location=point + ) locations.append(loc) return locations @@ -338,8 +356,7 @@ class MapMixin: @declared_attr def geojson_str(cls): return column_property( - func.ST_AsGeoJSON(cls.way, maxdecimaldigits=6), - deferred=True + func.ST_AsGeoJSON(cls.way, maxdecimaldigits=6), deferred=True ) @declared_attr @@ -348,17 +365,16 @@ class MapMixin: @hybrid_property def has_street_address(self): - return ("addr:housenumber" in self.tags - and "addr:street" in self.tags) + return "addr:housenumber" in self.tags and "addr:street" in self.tags def display_name(self): - for key in 'bridge:name', 'tunnel:name', 'lock_name': + for key in "bridge:name", "tunnel:name", "lock_name": if key in self.tags: return self.tags[key] - return (self.name - or self.tags.get("addr:housename") - or self.tags.get("inscription")) + return ( + self.name or self.tags.get("addr:housename") or self.tags.get("inscription") + ) def geojson(self): return json.loads(self.geojson_str) @@ -399,7 +415,7 @@ class Line(MapMixin, Base): @classmethod def get_osm(cls, osm_type, osm_id): - src_id = osm_id * {'way': 1, 'relation': -1}[osm_type] + src_id = osm_id * {"way": 1, "relation": -1}[osm_type] return cls.query.get(src_id) @@ -408,7 +424,7 @@ class Polygon(MapMixin, Base): @classmethod def get_osm(cls, osm_type, osm_id): - src_id = osm_id * {'way': 1, 'relation': -1}[osm_type] + src_id = osm_id * {"way": 1, "relation": -1}[osm_type] return cls.query.get(src_id) @property @@ -425,7 +441,7 @@ class Polygon(MapMixin, Base): class User(Base, UserMixin): - __tablename__ = 'user' + __tablename__ = "user" id = Column(Integer, primary_key=True) username = Column(String) password = Column(String) @@ -451,20 +467,21 @@ class User(Base, UserMixin): def is_active(self): return self.active + class EditSession(Base): - __tablename__ = 'edit_session' + __tablename__ = "edit_session" id = Column(Integer, 
primary_key=True) user_id = Column(Integer, ForeignKey(User.id)) created = Column(DateTime, default=now_utc(), nullable=False) edit_list = Column(postgresql.JSONB) comment = Column(String) - user = relationship('User') - changeset = relationship('Changeset', back_populates='edit_session', uselist=False) + user = relationship("User") + changeset = relationship("Changeset", back_populates="edit_session", uselist=False) class Changeset(Base): - __tablename__ = 'changeset' + __tablename__ = "changeset" id = Column(BigInteger, primary_key=True) created = Column(DateTime) comment = Column(String) @@ -472,41 +489,43 @@ class Changeset(Base): update_count = Column(Integer, nullable=False) edit_session_id = Column(Integer, ForeignKey(EditSession.id)) - user = relationship('User', - backref=backref('changesets', - lazy='dynamic', - order_by='Changeset.created.desc()')) + user = relationship( + "User", + backref=backref( + "changesets", lazy="dynamic", order_by="Changeset.created.desc()" + ), + ) - edit_session = relationship('EditSession', back_populates='changeset') + edit_session = relationship("EditSession", back_populates="changeset") class ChangesetEdit(Base): - __tablename__ = 'changeset_edit' + __tablename__ = "changeset_edit" - changeset_id = Column(BigInteger, - ForeignKey('changeset.id'), - primary_key=True) + changeset_id = Column(BigInteger, ForeignKey("changeset.id"), primary_key=True) item_id = Column(Integer, primary_key=True) osm_id = Column(BigInteger, primary_key=True) osm_type = Column(osm_type_enum, primary_key=True) saved = Column(DateTime, default=now_utc(), nullable=False) - changeset = relationship('Changeset', - backref=backref('edits', lazy='dynamic')) + changeset = relationship("Changeset", backref=backref("edits", lazy="dynamic")) + class SkipIsA(Base): - __tablename__ = 'skip_isa' - item_id = Column(Integer, ForeignKey('item.item_id'), primary_key=True) + __tablename__ = "skip_isa" + item_id = Column(Integer, ForeignKey("item.item_id"), 
primary_key=True) + + item = relationship("Item") - item = relationship('Item') class ItemExtraKeys(Base): - __tablename__ = 'item_extra_keys' - item_id = Column(Integer, ForeignKey('item.item_id'), primary_key=True) + __tablename__ = "item_extra_keys" + item_id = Column(Integer, ForeignKey("item.item_id"), primary_key=True) tag_or_key = Column(String, primary_key=True) note = Column(String) - item = relationship('Item') + item = relationship("Item") + class Extract(Base): __tablename__ = "extract" From 90b83e93ad7bc2ae43ae52fbf20cfb5e1aa57964 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 18 Apr 2022 12:51:44 +0100 Subject: [PATCH 19/30] Add methods to get item identifiers --- matcher/model.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/matcher/model.py b/matcher/model.py index ade00d5..7ee02f4 100644 --- a/matcher/model.py +++ b/matcher/model.py @@ -26,6 +26,43 @@ osm_type_enum = postgresql.ENUM( "node", "way", "relation", name="osm_type_enum", metadata=Base.metadata ) +re_lau_code = re.compile(r"^[A-Z]{2}([^A-Z].+)$") # 'LAU (local administrative unit)' + +property_map = [ + ("P238", ["iata"], "IATA airport code"), + ("P239", ["icao"], "ICAO airport code"), + ("P240", ["faa", "ref"], "FAA airport code"), + ("P296", ["ref", "ref:train", "railway:ref"], "station code"), + ("P300", ["ISO3166-2"], "ISO 3166-2 code"), + ("P359", ["ref:rce"], "Rijksmonument ID"), + ("P590", ["ref:gnis", "GNISID", "gnis:id", "gnis:feature_id"], "USGS GNIS ID"), + ("P649", ["ref:nrhp"], "NRHP reference number"), + ("P722", ["uic_ref"], "UIC station code"), + ("P782", ["ref"], "LAU (local administrative unit)"), + ("P836", ["ref:gss"], "UK Government Statistical Service code"), + ("P856", ["website", "contact:website", "url"], "website"), + ("P882", ["nist:fips_code"], "FIPS 6-4 (US counties)"), + ("P901", ["ref:fips"], "FIPS 10-4 (countries and regions)"), + # A UIC id can be a IBNR, but not every IBNR is an UIC id + 
("P954", ["uic_ref"], "IBNR ID"), + ("P981", ["ref:woonplaatscode"], "BAG code for Dutch residencies"), + ("P1216", ["HE_ref"], "National Heritage List for England number"), + ("P2253", ["ref:edubase"], "EDUBase URN"), + ("P2815", ["esr:user", "ref", "ref:train"], "ESR station code"), + ("P3425", ["ref", "ref:SIC"], "Natura 2000 site ID"), + ("P3562", ["seamark:light:reference"], "Admiralty number"), + ( + "P4755", + ["ref", "ref:train", "ref:crs", "crs", "nat_ref"], + "UK railway station code", + ), + ("P4803", ["ref", "ref:train"], "Amtrak station code"), + ("P6082", ["nycdoitt:bin"], "NYC Building Identification Number"), + ("P5086", ["ref"], "FIPS 5-2 alpha code (US states)"), + ("P5087", ["ref:fips"], "FIPS 5-2 numeric code (US states)"), + ("P5208", ["ref:bag"], "BAG building ID for Dutch buildings"), +] + class Item(Base): __tablename__ = "item" @@ -287,6 +324,42 @@ class Item(Base): return text[: first_end_p_tag + len(close_tag)] + def get_identifiers_tags(self): + tags = defaultdict(list) + for claim, osm_keys, label in property_map: + values = [ + i["mainsnak"]["datavalue"]["value"] + for i in self.claims.get(claim, []) + if "datavalue" in i["mainsnak"] + ] + if not values: + continue + if claim == "P782": + values += [ + m.group(1) for m in (re_lau_code.match(v) for v in values) if m + ] + for osm_key in osm_keys: + tags[osm_key].append((values, label)) + return dict(tags) + + def get_identifiers(self): + ret = {} + for claim, osm_keys, label in property_map: + values = [ + i["mainsnak"]["datavalue"]["value"] + for i in self.claims.get(claim, []) + if "datavalue" in i["mainsnak"] + ] + if not values: + continue + if claim == "P782": + values += [ + m.group(1) for m in (re_lau_code.match(v) for v in values) if m + ] + for osm_key in osm_keys: + ret[label] = values + return ret + # class Claim(Base): # __tablename__ = "claim" From 6d9c7fb1bcea228b90f24387f6f638e90c5bedd0 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 18 Apr 2022 12:52:11 +0100 
Subject: [PATCH 20/30] Include identifiers in item details API call --- matcher/api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/matcher/api.py b/matcher/api.py index f99228c..e2beb7c 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -838,6 +838,7 @@ def item_detail(item): "p576": item.time_claim("P576"), "heritage_designation": heritage_designation, "wikipedia": wikipedia_links, + "identifiers": item.get_identifiers(), } if aliases := item.get_aliases(): From dfb282b4689c4a9af0d10b044709ce2223bd5be4 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 10:31:56 +0000 Subject: [PATCH 21/30] Don't show debugging output. --- matcher/api.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/matcher/api.py b/matcher/api.py index 2f8adda..d47dd69 100644 --- a/matcher/api.py +++ b/matcher/api.py @@ -264,7 +264,7 @@ WHERE tags ? 'wikidata' conn = database.session.connection() result = conn.execute(text(sql)) - print(sql) + # print(sql) point_sql = ( f""" @@ -789,7 +789,7 @@ def find_osm_candidates(item, limit=80, max_distance=450, names=None): if limit: s = s.limit(limit) - print(s.compile(compile_kwargs={"literal_binds": True})) + # print(s.compile(compile_kwargs={"literal_binds": True})) conn = database.session.connection() nearby = [] @@ -852,6 +852,7 @@ def find_osm_candidates(item, limit=80, max_distance=450, names=None): def get_item(item_id): """Retrieve a Wikidata item, either from the database or from Wikidata.""" + item = model.Item.query.get(item_id) return item or get_and_save_item(f"Q{item_id}") @@ -1031,7 +1032,7 @@ def isa_incremental_search(search_terms): func.length(en_label) < 20, ) - print(q.statement.compile(compile_kwargs={"literal_binds": True})) + # print(q.statement.compile(compile_kwargs={"literal_binds": True})) ret = [] for item in q: From d1e5a3122598b91879c69135f7930df90950f3b4 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 10:33:18 +0000 Subject: [PATCH 22/30] Update version in 
User-Agent header --- matcher/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/matcher/__init__.py b/matcher/__init__.py index 5e9706c..80c2c41 100644 --- a/matcher/__init__.py +++ b/matcher/__init__.py @@ -4,7 +4,9 @@ CallParams = dict[str, str | int] user_agent = ( - "osm-wikidata/0.1 (https://github.com/EdwardBetts/osm-wikidata; edward@4angle.com)" + "osm-wikidata/0.2" + + " (https://github.com/EdwardBetts/osm-wikidata;" + + " edward@4angle.com)" ) From f14cb36896f2a65d8a1308cdc87734429622a229 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 10:37:14 +0000 Subject: [PATCH 23/30] Bug fix --- matcher/mail.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/matcher/mail.py b/matcher/mail.py index 23dd0ac..cc5fd91 100644 --- a/matcher/mail.py +++ b/matcher/mail.py @@ -1,10 +1,11 @@ -from flask import current_app, g, request, has_request_context +import smtplib +import sys +import traceback from email.mime.text import MIMEText from email.utils import formatdate, make_msgid from pprint import pformat -import smtplib -import traceback -import sys + +from flask import current_app, g, has_request_context, request def send_mail(subject, body, config=None): @@ -71,7 +72,7 @@ def open_changeset_error(session_id, changeset, r): username = g.user.username body = f""" user: {username} -page: {url} +page: {r.url} message user: https://www.openstreetmap.org/message/new/{username} From 503280cfc15f253f9793936bcbffed5eaaf3e961 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 10:40:59 +0000 Subject: [PATCH 24/30] Add docstrings and types --- matcher/model.py | 119 +++++++++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 41 deletions(-) diff --git a/matcher/model.py b/matcher/model.py index 7ee02f4..4c65671 100644 --- a/matcher/model.py +++ b/matcher/model.py @@ -1,21 +1,24 @@ -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.schema import 
ForeignKey, Column -from sqlalchemy.orm import relationship, column_property, deferred, backref -from sqlalchemy import func -from sqlalchemy.types import Integer, String, Float, Boolean, DateTime, Text, BigInteger -from sqlalchemy.dialects import postgresql -from sqlalchemy.sql.expression import cast -from sqlalchemy.ext.hybrid import hybrid_property -from sqlalchemy.ext.declarative import declared_attr -from sqlalchemy.ext.associationproxy import association_proxy -from sqlalchemy.orm.collections import attribute_mapped_collection -from geoalchemy2 import Geometry -from collections import defaultdict -from flask_login import UserMixin -from .database import session, now_utc -from . import wikidata, utils, mail import json import re +import typing +from collections import defaultdict +from typing import Any + +from flask_login import UserMixin +from geoalchemy2 import Geometry +from sqlalchemy import func +from sqlalchemy.dialects import postgresql +from sqlalchemy.ext.associationproxy import association_proxy +from sqlalchemy.ext.declarative import declarative_base, declared_attr +from sqlalchemy.ext.hybrid import hybrid_property +from sqlalchemy.orm import backref, column_property, deferred, relationship +from sqlalchemy.orm.collections import attribute_mapped_collection +from sqlalchemy.schema import Column, ForeignKey +from sqlalchemy.sql.expression import cast +from sqlalchemy.types import BigInteger, Boolean, DateTime, Float, Integer, String, Text + +from . 
import mail, utils, wikidata +from .database import now_utc, session Base = declarative_base() Base.query = session.query_property() @@ -63,15 +66,19 @@ property_map = [ ("P5208", ["ref:bag"], "BAG building ID for Dutch buildings"), ] +T = typing.TypeVar("T", bound="Item") + class Item(Base): + """Wikidata item.""" + __tablename__ = "item" item_id = Column(Integer, primary_key=True, autoincrement=False) labels = Column(postgresql.JSONB) descriptions = Column(postgresql.JSONB) aliases = Column(postgresql.JSONB) sitelinks = Column(postgresql.JSONB) - claims = Column(postgresql.JSONB) + claims = Column(postgresql.JSONB, nullable=False) lastrevid = Column(Integer, nullable=False, unique=True) locations = relationship( "ItemLocation", cascade="all, delete-orphan", backref="item" @@ -87,37 +94,46 @@ class Item(Base): extracts = association_proxy("wiki_extracts", "extract") @classmethod - def get_by_qid(cls, qid): + def get_by_qid(cls: typing.Type[T], qid: str) -> T | None: if qid and len(qid) > 1 and qid[0].upper() == "Q" and qid[1:].isdigit(): - return cls.query.get(qid[1:]) + obj: T = cls.query.get(qid[1:]) + return obj + return None @property - def wd_url(self): + def wd_url(self) -> str: + """Wikidata URL for item.""" return f"https://www.wikidata.org/wiki/{self.qid}" - def get_claim(self, pid): + def get_claim(self, pid: str) -> list[dict[str, Any] | None]: + """List of claims for given Wikidata property ID.""" + claims = typing.cast(dict[str, list[dict[str, Any]]], self.claims) return [ i["mainsnak"]["datavalue"]["value"] if "datavalue" in i["mainsnak"] else None - for i in self.claims.get(pid, []) + for i in claims.get(pid, []) ] - def label(self, lang="en"): - if lang in self.labels: - return self.labels[lang]["value"] - elif "en" in self.labels: - return self.labels["en"]["value"] + def label(self, lang: str = "en") -> str: + """Label for this Wikidata item.""" + labels = typing.cast(dict[str, dict[str, Any]], self.labels) + if lang in labels: + return 
typing.cast(str, labels[lang]["value"]) + elif "en" in labels: + return typing.cast(str, labels["en"]["value"]) - label_list = list(self.labels.values()) - return label_list[0]["value"] if label_list else "[no label]" + label_list = list(labels.values()) + return typing.cast(str, label_list[0]["value"]) if label_list else "[no label]" - def description(self, lang="en"): - if lang in self.descriptions: - return self.descriptions[lang]["value"] - elif "en" in self.descriptions: - return self.descriptions["en"]["value"] - return + def description(self, lang: str = "en") -> str | None: + """Return a description of the item.""" + descriptions = typing.cast(dict[str, dict[str, Any]], self.descriptions) + if lang in descriptions: + return typing.cast(str, descriptions[lang]["value"]) + elif "en" in descriptions: + return typing.cast(str, descriptions["en"]["value"]) + return None d_list = list(self.descriptions.values()) if d_list: @@ -388,8 +404,11 @@ class ItemLocation(Base): qid = column_property("Q" + cast(item_id, String)) pid = column_property("P" + cast(item_id, String)) - def get_lat_lon(self): - return session.query(func.ST_Y(self.location), func.ST_X(self.location)).one() + def get_lat_lon(self) -> tuple[float, float]: + """Get latitude and longitude of item.""" + loc: tuple[float, float] + loc = session.query(func.ST_Y(self.location), func.ST_X(self.location)).one() + return loc def location_objects(coords): @@ -501,7 +520,8 @@ class Polygon(MapMixin, Base): return cls.query.get(src_id) @property - def type(self): + def type(self) -> str: + """Polygon is either a way or a relation.""" return "way" if self.src_id > 0 else "relation" @declared_attr @@ -509,11 +529,14 @@ class Polygon(MapMixin, Base): return column_property(func.ST_Area(cls.way, False), deferred=True) @hybrid_property - def area_in_sq_km(self): + def area_in_sq_km(self) -> float: + """Size of area in square km.""" return self.area / (1000 * 1000) class User(Base, UserMixin): + """User.""" + 
__tablename__ = "user" id = Column(Integer, primary_key=True) username = Column(String) @@ -537,7 +560,8 @@ class User(Base, UserMixin): osm_oauth_token = Column(String) osm_oauth_token_secret = Column(String) - def is_active(self): + def is_active(self) -> bool: + """User is active.""" return self.active @@ -554,6 +578,8 @@ class EditSession(Base): class Changeset(Base): + """An OSM Changeset generated by this tool.""" + __tablename__ = "changeset" id = Column(BigInteger, primary_key=True) created = Column(DateTime) @@ -573,6 +599,8 @@ class Changeset(Base): class ChangesetEdit(Base): + """Record details of edits within a changeset.""" + __tablename__ = "changeset_edit" changeset_id = Column(BigInteger, ForeignKey("changeset.id"), primary_key=True) @@ -585,28 +613,37 @@ class ChangesetEdit(Base): class SkipIsA(Base): + """Ignore this item type when walking the Wikidata subclass graph.""" + __tablename__ = "skip_isa" item_id = Column(Integer, ForeignKey("item.item_id"), primary_key=True) + qid = column_property("Q" + cast(item_id, String)) item = relationship("Item") class ItemExtraKeys(Base): + """Extra tag or key to consider for an Wikidata item type.""" + __tablename__ = "item_extra_keys" item_id = Column(Integer, ForeignKey("item.item_id"), primary_key=True) tag_or_key = Column(String, primary_key=True) note = Column(String) + qid = column_property("Q" + cast(item_id, String)) item = relationship("Item") class Extract(Base): + """First paragraph from Wikipedia.""" + __tablename__ = "extract" item_id = Column(Integer, ForeignKey("item.item_id"), primary_key=True) site = Column(String, primary_key=True) extract = Column(String, nullable=False) - def __init__(self, site, extract): + def __init__(self, site: str, extract: str): + """Initialise the object.""" self.site = site self.extract = extract From ba37fae51ad4778fe1640cbde3ea4de6c9820051 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 10:44:41 +0000 Subject: [PATCH 25/30] Add the Wikidata 
recent changes updater Download Wikidata recent changes and update items in local database. --- update.py | 249 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100755 update.py diff --git a/update.py b/update.py new file mode 100755 index 0000000..038b736 --- /dev/null +++ b/update.py @@ -0,0 +1,249 @@ +#!/usr/bin/python3 + +"""Download Wikidata recent changes and update items in local database.""" + +import json +import os +import sys +from time import sleep + +from matcher import database, model, utils, wikidata, wikidata_api + +DB_URL = "postgresql:///matcher" +database.init_db(DB_URL) + +previous_max_lastrevid = 1388804050 # Q106152661 + +entity_keys = {"labels", "sitelinks", "aliases", "claims", "descriptions", "lastrevid"} + + +def read_changes(): + qids = set() + max_lastrevid = 0 + for f in sorted(os.listdir("changes"), key=lambda f: int(f.partition(".")[0])): + reply = json.load(open("changes/" + f)) + print(f, len(qids)) + for change in reply["query"]["recentchanges"]: + # rctype = change["type"] + title = change["title"] + revid = change["revid"] + if revid and revid > max_lastrevid: + max_lastrevid = revid + assert title.startswith("Q") + qids.add(title) + print(len(qids)) + print(max_lastrevid) + + return + + for cur in utils.chunk(qids, 50): + print(cur) + for qid, entity in wikidata_api.get_entities(cur): + with open(f"items/{qid}.json", "w") as out: + json.dump(entity, out) + + +def get_changes(): + start = "2021-03-24T11:56:11" + rccontinue = None + i = 0 + while True: + i += 1 + r = wikidata_api.query_wd_api(rcstart=start, rccontinue=rccontinue) + with open(f"changes/{i:06d}.json", "w") as out: + out.write(r.text) + + reply = r.json() + try: + print(reply["query"]["recentchanges"][0]["timestamp"]) + except KeyError: + print("KeyError") + + if False: + for change in reply["query"]["recentchanges"]: + # rctype = change["type"] + # if change["revid"] == 0 and change["old_revid"] == 0: + # continue 
+ + if change["logtype"] == "delete" and change["logaction"] in { + "revision", + "delete", + "restore", + }: + continue + + if change["logtype"] == "protect" and change["logaction"] in { + "unprotect", + "protect", + }: + continue + + print(json.dumps(change, indent=2)) + sys.exit(0) + + continue + + if not change["title"].startswith("Q"): + continue # not an item + + qid = change["title"] + assert qid[1:].isdigit() + item_id = int(qid[1:]) + revid = change["revid"] + + item = model.Item.query.get(item_id) + if change["type"] == "edit" and not item: + continue + + if change["type"] == "new" and not item: + print(("new", qid)) + continue + + if not item: + print(qid) + print(json.dumps(change, indent=2)) + print((change["type"], qid, item.lastrevid, revid)) + + # print(json.dumps(reply, indent=2)) + + if "continue" not in reply: + break + + rccontinue = reply["continue"]["rccontinue"] + print(rccontinue) + sleep(1) + + +def get_timestamp(): + ts = wikidata_api.get_revision_timestamp(previous_max_lastrevid) + print(ts) + + +def handle_new(change): + qid = change["title"] + ts = change["timestamp"] + if change["redirect"]: + print(f"{ts}: new item {qid}, since replaced with redirect") + return + item = model.Item.query.get(qid[1:]) # check if item is already loaded + if item: + return handle_edit(change) + + entity = wikidata_api.get_entity(qid) + if entity["id"] != qid: + print(f'redirect {qid} -> {entity["id"]}') + return + + if "claims" not in entity: + print(qid) + print(entity) + coords = wikidata.get_entity_coords(entity["claims"]) + if not coords: + print(f"{ts}: new item {qid} without coordinates") + return + + print(f"{ts}: new item {qid} with coordinates") + + item_id = int(qid[1:]) + obj = {k: v for k, v in entity.items() if k in entity_keys} + try: + item = model.Item(item_id=item_id, **obj) + except TypeError: + print(qid) + print(f'{entity["pageid"]=} {entity["ns"]=} {entity["type"]=}') + print(entity.keys()) + raise + item.locations = 
model.location_objects(coords) + database.session.add(item) + + +def coords_equal(a, b): + """Deep equality comparison of nested dicts.""" + return json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True) + + +def handle_edit(change): + qid = change["title"] + item = model.Item.query.get(qid[1:]) + if not item: + return # item isn't in our database so it probably has no coordinates + + ts = change["timestamp"] + + if item.lastrevid >= change["revid"]: + print(f"{ts}: no need to update {qid}") + return + + entity = wikidata_api.get_entity(qid) + entity_qid = entity.pop("id") + if entity_qid != qid: + print(f"{ts}: item {qid} replaced with redirect") + database.session.delete(item) + database.session.commit() + return + + assert entity_qid == qid + existing_coords = wikidata.get_entity_coords(item.claims) + if "claims" not in entity: + return + coords = wikidata.get_entity_coords(entity["claims"]) + + if not coords_equal(existing_coords, coords): + print(f"{ts}: update item {qid}, including coordinates") + item.locations = model.location_objects(coords) + else: + print(f"{ts}: update item {qid}, no change to coordinates") + + for key in entity_keys: + setattr(item, key, entity[key]) + + +def update_timestamp(timestamp): + out = open("rc_timestamp", "w") + print(timestamp, file=out) + out.close() + + +def update_database(): + with open("rc_timestamp") as f: + start = f.read().strip() + + rccontinue = None + seen = set() + while True: + r = wikidata_api.get_recent_changes(rcstart=start, rccontinue=rccontinue) + + reply = r.json() + for change in reply["query"]["recentchanges"]: + rctype = change["type"] + timestamp = change["timestamp"] + qid = change["title"] + if qid in seen: + continue + + if rctype == "new": + handle_new(change) + seen.add(qid) + if rctype == "edit": + handle_edit(change) + seen.add(qid) + + update_timestamp(timestamp) + print("commit") + database.session.commit() + + if "continue" not in reply: + break + + rccontinue = 
reply["continue"]["rccontinue"] + database.session.commit() + print("finished") + + +# read_changes() +# get_timestamp() +# get_changes() + +while True: + update_database() + sleep(60) From 55b5bfdc82e229511f9c897e12e5f272cdb9ebd9 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 11:11:47 +0000 Subject: [PATCH 26/30] Add development notes. --- notes | 406 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 406 insertions(+) create mode 100644 notes diff --git a/notes b/notes new file mode 100644 index 0000000..a97e95e --- /dev/null +++ b/notes @@ -0,0 +1,406 @@ +# vim: spell:tw=80 ft=markdown + +Extracted items from data dump that include "P625" with the quotes. There are 8,398,490 matching items. + +Nearest-Neighbour Searching + +https://postgis.net/workshops/postgis-intro/knn.html + +--- +Use recent changes API to update local Wikidata entity mirror. + +Need to handle new item, edit, delete, and undelete. + +For now we're just interested in items with coordinates, later we might care +about languages, and classes. + +At some point we might keep track of redirects. + +Deletes +------- +Is the item in our database? If not then ignore it, if yes then delete it. + +New +--- +Download full entity, check if it contains coordinates, if yes, then add to database, if not then ignore. + +Make a note of item ID and revid. Avoid downloading item again during update. + +Edits +----- +If the item is in our database and lastrevid is larger than the revid of the change then skip. + +Download full entity. + +If in our database and latest revision includes coordinates update item in +database. If no coordinates then delete from our database. + + +====== +Currently we have geographic objects represented by the Item class. We also want +information about the type of object, languages and countries. + +How about a hierarchy with Item as the base class and GeoItem as a subclass for +geographical objects. 
We can also have IsA, Language, and Country classes that
+derive from Item.
+
+Countries are a subclass of GeoItem.
+
+With the current design the Item table represents a cached copy of the latest
+version of the Wikidata item, no history is stored locally. This makes it hard to
+keep track of changes over time.
+
+The same is true of the OSM data, we just keep a copy of the most recent
+version.
+
+Instead we could store multiple revisions of Wikidata items. We want the latest
+version and any that has been considered part of a match with OSM.
+
+Which Wikidata revisions do we keep?
+
+1. latest revision
+2. revision used to generate match
+3. revision used in match checked by user
+
+Maybe a separate item revision table is too complex. We could just store JSON
+from a match in a table of OSM user uploads.
+
+===
+All countries have a P625 statement
+
+===
+cable-stayed bridge (Q158555)
+
+There are 786 bridges on OSM tagged with bridge:structure=cable-stayed. Some of
+these have a Wikidata tag but aren't tagged as a cable-stayed bridge in
+Wikidata. The Wikidata could be updated to tag them as a cable-stayed bridge.
+Something similar could be applied to other types.
+
+===
+Lots of items with coordinates don't have OSM tags/keys, either because they
+don't belong on the map or there isn't enough info in Wikidata.
+
+Need to search different properties for OSM tags, at least 'instance of',
+'use', 'sport' and 'religion'.
+
+Should start from items with an OSM tag first. Download all items with OSM tag,
+then walk subclass tree and download.
+
+===
+Test out a new design.
+
+===
+Make a status page that shows the size of each database table.
+===
+What should URLs look like, say I want to show lakes in lapland?
+
+https://osm.wikidata.link/matches?isa=Lake&location=Lapland
+
+===
+OSM & Wikidata pin map TODO list
+
+IsA list should support filtering
+
+===
+2021-06-17
+
+Candidate list should show street address. 
For example:
+
+https://alpha.osm.wikidata.link/map/17/40.01461/-105.28196?item=Q42384818
+---
+Preset could be more specific. For example mosque instead of place of worship.
+
+id-tagging-schema/data/presets/amenity/place_of_worship/muslim.json
+---
+candidates list should show object tags
+
+===
+2021-06-19
+
+* Rename from 'alpha' to 'v2'.
+* Use Flask-Babel for i18n. Get translations from
+  https://www.microsoft.com/en-us/language/
+* Show total number of items
+* Show place name
+* Show aliases
+
+===
+2021-06-23
+
+Planning to update user IP location code. Should grab items within city or
+region. Need to handle IP that only resolves to a whole country. For example
+archive.org is 207.241.224.2, and just returns USA. The USA is too big to apply
+the matcher interface to.
+
+When trying to match the whole USA we should show the whole country and
+encourage the user to zoom in. Once the
+
+---
+Map thoughts. Questions:
+
+What do we show to the user when the site loads?
+What happens when the user drags the map?
+What happens when the user changes zoom?
+How does searching change things?
+
+Starting scenarios:
+
+User enters IP that resolves to a big country, say USA. We show a map of the
+whole USA and ask them to zoom in. Once they've zoomed in we can show the total
+number of items and the item type facets. 
+
+Find item type within Cambridge:
+
+``` SQL
+select jsonb_path_query(claims, '$.P31[*].mainsnak.datavalue.value.id') as isa, count(*) as num
+from item, item_location, planet_osm_polygon
+where item.item_id = item_location.item_id and osm_id=-295355 and ST_Covers(way, location) group by isa order by num;
+```
+
+Also need to show a facet for items where item type is empty
+
+Find item type within California:
+``` SQL
+select jsonb_path_query(claims, '$.P31[*].mainsnak.datavalue.value.id') as isa, count(*) as num
+from item, item_location, planet_osm_polygon
+where item.item_id = item_location.item_id and osm_id=-165475 and ST_Intersects(way, location)
+group by isa order by num desc limit 20;
+```
+This query takes 26.5 seconds.
+
+England item count takes 1.5 seconds.
+
+``` SQL
+select count(distinct item_id)
+from item_location, planet_osm_polygon
+where osm_id=-58447 and ST_Covers(way, location);
+```
+
+===
+2021-06-25
+
+Library buildings (Q856584) in England. Query takes 3 seconds
+
+``` SQL
+select count(*)
+from item, item_location, planet_osm_polygon as loc
+where loc.osm_id=-58447
+  and jsonb_path_query_array(claims, '$.P31[*].mainsnak.datavalue.value.id') ? 'Q856584'
+  and item.item_id = item_location.item_id
+  and item_location.location && loc.way;
+```
+===
+2021-07-04
+
+TODO
+* Better error page than just 500 Internal Server Error.
+* Improve handling of Wikidata items without coordinates. Use different colour
+  for OSM Pin. Explain situation on item detail page. No need to look for matches.
+* DONE: Show spinner when looking for nearby OSM candidate matches.
+* DONE: Show message if no matches found.
+* Add 'building only match' switch
+* Two item pins on top of each other is a problem.
+
+2021-07-05
+
+Sometimes the selected OSM matches are incorrect. For example:
+
+https://v2.osm.wikidata.link/map/15/37.31390/-121.86338?item=Q83645632
+
+The item is linked to a node, a way and a relation. 
The node shows as a pin on +the map, but isn't in the list of possible nearby matches. The way and relation +both show in the list, but aren't selected. + +2021-07-07 + +Logout link should come back to the same map location. Need to record the +location somewhere. Could be in a cookie, constant updating of the logout +URL, or have JavaScript that runs when the user follows the logout link. + +Search +Should show a spinner so the user knows something is happening. +Trigger search after first three characters have been entered. +DONE: Style search hits so not so close to search box + +Highlight chosen search result. +Close button to hide search results. +DONE: Zoom to result bounds instead of zoom level 16. +Should you be allowed to search while editing? + +DONE: Hide OSM candidate checkboxes if user not logged in. + +2021-07-10 + +Exclude ways that are part of a boundary. Example: + +https://v2.osm.wikidata.link/map/18/42.37903/-71.11136?item=Q14715848 + +2021-07-16 + +Need better handling for OSM with wikidata tag but item has no coordinates. + +Viewing a street shows too many yellow pins. +https://v2.osm.wikidata.link/map/15/37.31221/-121.88869?item=Q89545422 + +2021-07-17 +Could match on just name +https://v2.osm.wikidata.link/map/18/50.21789/-5.28079?item=Q5288904 + +2021-07-18 +Florida State Road 922 (Q2433226) is stored as multiple lines in the osm2pgsql +database. Need to rebuild the database with the --multi-geometry so there is +only one. + +2021-07-19 +After a save clicking on another item without closing edit panel causes +problems. Need to trigger close_edit_list when opening item if upload_state is +set to 'done' + +2021-07-22 + +Example of a long road: Collins Avenue (Q652775) +https://v2.osm.wikidata.link/map/19/25.86222/-80.12032?item=Q652775 + +2021-08-04 +Use https://vue-select.org/ for item type filter. +Show alert with spinner while count is running. 
+Maybe we want to supply the item type filter as JSON and filter in the browser, +no need to hit the server and database. +Write documentation for the API. +Speed up the item detail OSM nearby option. +Use the sidebar to show list of items in the current view, so the user can +go through the list and check them. +OSM object polygon size is broken + +2021-08-05 + +IsA search + +```sql +SELECT 'Q' || item.item_id, item.labels->'en'->>'value' FROM item WHERE +item.claims ? 'P1282' AND lower(jsonb_extract_path_text(item.labels, 'en', +'value')) LIKE lower('%hotel%') AND length(jsonb_extract_path_text(item.labels, +'en', 'value')) < 20; +``` + +2021-09-11 + +Notes from Pixel 2 + +Pin at the centroid of a polygon is to busy, especially with an item that links +to multiple OSM objects. Object outline already on map, just need to connect +outline to Wikidata markers. Could try and work out corners of rectangular +buildings. Should link to ends nearest node for linear objects. + +Show warning when navigating away from map with edits. + +See WindowEventHandlers.onbeforeunload + +Option to clear edit list. + +--- +Ignore coordinates with a Google Maps reference. Example: + +https://www.wikidata.org/w/index.php?title=Q66228733&oldid=992964237 + +--- +Check history for previous wikidata tags to warn mappers if a wikidata tag +they're adding has previously been removed. + +Examples: + https://v2.osm.wikidata.link/map/17/52.18211/0.17756?item=Q6717455 + and https://www.openstreetmap.org/way/143741201 + https://www.openstreetmap.org/way/684624781 + +--- +What happens when we moved the map? + +First we check the area visible on the map. If it is too large then there is +nothing we can do, we give up and tell the user they need to zoom in. + +Otherwise we send the server a request for a count of the number of items in the +current view. If the count is too high we abort and tell the user to zoom in. 
+ +Once we know the area isn't too big and doesn't have too many items we want to +make three requests to the server. First we make requests for the Wikidata items +on the map another request for OSM objects with a Wikidata tag on the map. Both +requests run at the same time. Once both requests complete we make another +request to check for missing Wikidata items that were linked from OSM objects. + +--- +This is done + +https://v2.osm.wikidata.link/map/18/52.23270/0.21560?item=Q55099320 +should match: https://www.openstreetmap.org/node/2000849525 + +Look for Tag:abandoned:railway=station + +--- +Need better handling for Wikidata redirects. + +Example: https://www.openstreetmap.org/way/130458959 +https://v2.osm.wikidata.link/map/18/51.36973/-2.81079?item=Q5117357 + +--- +Consider 'OS grid reference' +https://www.wikidata.org/w/index.php?title=Q27082051&oldid=1336630735 + +--- +Check for OpenStreetMap relation ID (P402) in Wikidata + +Display on details page. Highlight matching relation. + +example: https://www.wikidata.org/wiki/Q78078847 + +--- +TODO + +* DONE: Add special code for matching watercourses that works like street matching +* DONE: Frontend should catch API errors and show them +* DONE: API calls should return errors in JSON + +* Run update code from systemd +* Stop Wikidata update code from crashing when it hits an error +* Add an option for 'select all' for linear features +* Add a note to details page explaining street matching +* Upload code to GitHub +* Candidates list jumps when first object is selected, because message appears + at the top the list. Can be fixed by having a message there and replacing + it. 
+ +IsA pages +* Flesh out IsA pages +* Allow users to add extra tags to IsA +* Add option to update IsA + +Type filter +* Include type filter QIDs in URL +* Move type filter to modal box +* Show item type description + +--- +Show note about relations for tram stops and windfarms + +--- +Show dissolved, abolished or demolished date (P576) +https://map.osm.wikidata.link/map/18/40.74610/-73.99652?item=Q14707174 + +--- +Get subclasses for one item type + +``` SQL +select item_id, labels->'en'->'value' from item where jsonb_path_query_array(claims, '$."P279"[*]."mainsnak"."datavalue"."value"."id"'::jsonpath) ?| '{"Q718893"}'; +``` + +Get subclasses for items with OSM tag/key + +``` SQL +select item_id, labels->'en'->'value' + from item + where jsonb_path_query_array(claims, '$."P279"[*]."mainsnak"."datavalue"."value"."id"'::jsonpath) + ?| array(select 'Q' || item_id from item where claims ? 'P1282'); +``` + +--- +Shipyard results shouldn't include place=city +https://map.osm.wikidata.link/map/18/50.89540/-1.38243?item=Q551401 From f34226c8d7892d3049af878410f9ad2c1185d799 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 11:22:10 +0000 Subject: [PATCH 27/30] Add matcher style --- matcher.style | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 matcher.style diff --git a/matcher.style b/matcher.style new file mode 100644 index 0000000..6ae348c --- /dev/null +++ b/matcher.style @@ -0,0 +1,162 @@ +# This is the default osm2pgsql .style file that comes with osm2pgsql. +# +# A .style file has 4 columns that define how OSM objects end up in tables in +# the database and what columns are created. It interacts with the command-line +# hstore options. +# +# Columns +# ======= +# +# OsmType: This is either "node", "way" or "node,way" and indicates if this tag +# applies to nodes, ways, or both. +# +# Tag: The tag +# +# DataType: The type of the column to be created. 
Normally "text" +# +# Flags: Flags that indicate what table the OSM object is moved into. +# +# There are 6 possible flags. These flags are used both to indicate if a column +# should be created, and if ways with the tag are assumed to be areas. The area +# assumptions can be overridden with an area=yes/no tag +# +# polygon - Create a column for this tag, and objects with the tag are areas +# +# linear - Create a column for this tag +# +# nocolumn - Override the above and don't create a column for the tag, but do +# include objects with this tag +# +# phstore - Same as polygon,nocolumn for backward compatibility +# +# delete - Drop this tag completely and don't create a column for it. This also +# prevents the tag from being added to hstore columns +# +# nocache - Deprecated and does nothing +# +# If an object has a tag that indicates it is an area or has area=yes/1, +# osm2pgsql will try to turn it into an area. If it succeeds, it places it in +# the polygon table. If it fails (e.g. not a closed way) it places it in the +# line table. +# +# Nodes are never placed into the polygon or line table and are always placed in +# the point table. +# +# Hstore +# ====== +# +# The options --hstore, --hstore-match-only, and --hstore-all interact with +# the .style file. +# +# With --hstore any tags without a column will be added to the hstore column. +# This will also cause all objects to be kept. +# +# With --hstore-match-only the behavior for tags is the same, but objects are +# only kept if they have a non-NULL value in one of the columns. +# +# With --hstore-all all tags are added to the hstore column unless they appear +# in the style file with a delete flag, causing duplication between the normal +# columns and the hstore column. +# +# Special database columns +# ======================== +# +# There are some special database columns that if present in the .style file +# will be populated by osm2pgsql. 
+# +# These are +# +# z_order - datatype int4 +# +# way_area - datatype real. The area of the way, in the units of the projection +# (e.g. square mercator meters). Only applies to areas +# +# osm_user - datatype text +# osm_uid - datatype integer +# osm_version - datatype integer +# osm_changeset - datatype integer +# osm_timestamp - datatype timestamptz(0). +# Used with the --extra-attributes option to include metadata in the database. +# If importing with both --hstore and --extra-attributes the meta-data will +# end up in the tags hstore column regardless of the style file. + +# OsmType Tag DataType Flags +node,way access text linear +node,way addr:housename text linear +node,way addr:housenumber text linear +node,way addr:interpolation text linear +node,way admin_level text linear +node,way aerialway text linear +node,way aeroway text polygon +node,way amenity text polygon +node,way area text polygon # hard coded support for area=1/yes => polygon is in osm2pgsql +node,way barrier text linear +node,way bicycle text linear +node,way brand text linear +node,way bridge text linear +node,way boundary text linear +node,way building text polygon +node capital text linear +node,way construction text linear +node,way covered text linear +node,way culvert text linear +node,way cutting text linear +node,way denomination text linear +node,way disused text linear +node ele text linear +node,way embankment text linear +node,way foot text linear +node,way generator:source text linear +node,way harbour text polygon +node,way highway text linear +node,way historic text polygon +node,way horse text linear +node,way intermittent text linear +node,way junction text linear +node,way landuse text polygon +node,way layer text linear +node,way leisure text polygon +node,way lock text linear +node,way man_made text polygon +node,way military text polygon +node,way motorcar text linear +node,way name text linear +node,way natural text polygon # natural=coastline tags are discarded by a 
hard coded rule in osm2pgsql +node,way office text polygon +node,way oneway text linear +node,way operator text linear +node,way place text polygon +node,way population text linear +node,way power text polygon +node,way power_source text linear +node,way public_transport text polygon +node,way railway text linear +node,way ref text linear +node,way religion text linear +node,way route text linear +node,way service text linear +node,way shop text polygon +node,way sport text polygon +node,way surface text linear +node,way toll text linear +node,way tourism text polygon +node,way tower:type text linear +way tracktype text linear +node,way tunnel text linear +node,way water text polygon +node,way waterway text polygon +node,way wetland text polygon +node,way width text linear +node,way wood text linear +node,way z_order int4 linear # This is calculated during import +way way_area real linear # This is calculated during import + +# Area tags +# We don't make columns for these tags, but objects with them are areas. +# Mainly for use with hstore +way abandoned:aeroway text polygon,nocolumn +way abandoned:amenity text polygon,nocolumn +way abandoned:building text polygon,nocolumn +way abandoned:landuse text polygon,nocolumn +way abandoned:power text polygon,nocolumn +way area:highway text polygon,nocolumn From 40e9499eb62b7560517cf8f66ebca27e0b4e3543 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 13:48:26 +0000 Subject: [PATCH 28/30] Add some types and other fixes. 
--- update.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/update.py b/update.py index 038b736..2764470 100755 --- a/update.py +++ b/update.py @@ -5,6 +5,7 @@ import json import os import sys +import typing from time import sleep from matcher import database, model, utils, wikidata, wikidata_api @@ -12,13 +13,13 @@ from matcher import database, model, utils, wikidata, wikidata_api DB_URL = "postgresql:///matcher" database.init_db(DB_URL) -previous_max_lastrevid = 1388804050 # Q106152661 +previous_max_lastrevid = 1888214110 # Q118129609 entity_keys = {"labels", "sitelinks", "aliases", "claims", "descriptions", "lastrevid"} -def read_changes(): - qids = set() +def read_changes() -> None: + qids: set[str] = set() max_lastrevid = 0 for f in sorted(os.listdir("changes"), key=lambda f: int(f.partition(".")[0])): reply = json.load(open("changes/" + f)) @@ -43,7 +44,8 @@ def read_changes(): json.dump(entity, out) -def get_changes(): +def get_changes() -> None: + """Get recent changes.""" start = "2021-03-24T11:56:11" rccontinue = None i = 0 @@ -157,7 +159,7 @@ def handle_new(change): database.session.add(item) -def coords_equal(a, b): +def coords_equal(a: dict[str, typing.Any], b: dict[str, typing.Any]) -> bool: """Deep equality comparison of nested dicts.""" return json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True) @@ -198,13 +200,14 @@ def handle_edit(change): setattr(item, key, entity[key]) -def update_timestamp(timestamp): +def update_timestamp(timestamp: str) -> None: + """Save timestamp to rc_timestamp.""" out = open("rc_timestamp", "w") print(timestamp, file=out) out.close() -def update_database(): +def update_database() -> None: with open("rc_timestamp") as f: start = f.read().strip() From f4b7dd681c2dec66549c7bd83d5861dbdae6709a Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 13:50:11 +0000 Subject: [PATCH 29/30] Remove unused code. 
--- update.py | 123 ++++-------------------------------------------------- 1 file changed, 9 insertions(+), 114 deletions(-) diff --git a/update.py b/update.py index 2764470..4ebe3e0 100755 --- a/update.py +++ b/update.py @@ -3,124 +3,17 @@ """Download Wikidata recent changes and update items in local database.""" import json -import os -import sys import typing from time import sleep -from matcher import database, model, utils, wikidata, wikidata_api +from matcher import database, model, wikidata, wikidata_api DB_URL = "postgresql:///matcher" database.init_db(DB_URL) -previous_max_lastrevid = 1888214110 # Q118129609 - entity_keys = {"labels", "sitelinks", "aliases", "claims", "descriptions", "lastrevid"} -def read_changes() -> None: - qids: set[str] = set() - max_lastrevid = 0 - for f in sorted(os.listdir("changes"), key=lambda f: int(f.partition(".")[0])): - reply = json.load(open("changes/" + f)) - print(f, len(qids)) - for change in reply["query"]["recentchanges"]: - # rctype = change["type"] - title = change["title"] - revid = change["revid"] - if revid and revid > max_lastrevid: - max_lastrevid = revid - assert title.startswith("Q") - qids.add(title) - print(len(qids)) - print(max_lastrevid) - - return - - for cur in utils.chunk(qids, 50): - print(cur) - for qid, entity in wikidata_api.get_entities(cur): - with open(f"items/{qid}.json", "w") as out: - json.dump(entity, out) - - -def get_changes() -> None: - """Get recent changes.""" - start = "2021-03-24T11:56:11" - rccontinue = None - i = 0 - while True: - i += 1 - r = wikidata_api.query_wd_api(rcstart=start, rccontinue=rccontinue) - with open(f"changes/{i:06d}.json", "w") as out: - out.write(r.text) - - reply = r.json() - try: - print(reply["query"]["recentchanges"][0]["timestamp"]) - except KeyError: - print("KeyError") - - if False: - for change in reply["query"]["recentchanges"]: - # rctype = change["type"] - # if change["revid"] == 0 and change["old_revid"] == 0: - # continue - - if change["logtype"] == 
"delete" and change["logaction"] in { - "revision", - "delete", - "restore", - }: - continue - - if change["logtype"] == "protect" and change["logaction"] in { - "unprotect", - "protect", - }: - continue - - print(json.dumps(change, indent=2)) - sys.exit(0) - - continue - - if not change["title"].startswith("Q"): - continue # not an item - - qid = change["title"] - assert qid[1:].isdigit() - item_id = int(qid[1:]) - revid = change["revid"] - - item = model.Item.query.get(item_id) - if change["type"] == "edit" and not item: - continue - - if change["type"] == "new" and not item: - print(("new", qid)) - continue - - if not item: - print(qid) - print(json.dumps(change, indent=2)) - print((change["type"], qid, item.lastrevid, revid)) - - # print(json.dumps(reply, indent=2)) - - if "continue" not in reply: - break - - rccontinue = reply["continue"]["rccontinue"] - print(rccontinue) - sleep(1) - - -def get_timestamp(): - ts = wikidata_api.get_revision_timestamp(previous_max_lastrevid) - print(ts) - - def handle_new(change): qid = change["title"] ts = change["timestamp"] @@ -243,10 +136,12 @@ def update_database() -> None: print("finished") -# read_changes() -# get_timestamp() -# get_changes() +def main() -> None: + """Infinite loop.""" + while True: + update_database() + sleep(60) -while True: - update_database() - sleep(60) + +if __name__ == "__main__": + main() From b9b9728fc69e4fe3723c17b9f46351891ac59618 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 14 May 2023 14:00:45 +0000 Subject: [PATCH 30/30] Add more types. 
--- update.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/update.py b/update.py index 4ebe3e0..3ebee24 100755 --- a/update.py +++ b/update.py @@ -14,7 +14,17 @@ database.init_db(DB_URL) entity_keys = {"labels", "sitelinks", "aliases", "claims", "descriptions", "lastrevid"} -def handle_new(change): +class Change(typing.TypedDict): + """Dict representing an edit in recent changes.""" + + title: str + timestamp: str + redirect: dict[str, typing.Any] | None + revid: int + + +def handle_new(change: Change) -> None: + """Handle a new Wikidata item from the recent changes feed.""" qid = change["title"] ts = change["timestamp"] if change["redirect"]: @@ -57,7 +67,8 @@ def coords_equal(a: dict[str, typing.Any], b: dict[str, typing.Any]) -> bool: return json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True) -def handle_edit(change): +def handle_edit(change: Change) -> None: + """Process an edit from recent changes.""" qid = change["title"] item = model.Item.query.get(qid[1:]) if not item: @@ -101,6 +112,7 @@ def update_timestamp(timestamp: str) -> None: def update_database() -> None: + """Check recent changes and apply updates to local mirror of Wikidata.""" with open("rc_timestamp") as f: start = f.read().strip()