Compare commits

..

No commits in common. "4446cbed6ef56346857048603d53106fdf1f8c0d" and "63d0d198eb6f4ae974ef71a2a72146d20546559a" have entirely different histories.

30 changed files with 999 additions and 1852 deletions

252
app.py
View file

@ -1,18 +1,18 @@
#!/usr/bin/python3 #!/usr/bin/python3
import hashlib import hashlib
import inspect
import itertools import itertools
import json import json
import os import os
import re import re
import socket import socket
import typing
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from typing import Iterable, Mapping
import requests.exceptions import requests.exceptions
import simplejson.errors import simplejson.errors
import werkzeug
from flask import ( from flask import (
Flask, Flask,
g, g,
@ -27,7 +27,6 @@ from requests_oauthlib import OAuth1Session
from sqlalchemy import distinct, func from sqlalchemy import distinct, func
from sqlalchemy.orm import aliased from sqlalchemy.orm import aliased
from sqlalchemy.sql.expression import desc from sqlalchemy.sql.expression import desc
from werkzeug.wrappers import Response
from depicts import ( from depicts import (
artwork, artwork,
@ -55,7 +54,6 @@ from depicts.model import (
WikidataQuery, WikidataQuery,
) )
from depicts.pager import Pagination, init_pager from depicts.pager import Pagination, init_pager
from depicts.type import Entity
user_agent = "Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0" user_agent = "Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0"
@ -111,15 +109,30 @@ re_pid = re.compile(r"^P(\d+)")
@app.teardown_appcontext @app.teardown_appcontext
def shutdown_session(exception: Exception | None = None) -> None: def shutdown_session(exception=None):
database.session.remove() # type:ignore database.session.remove()
@app.errorhandler(werkzeug.exceptions.InternalServerError)
def exception_handler(e):
tb = werkzeug.debug.tbtools.get_current_traceback()
last_frame = next(frame for frame in reversed(tb.frames) if not frame.is_library)
last_frame_args = inspect.getargs(last_frame.code)
return (
render_template(
"show_error.html",
tb=tb,
last_frame=last_frame,
last_frame_args=last_frame_args,
),
500,
)
@app.template_global() @app.template_global()
def set_url_args(endpoint: str | None = None, **new_args: str) -> str: def set_url_args(endpoint=None, **new_args):
if endpoint is None: if endpoint is None:
endpoint = request.endpoint endpoint = request.endpoint
assert endpoint and request.view_args
args = request.view_args.copy() args = request.view_args.copy()
args.update(request.args) args.update(request.args)
args.update(new_args) args.update(new_args)
@ -128,25 +141,23 @@ def set_url_args(endpoint: str | None = None, **new_args: str) -> str:
@app.template_global() @app.template_global()
def current_url() -> str: def current_url():
"""Get current URL."""
assert request and request.view_args and request.endpoint
args = request.view_args.copy() args = request.view_args.copy()
args.update(request.args) args.update(request.args)
return url_for(request.endpoint, **args) return url_for(request.endpoint, **args)
@app.before_request @app.before_request
def init_profile() -> None: def init_profile():
g.profiling = [] g.profiling = []
@app.before_request @app.before_request
def global_user() -> None: def global_user():
g.user = wikidata_oauth.get_username() g.user = wikidata_oauth.get_username()
def check_for_blocks() -> None: def check_for_blocks():
if hasattr(g, "server_ip"): # already done if hasattr(g, "server_ip"): # already done
return return
hostname = app.config.get("HOSTNAME") hostname = app.config.get("HOSTNAME")
@ -161,45 +172,43 @@ def check_for_blocks() -> None:
@app.before_request @app.before_request
def get_blocks() -> None: def get_blocks():
if app.config.get("SHOW_BLOCK_ALERT") is not False: if app.config.get("SHOW_BLOCK_ALERT") is not False:
check_for_blocks() check_for_blocks()
@app.route("/find_more_setting") @app.route("/find_more_setting")
def flip_find_more() -> str: def flip_find_more():
session["no_find_more"] = not session.get("no_find_more") session["no_find_more"] = not session.get("no_find_more")
display = {True: "on", False: "off"}[not session["no_find_more"]] display = {True: "on", False: "off"}[not session["no_find_more"]]
return "flipped. find more is " + display return "flipped. find more is " + display
def existing_edit(item_id: int, depicts_id: int) -> bool: def existing_edit(item_id, depicts_id):
q = Edit.query.filter_by(artwork_id=item_id, depicts_id=depicts_id) # type: ignore q = Edit.query.filter_by(artwork_id=item_id, depicts_id=depicts_id)
return bool(q.count() != 0) return q.count() != 0
@app.route("/save/Q<int:item_id>", methods=["POST"]) @app.route("/save/Q<int:item_id>", methods=["POST"])
def save(item_id: int) -> str | Response: def save(item_id):
depicts = request.form.getlist("depicts") depicts = request.form.getlist("depicts")
username = wikidata_oauth.get_username() username = wikidata_oauth.get_username()
assert username assert username
token = wikidata_oauth.get_token() token = wikidata_oauth.get_token()
artwork_item = Item.query.get(item_id) # type: ignore artwork_item = Item.query.get(item_id)
if artwork_item is None: if artwork_item is None:
artwork_entity = mediawiki.get_entity_with_cache(f"Q{item_id}") artwork_entity = mediawiki.get_entity_with_cache(f"Q{item_id}")
artwork_item = Item( artwork_item = Item(item_id=item_id, entity=artwork_entity)
item_id=item_id, entity=typing.cast(dict[str, str], artwork_entity)
)
database.session.add(artwork_item) database.session.add(artwork_item)
database.session.commit() database.session.commit()
for depicts_qid in depicts: for depicts_qid in depicts:
depicts_id = int(depicts_qid[1:]) depicts_id = int(depicts_qid[1:])
depicts_item = DepictsItem.query.get(depicts_id) # type: ignore depicts_item = DepictsItem.query.get(depicts_id)
if depicts_item is None: if depicts_item is None:
depicts_item = wikidata_edit.create_depicts_item(depicts_id) depicts_item = wikidata_edit.create_depicts_item(depicts_id)
database.session.add(depicts_item) database.session.add(depicts_item)
@ -238,24 +247,24 @@ def save(item_id: int) -> str | Response:
@app.route("/settings", methods=["GET", "POST"]) @app.route("/settings", methods=["GET", "POST"])
def user_settings() -> str: def user_settings():
return render_template("user_settings.html") return render_template("user_settings.html")
@app.route("/test/lookup") @app.route("/test/lookup")
def test_lookup_page() -> str: def test_lookup_page():
return render_template("test_lookup.html") return render_template("test_lookup.html")
@app.route("/property/P<int:property_id>") @app.route("/property/P<int:property_id>")
def property_query_page(property_id: int) -> str: def property_query_page(property_id):
pid = f"P{property_id}" pid = f"P{property_id}"
g.title = find_more_props[pid] g.title = find_more_props[pid]
sort = request.args.get("sort") sort = request.args.get("sort")
sort_by_name = sort and sort.lower().strip() == "name" sort_by_name = sort and sort.lower().strip() == "name"
q = ( q = (
database.session.query( # type: ignore database.session.query(
Triple.object_id, func.count(func.distinct(Triple.subject_id)).label("c") Triple.object_id, func.count(func.distinct(Triple.subject_id)).label("c")
) )
.filter_by(predicate_id=property_id) .filter_by(predicate_id=property_id)
@ -293,19 +302,15 @@ def property_query_page(property_id: int) -> str:
@app.route("/") @app.route("/")
def start() -> Response: def start():
return random_artwork() return random_artwork()
@app.route("/next") @app.route("/next")
def random_artwork() -> Response: def random_artwork():
found = None found = None
while True: while True:
q = ( q = Item.query.filter_by(is_artwork=True).order_by(func.random()).limit(30)
Item.query.filter_by(is_artwork=True) # type: ignore
.order_by(func.random())
.limit(30)
)
for item in q: for item in q:
has_depicts = "P180" in item.entity["claims"] has_depicts = "P180" in item.entity["claims"]
if has_depicts: if has_depicts:
@ -320,7 +325,7 @@ def random_artwork() -> Response:
@app.route("/oauth/start") @app.route("/oauth/start")
def start_oauth() -> Response: def start_oauth():
next_page = request.args.get("next") next_page = request.args.get("next")
if next_page: if next_page:
session["after_login"] = next_page session["after_login"] = next_page
@ -344,7 +349,7 @@ def start_oauth() -> Response:
@app.route("/oauth/callback", methods=["GET"]) @app.route("/oauth/callback", methods=["GET"])
def oauth_callback() -> Response: def oauth_callback():
base_url = "https://www.wikidata.org/w/index.php" base_url = "https://www.wikidata.org/w/index.php"
client_key = app.config["CLIENT_KEY"] client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config["CLIENT_SECRET"]
@ -376,17 +381,17 @@ def oauth_callback() -> Response:
@app.route("/oauth/disconnect") @app.route("/oauth/disconnect")
def oauth_disconnect() -> Response: def oauth_disconnect():
for key in "owner_key", "owner_secret", "username", "after_login": for key in "owner_key", "owner_secret", "username", "after_login":
if key in session: if key in session:
del session[key] del session[key]
return redirect(url_for("browse_page")) return redirect(url_for("browse_page"))
def create_claim(artwork_id: int, depicts_id: int, token: str) -> requests.Response: def create_claim(artwork_id, depicts_id, token):
artwork_qid = f"Q{artwork_id}" artwork_qid = f"Q{artwork_id}"
value = json.dumps({"entity-type": "item", "numeric-id": depicts_id}) value = json.dumps({"entity-type": "item", "numeric-id": depicts_id})
params: dict[str, str | int] = { params = {
"action": "wbcreateclaim", "action": "wbcreateclaim",
"entity": artwork_qid, "entity": artwork_qid,
"property": "P180", "property": "P180",
@ -396,11 +401,10 @@ def create_claim(artwork_id: int, depicts_id: int, token: str) -> requests.Respo
"format": "json", "format": "json",
"formatversion": 2, "formatversion": 2,
} }
r: requests.Response = wikidata_oauth.api_post_request(params) return wikidata_oauth.api_post_request(params)
return r
def image_with_cache(qid: str, image_filename: str, width: int) -> dict[str, str]: def image_with_cache(qid, image_filename, width):
filename = f"cache/{qid}_{width}_image.json" filename = f"cache/{qid}_{width}_image.json"
detail = json.load(open(filename)) if os.path.exists(filename) else {} detail = json.load(open(filename)) if os.path.exists(filename) else {}
@ -412,11 +416,10 @@ def image_with_cache(qid: str, image_filename: str, width: int) -> dict[str, str
detail = commons.image_detail([image_filename], thumbwidth=width) detail = commons.image_detail([image_filename], thumbwidth=width)
json.dump(detail, open(filename, "w"), indent=2) json.dump(detail, open(filename, "w"), indent=2)
image: dict[str, str] = detail.get(image_filename) return detail.get(image_filename)
return image
def existing_depicts_from_entity(entity: Entity) -> list[dict[str, typing.Any]]: def existing_depicts_from_entity(entity):
if "P180" not in entity["claims"]: if "P180" not in entity["claims"]:
return [] return []
existing = [] existing = []
@ -427,7 +430,7 @@ def existing_depicts_from_entity(entity: Entity) -> list[dict[str, typing.Any]]:
item_id = claim["mainsnak"]["datavalue"]["value"]["numeric-id"] item_id = claim["mainsnak"]["datavalue"]["value"]["numeric-id"]
item = DepictsItem.query.get(item_id) # type: ignore item = DepictsItem.query.get(item_id)
if not item: if not item:
item = wikidata_edit.create_depicts_item(item_id) item = wikidata_edit.create_depicts_item(item_id)
database.session.add(item) database.session.add(item)
@ -445,32 +448,27 @@ def existing_depicts_from_entity(entity: Entity) -> list[dict[str, typing.Any]]:
return existing return existing
def get_institution(entity: Entity, other: Mapping[str, str | None]) -> str | None: def get_institution(entity, other):
if "P276" in entity["claims"]: if "P276" in entity["claims"]:
location = wikibase.first_datavalue(entity, "P276") location = wikibase.first_datavalue(entity, "P276")
assert isinstance(location, dict)
if location: if location:
return typing.cast(str, other.get(location["id"])) return other.get(location["id"])
if "P195" in entity["claims"]: if "P195" in entity["claims"]:
collection = wikibase.first_datavalue(entity, "P195") collection = wikibase.first_datavalue(entity, "P195")
assert isinstance(collection, dict)
if collection: if collection:
return typing.cast(str, other.get(collection["id"])) return other.get(collection["id"])
return None
@app.route("/item/Q<int:item_id>") @app.route("/item/Q<int:item_id>")
def item_page(item_id: int) -> str | Response: def item_page(item_id):
qid = f"Q{item_id}" qid = f"Q{item_id}"
g.qid = qid g.qid = qid
item = artwork.Artwork(qid) item = artwork.Artwork(qid)
from_redirect = qid in session and session.pop(qid) == "from redirect" from_redirect = qid in session and session.pop(qid) == "from redirect"
entity = mediawiki.get_entity_with_cache(qid, refresh=not from_redirect) entity = mediawiki.get_entity_with_cache(qid, refresh=not from_redirect)
assert entity
if "redirects" in entity: if "redirects" in entity:
redirect_to_item_id = int(entity["redirects"]["to"][1:]) redirect_to_item_id = int(entity["redirects"]["to"][1:])
assert request.endpoint
return redirect(url_for(request.endpoint, item_id=redirect_to_item_id)) return redirect(url_for(request.endpoint, item_id=redirect_to_item_id))
existing_depicts = existing_depicts_from_entity(entity) existing_depicts = existing_depicts_from_entity(entity)
@ -496,7 +494,7 @@ def item_page(item_id: int) -> str | Response:
label_languages = label_and_language["languages"] if label_and_language else [] label_languages = label_and_language["languages"] if label_and_language else []
show_translation_links = all(lang.code != "en" for lang in label_languages) show_translation_links = all(lang.code != "en" for lang in label_languages)
artwork_item = Item.query.get(item_id) # type: ignore artwork_item = Item.query.get(item_id)
if artwork_item is None: if artwork_item is None:
if not wdqs.is_artificial_physical_object(qid): if not wdqs.is_artificial_physical_object(qid):
return render_template( return render_template(
@ -519,7 +517,7 @@ def item_page(item_id: int) -> str | Response:
artwork_item = Item( artwork_item = Item(
item_id=item_id, item_id=item_id,
entity=typing.cast(dict[str, str], entity), entity=entity,
lastrevid=entity["lastrevid"], lastrevid=entity["lastrevid"],
modified=modified, modified=modified,
) )
@ -527,9 +525,7 @@ def item_page(item_id: int) -> str | Response:
catalog = wd_catalog.get_catalog_from_artwork(entity) catalog = wd_catalog.get_catalog_from_artwork(entity)
if not catalog.get("institution"): if not catalog.get("institution"):
institution = get_institution(entity, other) catalog["institution"] = get_institution(entity, other)
assert institution
catalog["institution"] = institution
return render_template( return render_template(
"item.html", "item.html",
@ -552,13 +548,11 @@ def item_page(item_id: int) -> str | Response:
) )
def get_languages(codes: typing.Iterable[str]) -> typing.Any: def get_languages(codes):
return Language.query.filter( # type: ignore return Language.query.filter(Language.wikimedia_language_code.in_(codes))
Language.wikimedia_language_code.in_(codes)
)
def get_entity_label_and_language(entity: Entity) -> dict[str, typing.Any] | None: def get_entity_label_and_language(entity):
""" """
Look for a useful label and return it with a list of languages that have that label. Look for a useful label and return it with a list of languages that have that label.
@ -579,10 +573,8 @@ def get_entity_label_and_language(entity: Entity) -> dict[str, typing.Any] | Non
label, languages = list(group_by_label.items())[0] label, languages = list(group_by_label.items())[0]
return {"label": label, "languages": get_languages(languages)} return {"label": label, "languages": get_languages(languages)}
return None
def get_labels(keys, name=None):
def get_labels(keys: typing.Iterable[str], name: str | None = None) -> dict[str, str]:
keys = sorted(keys, key=lambda i: int(i[1:])) keys = sorted(keys, key=lambda i: int(i[1:]))
if name is None: if name is None:
name = hashlib.md5("_".join(keys).encode("utf-8")).hexdigest() name = hashlib.md5("_".join(keys).encode("utf-8")).hexdigest()
@ -598,13 +590,10 @@ def get_labels(keys: typing.Iterable[str], name: str | None = None) -> dict[str,
json.dump({"keys": keys, "labels": labels}, open(filename, "w"), indent=2) json.dump({"keys": keys, "labels": labels}, open(filename, "w"), indent=2)
return { return {entity["id"]: wikibase.get_entity_label(entity) for entity in labels}
entity["id"]: wikibase.get_entity_label(entity) or "no English label"
for entity in labels
}
def get_labels_db(keys: Iterable[str]): def get_labels_db(keys):
keys = set(keys) keys = set(keys)
labels = {} labels = {}
missing = set() missing = set()
@ -612,7 +601,7 @@ def get_labels_db(keys: Iterable[str]):
m = re_qid.match(qid) m = re_qid.match(qid)
if m: if m:
item_id = int(m.group(1)) item_id = int(m.group(1))
item = Item.query.get(item_id) # type: ignore item = Item.query.get(item_id)
if item: if item:
labels[qid] = item.label labels[qid] = item.label
continue continue
@ -630,8 +619,8 @@ def get_labels_db(keys: Iterable[str]):
modified = datetime.strptime(entity["modified"], "%Y-%m-%dT%H:%M:%SZ") modified = datetime.strptime(entity["modified"], "%Y-%m-%dT%H:%M:%SZ")
# FIXME: check if the item is an artwork and set is_artwork correctly # FIXME: check if the item is an artwork and set is_artwork correctly
item = Item( # type: ignore item = Item(
item_id=int(qid[1:]), item_id=qid[1:],
entity=entity, entity=entity,
lastrevid=entity["lastrevid"], lastrevid=entity["lastrevid"],
modified=modified, modified=modified,
@ -646,7 +635,7 @@ def get_labels_db(keys: Iterable[str]):
return labels return labels
def build_other_set(entity: Entity) -> set[str]: def build_other_set(entity):
other_items = set() other_items = set()
for key in find_more_props.keys(): for key in find_more_props.keys():
for claim in entity["claims"].get(key, []): for claim in entity["claims"].get(key, []):
@ -656,24 +645,20 @@ def build_other_set(entity: Entity) -> set[str]:
return other_items return other_items
def get_other(entity: Entity) -> Mapping[str, str | None]: def get_other(entity):
other_items = build_other_set(entity) other_items = build_other_set(entity)
return get_labels(other_items) return get_labels(other_items)
@app.route("/edits") @app.route("/edits")
def list_edits() -> str: def list_edits():
q = Edit.query.order_by(Edit.timestamp.desc()) # type: ignore q = Edit.query.order_by(Edit.timestamp.desc())
page = utils.get_int_arg("page") or 1 page = utils.get_int_arg("page") or 1
pager = Pagination(page, 100, q.count()) pager = Pagination(page, 100, q.count())
item_count = database.session.query( item_count = database.session.query(func.count(distinct(Edit.artwork_id))).scalar()
func.count(distinct(Edit.artwork_id))
).scalar() # type: ignore
user_count = database.session.query( user_count = database.session.query(func.count(distinct(Edit.username))).scalar()
func.count(distinct(Edit.username))
).scalar() # type: ignore
return render_template( return render_template(
"list_edits.html", "list_edits.html",
@ -685,13 +670,11 @@ def list_edits() -> str:
@app.route("/user/<username>") @app.route("/user/<username>")
def user_page(username: str) -> str: def user_page(username):
edit_list = Edit.query.filter_by(username=username).order_by( # type: ignore edit_list = Edit.query.filter_by(username=username).order_by(Edit.timestamp.desc())
Edit.timestamp.desc()
)
item_count = ( item_count = (
database.session.query(func.count(distinct(Edit.artwork_id))) # type: ignore database.session.query(func.count(distinct(Edit.artwork_id)))
.filter_by(username=username) .filter_by(username=username)
.scalar() .scalar()
) )
@ -699,22 +682,20 @@ def user_page(username: str) -> str:
return render_template( return render_template(
"user_page.html", "user_page.html",
username=username, username=username,
edits=Edit.query, # type: ignore edits=Edit.query,
edit_list=edit_list, edit_list=edit_list,
item_count=item_count, item_count=item_count,
) )
@app.route("/next/Q<int:item_id>") @app.route("/next/Q<int:item_id>")
def next_page(item_id: int) -> str: def next_page(item_id):
qid = f"Q{item_id}" qid = f"Q{item_id}"
entity = mediawiki.get_entity_with_cache(qid) entity = mediawiki.get_entity_with_cache(qid)
assert entity
width = 800 width = 800
image_filename = wikibase.first_datavalue(entity, "P18") image_filename = wikibase.first_datavalue(entity, "P18")
assert isinstance(image_filename, str)
image = image_with_cache(qid, image_filename, width) image = image_with_cache(qid, image_filename, width)
label = wikibase.get_entity_label(entity) label = wikibase.get_entity_label(entity)
@ -775,13 +756,13 @@ def next_page(item_id: int) -> str:
@app.route("/P<int:property_id>/Q<int:item_id>") @app.route("/P<int:property_id>/Q<int:item_id>")
def find_more_page(property_id: int, item_id: int) -> Response: def find_more_page(property_id, item_id):
pid, qid = f"P{property_id}", f"Q{item_id}" pid, qid = f"P{property_id}", f"Q{item_id}"
return redirect(url_for("browse_page", **{pid: qid})) # type: ignore return redirect(url_for("browse_page", **{pid: qid}))
@app.route("/toolinfo.json") @app.route("/toolinfo.json")
def tool_info() -> Response: def tool_info():
info = { info = {
"name": "wade", "name": "wade",
"title": "Wikidata Art Depiction Explorer", "title": "Wikidata Art Depiction Explorer",
@ -794,16 +775,14 @@ def tool_info() -> Response:
return jsonify(info) return jsonify(info)
def get_facets(params) -> dict[str, typing.Any]: def get_facets(params):
properties = [pid for pid in find_more_props.keys() if pid not in request.args] properties = [pid for pid in find_more_props.keys() if pid not in request.args]
bindings = wdqs.run_from_template_with_cache( bindings = wdqs.run_from_template_with_cache(
"query/facet.sparql", params=params, isa_list=isa_list, properties=properties "query/facet.sparql", params=params, isa_list=isa_list, properties=properties
) )
facets: dict[str, list[dict[str, str | int]]] = { facets = {key: [] for key in find_more_props.keys()}
key: [] for key in find_more_props.keys()
}
for row in bindings: for row in bindings:
pid = row["property"]["value"].rpartition("/")[2] pid = row["property"]["value"].rpartition("/")[2]
qid = row["object"]["value"].rpartition("/")[2] qid = row["object"]["value"].rpartition("/")[2]
@ -821,7 +800,7 @@ def get_facets(params) -> dict[str, typing.Any]:
} }
def get_artwork_params() -> list[tuple[str, str]]: def get_artwork_params():
params = [] params = []
for pid, qid in request.args.items(): for pid, qid in request.args.items():
m = re_pid.match(pid) m = re_pid.match(pid)
@ -838,14 +817,14 @@ def get_artwork_params() -> list[tuple[str, str]]:
return params return params
def filter_artwork(params: list[tuple[str, str]]) -> list[wdqs.Row]: def filter_artwork(params):
return wdqs.run_from_template_with_cache( return wdqs.run_from_template_with_cache(
"query/find_more.sparql", params=params, isa_list=isa_list "query/find_more.sparql", params=params, isa_list=isa_list
) )
@app.route("/catalog") @app.route("/catalog")
def catalog_page() -> str: def catalog_page():
params = get_artwork_params() params = get_artwork_params()
bindings = filter_artwork(params) bindings = filter_artwork(params)
page = utils.get_int_arg("page") or 1 page = utils.get_int_arg("page") or 1
@ -858,7 +837,7 @@ def catalog_page() -> str:
qids = [f"Q{item_id}" for item_id in sorted(item_ids)] qids = [f"Q{item_id}" for item_id in sorted(item_ids)]
items = [Item.query.get(item_id) for item_id in item_ids] # type: ignore items = [Item.query.get(item_id) for item_id in item_ids]
entities = mediawiki.get_entities_with_cache(qids) entities = mediawiki.get_entities_with_cache(qids)
@ -899,9 +878,7 @@ def catalog_page() -> str:
) )
def get_image_detail_with_cache( def get_image_detail_with_cache(items, cache_name, thumbwidth=None, refresh=False):
items, cache_name: str, thumbwidth: int | None = None, refresh: bool = False
):
filenames = [cur.image_filename() for cur in items] filenames = [cur.image_filename() for cur in items]
if thumbwidth is None: if thumbwidth is None:
@ -925,9 +902,9 @@ def get_image_detail_with_cache(
return detail return detail
def browse_index() -> str: def browse_index():
q = ( q = (
database.session.query( # type: ignore database.session.query(
Triple.predicate_id, func.count(func.distinct(Triple.object_id)) Triple.predicate_id, func.count(func.distinct(Triple.object_id))
) )
.join(Item, Triple.subject_id == Item.item_id) .join(Item, Triple.subject_id == Item.item_id)
@ -941,13 +918,13 @@ def browse_index() -> str:
@app.route("/debug/show_user") @app.route("/debug/show_user")
def debug_show_user() -> str: def debug_show_user():
userinfo = wikidata_oauth.userinfo_call() userinfo = wikidata_oauth.userinfo_call()
return "<pre>" + json.dumps(userinfo, indent=2) + "</pre>" return "<pre>" + json.dumps(userinfo, indent=2) + "</pre>"
@app.route("/browse/facets.json") @app.route("/browse/facets.json")
def browse_facets() -> Response: def browse_facets():
params = get_artwork_params() params = get_artwork_params()
if not params: if not params:
return jsonify(notice="facet criteria missing") return jsonify(notice="facet criteria missing")
@ -963,7 +940,7 @@ def browse_facets() -> Response:
def get_db_items(params): def get_db_items(params):
"""Get items for browse page based on criteria.""" """Get items for browse page based on criteria."""
q = Item.query.filter_by(is_artwork=True) # type: ignore q = Item.query.filter_by(is_artwork=True)
for pid, qid in params: for pid, qid in params:
q = q.join(Triple, Item.item_id == Triple.subject_id, aliased=True).filter( q = q.join(Triple, Item.item_id == Triple.subject_id, aliased=True).filter(
Triple.predicate_id == pid[1:], Triple.object_id == qid[1:] Triple.predicate_id == pid[1:], Triple.object_id == qid[1:]
@ -978,9 +955,7 @@ def get_db_facets(params):
facet_limit = 18 facet_limit = 18
for pid, qid in params: for pid, qid in params:
q = q.join( # type: ignore q = q.join(Triple, t.subject_id == Triple.subject_id, aliased=True).filter(
Triple, t.subject_id == Triple.subject_id, aliased=True
).filter(
Triple.predicate_id == pid[1:], Triple.predicate_id == pid[1:],
Triple.object_id == qid[1:], Triple.object_id == qid[1:],
t.predicate_id != pid[1:], t.predicate_id != pid[1:],
@ -992,9 +967,9 @@ def get_db_facets(params):
results = sorted(tuple(row) for row in q.all()) results = sorted(tuple(row) for row in q.all())
facet_list = {} facet_list = {}
subject_qids: set[str] = set() subject_qids = set()
for predicate_id, x in itertools.groupby(results, lambda row: row[0]): for predicate_id, x in itertools.groupby(results, lambda row: row[0]):
hits = sorted(x, key=lambda row: row[1], reverse=True) hits = sorted(list(x), key=lambda row: row[1], reverse=True)
values = [ values = [
{"count": count, "qid": f"Q{value}"} {"count": count, "qid": f"Q{value}"}
for _, count, value in hits[:facet_limit] for _, count, value in hits[:facet_limit]
@ -1012,7 +987,7 @@ def get_db_facets(params):
@app.route("/browse") @app.route("/browse")
def browse_page() -> str: def browse_page():
page_size = 45 page_size = 45
params = get_artwork_params() params = get_artwork_params()
@ -1076,7 +1051,7 @@ def browse_page() -> str:
@app.route("/find_more.json") @app.route("/find_more.json")
def find_more_json() -> Response: def find_more_json():
pid = request.args.get("pid") pid = request.args.get("pid")
qid_list = request.args.getlist("qid") qid_list = request.args.getlist("qid")
limit = 6 limit = 6
@ -1115,10 +1090,7 @@ def find_more_json() -> Response:
return jsonify(items=items) return jsonify(items=items)
Hit = dict[str, str | int | None] def wikibase_search(terms):
def wikibase_search(terms: str) -> list[Hit]:
hits = [] hits = []
r = mediawiki.api_call( r = mediawiki.api_call(
{ {
@ -1142,10 +1114,10 @@ def wikibase_search(terms: str) -> list[Hit]:
return hits return hits
def add_images_to_depicts_lookup(hits: list[dict[str, str]]) -> None: def add_images_to_depicts_lookup(hits):
qid_to_item = {hit["qid"]: hit for hit in hits} qid_to_item = {hit["qid"]: hit for hit in hits}
all_qids = [hit["qid"] for hit in hits] all_qids = [hit["qid"] for hit in hits]
entities: list[Entity] = mediawiki.get_entities_with_cache(all_qids) entities = mediawiki.get_entities_with_cache(all_qids)
for entity in entities: for entity in entities:
qid = entity["id"] qid = entity["id"]
@ -1172,7 +1144,7 @@ def add_images_to_depicts_lookup(hits: list[dict[str, str]]) -> None:
@app.route("/lookup") @app.route("/lookup")
def depicts_lookup() -> Response: def depicts_lookup():
terms = request.args.get("terms") terms = request.args.get("terms")
if not terms: if not terms:
return jsonify(error="terms parameter is required") return jsonify(error="terms parameter is required")
@ -1187,7 +1159,7 @@ def depicts_lookup() -> Response:
item_ids = [] item_ids = []
hits = [] hits = []
q1 = DepictsItem.query.filter(DepictsItem.label.ilike(terms + "%")) # type: ignore q1 = DepictsItem.query.filter(DepictsItem.label.ilike(terms + "%"))
seen = set() seen = set()
for item in q1: for item in q1:
hit = { hit = {
@ -1201,9 +1173,7 @@ def depicts_lookup() -> Response:
seen.add(item.qid) seen.add(item.qid)
cls = DepictsItemAltLabel cls = DepictsItemAltLabel
q2 = cls.query.filter( # type: ignore q2 = cls.query.filter(cls.alt_label.ilike(terms + "%"), ~cls.item_id.in_(item_ids))
cls.alt_label.ilike(terms + "%"), ~cls.item_id.in_(item_ids)
)
for alt in q2: for alt in q2:
item = alt.item item = alt.item
@ -1238,12 +1208,12 @@ def depicts_lookup() -> Response:
@app.route("/report/missing_image") @app.route("/report/missing_image")
def missing_image_report() -> str: def missing_image_report():
limit = utils.get_int_arg("limit") or 1000 limit = utils.get_int_arg("limit") or 1000
q = DepictsItem.query.order_by(DepictsItem.count.desc()).limit(limit) # type:ignore q = DepictsItem.query.order_by(DepictsItem.count.desc()).limit(limit)
qids = [item.qid for item in q] qids = [item.qid for item in q]
entities: dict[str, Entity] = mediawiki.get_entities_dict_with_cache(qids) entities = mediawiki.get_entities_dict_with_cache(qids)
item_list = [] item_list = []
@ -1259,19 +1229,19 @@ def missing_image_report() -> str:
@app.route("/report/wdqs") @app.route("/report/wdqs")
def wikidata_query_list() -> str: def wikidata_query_list():
q = WikidataQuery.query.order_by(WikidataQuery.start_time.desc()) # type: ignore q = WikidataQuery.query.order_by(WikidataQuery.start_time.desc())
return render_template("query_list.html", q=q) return render_template("query_list.html", q=q)
@app.route("/report/blocks") @app.route("/report/blocks")
def server_block_report() -> str: def server_block_report():
check_for_blocks() check_for_blocks()
return render_template("block_report.html") return render_template("block_report.html")
@app.route("/fixture/save_error") @app.route("/fixture/save_error")
def save_error_fixture() -> str: def save_error_fixture():
error = fixtures.save_error()["error"] error = fixtures.save_error()["error"]
return render_template("save_error.html", error=error) return render_template("save_error.html", error=error)

View file

@ -1,150 +1,106 @@
"""Class to represent artwork."""
from . import mediawiki from . import mediawiki
from .type import Claims, Entity, Sitelinks
class Artwork: class Artwork:
"""Artwork.""" def __init__(self, qid):
self.entity = mediawiki.get_entity_with_cache(qid)
entity: Entity
artist_entities: list[Entity]
def __init__(self, qid: str) -> None:
"""Init."""
entity = mediawiki.get_entity_with_cache(qid)
assert entity
self.entity = entity
self.item_id = int(qid[1:]) self.item_id = int(qid[1:])
sites = ["commons", "enwiki"] sites = ['commons', 'enwiki']
self.parent_categories = {site: {} for site in sites} self.parent_categories = {site: {} for site in sites}
@property @property
def image_filename(self) -> str | None: def image_filename(self):
"""Image filename.""" if 'P18' in self.entity['claims']:
if "P18" in self.entity["claims"]: return self.entity['claims']['P18'][0]['mainsnak']['datavalue']['value']
f: str = self.entity["claims"]["P18"][0]["mainsnak"]["datavalue"]["value"]
return f
else:
return None
@property @property
def display_title(self) -> str: def display_title(self):
"""Display title.""" if 'en' not in self.entity['labels']:
return ( return self.qid
f"{self.en_title} ({self.qid})" return f'{self.en_title} ({self.qid})'
if "en" in self.entity["labels"]
else self.qid
)
@property @property
def url(self) -> str: def url(self):
"""Wikidata item URL.""" return 'https://www.wikidata.org/wiki/' + self.qid
return "https://www.wikidata.org/wiki/" + self.qid
def get_artist_entities(self) -> None: def get_artist_entities(self):
"""Get artist entities."""
self.artist_entities = [] self.artist_entities = []
for artist in self.artists_claim: for artist in self.artists_claim:
artist_entity = mediawiki.get_entity(artist["id"]) artist_qid = artist['id']
assert artist_entity self.artist_entities.append(mediawiki.get_entity(artist_qid))
self.artist_entities.append(artist_entity)
def artist_labels(self) -> list[str]: def artist_labels(self):
"""Get artist Wikidata item labels.""" if not hasattr(self, 'artist_entities'):
if not hasattr(self, "artist_entities"):
self.get_artist_entities() self.get_artist_entities()
return [artist["labels"]["en"]["value"] for artist in self.artist_entities] return [artist['labels']['en']['value'] for artist in self.artist_entities]
@property @property
def commons_cats(self) -> list[str]: def commons_cats(self):
return [ return [i['mainsnak']['datavalue']['value']
i["mainsnak"]["datavalue"]["value"] for i in self.entity['claims'].get('P373', [])]
for i in self.entity["claims"].get("P373", [])
]
@property @property
def commons_sitelink(self) -> str | None: def commons_sitelink(self):
"""Wikimedia sitelink.""" return self.sitelinks['commons']['value'] if 'commons' in self.sitelinks else None
return (
self.sitelinks["commons"]["value"] if "commons" in self.sitelinks else None
)
@property @property
def en_title(self) -> str: def en_title(self):
if "en" in self.entity["labels"]: if 'en' in self.entity['labels']:
title: str = self.entity["labels"]["en"]["value"] return self.entity['labels']['en']['value']
return title
else: else:
return self.qid return self.qid
@property @property
def artists_claim(self) -> list[dict[str, str]]: def artists_claim(self):
return [ return [image['mainsnak']['datavalue']['value']
image["mainsnak"]["datavalue"]["value"] for image in self.entity['claims'].get('P170', [])]
for image in self.entity["claims"].get("P170", [])
]
@property @property
def artists(self) -> list[dict[str, str]]: def artists(self):
if not hasattr(self, "artist_entities"): if not hasattr(self, 'artist_entities'):
self.get_artist_entities() self.get_artist_entities()
items = [ items = [image['mainsnak']['datavalue']['value']
image["mainsnak"]["datavalue"]["value"] for image in self.entity['claims'].get('P170', [])]
for image in self.entity["claims"].get("P170", [])
]
lookup = {artist["id"]: artist["labels"] for artist in self.artist_entities} lookup = {artist['id']: artist['labels'] for artist in self.artist_entities}
for item in items: for item in items:
item["labels"] = lookup[item["id"]] item['labels'] = lookup[item['id']]
return items return items
@property @property
def qid(self) -> str: def qid(self):
"""Wikidata item QID.""" return f'Q{self.item_id}'
return f"Q{self.item_id}"
@property @property
def commons_filenames(self) -> list[str]: def commons_filenames(self):
"""Filenames of images on Wikimedia Commons.""" return [image['mainsnak']['datavalue']['value']
return [ for image in self.entity['claims'].get('P18', [])]
image["mainsnak"]["datavalue"]["value"]
for image in self.entity["claims"].get("P18", [])
]
def commons_cat_from_sitelink(self) -> str | None: def commons_cat_from_sitelink(self):
ns = "Category:" ns = 'Category:'
if not self.commons_sitelink or not self.commons_sitelink.startswith(ns): if not self.commons_sitelink or not self.commons_sitelink.startswith(ns):
return None return
return self.commons_sitelink[len(ns) :] return self.commons_sitelink[len(ns):]
@property @property
def enwiki_url(self) -> str | None: def enwiki_url(self):
"""URL for English Wikipedia article about artwork."""
enwiki = self.enwiki enwiki = self.enwiki
assert enwiki if not enwiki:
return ( return
"https://en.wikipedia.org/wiki/" + enwiki.replace(" ", "_") return 'https://en.wikipedia.org/wiki/' + enwiki.replace(' ', '_')
if not enwiki
else None
)
@property @property
def sitelinks(self) -> Sitelinks: def sitelinks(self):
"""Item sitelinks.""" return self.entity['sitelinks']
return self.entity["sitelinks"]
@property @property
def claims(self) -> Claims: def claims(self):
"""Item claims.""" return self.entity['claims']
return self.entity["claims"]
@property @property
def enwiki(self) -> str | None: def enwiki(self):
"""Article title on English Wikipedia.""" return self.sitelinks['enwiki']['title'] if 'enwiki' in self.sitelinks else None
return self.sitelinks["enwiki"]["title"] if "enwiki" in self.sitelinks else None

View file

@ -1,51 +1,32 @@
"""Barnes Foundation (Q808462) - art museum in Philadephia, Pennsylvania."""
import json
import os
import typing
import requests import requests
import os
import json
from .type import CatalogDict def get_json(catalog_id):
filename = f'cache/barnesfoundation_{catalog_id}.html'
JsonData = dict[str, dict[str, typing.Any]] url = 'https://collection.barnesfoundation.org/api/search'
body = {"query": {"bool": {"filter": {"exists": {"field": "imageSecret"}},
def get_json(catalog_id: str | int) -> JsonData: "must": {"match": {"_id": int(catalog_id)}}}}}
"""Get JSON from website and cache."""
filename = f"cache/barnesfoundation_{catalog_id}.html"
url = "https://collection.barnesfoundation.org/api/search"
body = {
"query": {
"bool": {
"filter": {"exists": {"field": "imageSecret"}},
"must": {"match": {"_id": int(catalog_id)}},
}
}
}
if os.path.exists(filename): if os.path.exists(filename):
return typing.cast(JsonData, json.load(open(filename))) return json.load(open(filename))
r = requests.get(url, params={"body": json.dumps(body)}) else:
r = requests.get(url, params={'body': json.dumps(body)})
print(r.url) print(r.url)
open(filename, "w").write(r.text) open(filename, 'w').write(r.text)
return typing.cast(JsonData, r.json()) return r.json()
def parse_catalog(data):
def parse_catalog(data: JsonData) -> CatalogDict: hit = data['hits']['hits'][0]['_source']
"""Parse catalog JSON."""
hit = data["hits"]["hits"][0]["_source"]
return { return {
"institution": "Barnes Foundation", 'institution': 'Barnes Foundation',
"description": hit["shortDescription"], 'description': hit['shortDescription'],
"keywords": [tag["tag"] for tag in hit["tags"]], 'keywords': [tag['tag'] for tag in hit['tags']],
} }
def get_catalog(catalog_id):
def get_catalog(catalog_id: str | int) -> CatalogDict:
"""Lookup artwork using catalog ID and return keywords."""
data = get_json(catalog_id) data = get_json(catalog_id)
return parse_catalog(data) return parse_catalog(data)

View file

@ -1,71 +1,49 @@
import calendar from . import utils
import re import re
import calendar
from . import artwork, utils month_pattern = '|'.join(m for m in calendar.month_name if m)
re_date_based = re.compile(r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) ')
month_pattern = "|".join(m for m in calendar.month_name if m)
re_date_based = re.compile(
r"^(\d{4}-\d{2}-\d{2}|("
+ month_pattern
+ r") \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) "
)
ns_cat = "Category:"
ns_cat = 'Category:'
class Category: class Category:
"""Category.""" def __init__(self, title, site):
title: str
site: str
item: artwork.Artwork | None
def __init__(self, title: str, site: str):
"""Init."""
if title.startswith(ns_cat): if title.startswith(ns_cat):
title = title[len(ns_cat) :] title = title[len(ns_cat):]
self.title = title self.title = title
self.site = site self.site = site
self.item = None self.item = None
def __repr__(self) -> str: def __repr__(self):
"""Repr.""" return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'
return f"{self.__class__.__name__}({self.title!r}, {self.site!r})"
def set_item(self, item: artwork.Artwork | None) -> None: def set_item(self, item):
self.item = item self.item = item
@property @property
def url(self) -> str | None: def url(self):
"""Category URL.""" return utils.wiki_url(self.title, self.site, ns='Category')
assert self.title and self.site
return utils.wiki_url(self.title, self.site, ns="Category")
def date_based(self) -> bool: def date_based(self):
"""Category title is date based."""
return bool(re_date_based.match(self.title)) return bool(re_date_based.match(self.title))
def contains_artist_name(self) -> bool: def contains_artist_name(self):
"""Category title contains artists name."""
if not self.item: if not self.item:
return False return
return any( return any(artist.lower() in self.title.lower()
artist.lower() in self.title.lower() for artist in self.item.artist_labels() for artist in self.item.artist_labels())
)
def parents(self) -> list["Category"]: def parents(self):
"""Parent categories."""
if not self.item: if not self.item:
return [] return []
return self.item.parent_categories[self.site].get(self.title, []) return self.item.parent_categories[self.site].get(self.title, [])
def is_exhibition(self) -> bool: def is_exhibition(self):
"""Category represents art exhibition.""" return any(parent.title.startswith('Art exhibitions ')
return any( for parent in self.parents())
parent.title.startswith("Art exhibitions ") for parent in self.parents()
)
def names_for_wikidata(self) -> list[str]: def names_for_wikidata(self):
highlight = self.check() highlight = self.check()
interesting = len(highlight) > 1 interesting = len(highlight) > 1
@ -80,7 +58,7 @@ class Category:
continue continue
title = text.strip() title = text.strip()
title = title[0].upper() + title[1:] title = title[0].upper() + title[1:]
for sep in " with ", " at ", " wearing ": for sep in ' with ', ' at ', ' wearing ':
if sep in title: if sep in title:
before, _, after = title.partition(sep) before, _, after = title.partition(sep)
names = [] names = []
@ -88,76 +66,44 @@ class Category:
names += utils.also_singular(x) names += utils.also_singular(x)
return names return names
return utils.also_singular(title) return utils.also_singular(title)
return []
def urls_for_wikidata(self) -> list[str]: def urls_for_wikidata(self):
return [ return [utils.wiki_url(name, self.site, ns='Category')
utils.wiki_url(name, self.site, ns="Category") for name in self.names_for_wikidata()]
for name in self.names_for_wikidata()
]
def check(self) -> list[tuple[bool, str]]: def check(self):
cat = self.title cat = self.title
lc_cat = cat.lower() lc_cat = cat.lower()
by_endings = [ by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
"title", 'medium', 'year', 'painter']
"technique",
"period",
"century",
"country",
"movement",
"medium",
"year",
"painter",
]
if self.item: if self.item:
by_endings += self.item.artist_labels() by_endings += self.item.artist_labels()
for after in ( for after in ('in art', 'in portrait paintings', 'in landscape paintings', 'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2', 'looking at viewer'):
"in art",
"in portrait paintings",
"in landscape paintings",
"in culture",
"in popular culture",
"in painting",
"in 1",
"in 2",
"looking at viewer",
):
pos = lc_cat.find(after) pos = lc_cat.find(after)
# don't highlight "1512 in art" # don't highlight "1512 in art"
if pos == -1 or cat[: pos - 1].isdigit(): if pos == -1 or cat[:pos - 1].isdigit():
continue continue
return [(True, cat[:pos]), (False, cat[pos:])] return [(True, cat[:pos]), (False, cat[pos:])]
for before in ( for before in ('paintings of', 'portraits of', 'landscapes of',
"paintings of", 'portraits with', 'paintings with', 'paintings depicting',
"portraits of", 'portraits depicting', 'landscapes depicting', 'works about'):
"landscapes of",
"portraits with",
"paintings with",
"paintings depicting",
"portraits depicting",
"landscapes depicting",
"works about",
):
pos = lc_cat.find(before) pos = lc_cat.find(before)
if pos == -1: if pos == -1:
continue continue
pos += len(before) pos += len(before)
for by_ending in by_endings: for by_ending in by_endings:
ending = " by " + by_ending ending = ' by ' + by_ending
if lc_cat.endswith(ending): if lc_cat.endswith(ending):
return [ return [(False, cat[:pos]),
(False, cat[:pos]), (True, cat[pos:-len(ending)]),
(True, cat[pos : -len(ending)]), (False, cat[-len(ending):])]
(False, cat[-len(ending) :]),
]
return [(False, cat[:pos]), (True, cat[pos:])] return [(False, cat[:pos]), (True, cat[pos:])]
pos = lc_cat.find("of ") pos = lc_cat.find('of ')
if pos != -1: if pos != -1:
return [(True, cat[:pos]), (False, cat[pos:])] return [(True, cat[:pos]), (False, cat[pos:])]

View file

@ -1,43 +1,36 @@
"""Wikimedia Commons API call."""
from . import mediawiki, utils from . import mediawiki, utils
from .type import CallParams
commons_url = "https://commons.wikimedia.org/w/api.php" commons_url = 'https://commons.wikimedia.org/w/api.php'
page_size = 50 page_size = 50
def image_detail(filenames, thumbheight=None, thumbwidth=None):
def image_detail(
filenames: list[str] | str,
thumbheight: int | None = None,
thumbwidth: int | None = None,
) -> dict[str, dict[str, str]]:
"""Get image detail from Wikimedia Commons."""
if not isinstance(filenames, list): if not isinstance(filenames, list):
filenames = [filenames] filenames = [filenames]
if not filenames: if not filenames:
return {} return {}
params: CallParams = { params = {
"action": "query", 'action': 'query',
"prop": "imageinfo", 'prop': 'imageinfo',
"iiprop": "url", 'iiprop': 'url',
} }
if thumbheight is not None: if thumbheight is not None:
params["iiurlheight"] = thumbheight params['iiurlheight'] = thumbheight
if thumbwidth is not None: if thumbwidth is not None:
params["iiurlwidth"] = thumbwidth params['iiurlwidth'] = thumbwidth
images = {} images = {}
for cur in utils.chunk(filenames, page_size): for cur in utils.chunk(filenames, page_size):
call_params = params.copy() call_params = params.copy()
call_params["titles"] = "|".join(f"File:{f}" for f in cur) call_params['titles'] = '|'.join(f'File:{f}' for f in cur)
r = mediawiki.api_post(call_params, api_url=commons_url) r = mediawiki.api_post(call_params, api_url=commons_url)
for image in r.json()["query"]["pages"]: for image in r.json()['query']['pages']:
filename = utils.drop_start(image["title"], "File:") filename = utils.drop_start(image['title'], 'File:')
images[filename] = image["imageinfo"][0] if "imageinfo" in image else None images[filename] = image['imageinfo'][0] if 'imageinfo' in image else None
return images return images

View file

@ -1,35 +1,21 @@
"""Interact with SQL database."""
import typing
import flask
import sqlalchemy
from sqlalchemy import create_engine, func from sqlalchemy import create_engine, func
from sqlalchemy.orm import scoped_session, sessionmaker from sqlalchemy.orm import scoped_session, sessionmaker
session = scoped_session(sessionmaker()) session = scoped_session(sessionmaker())
def init_db(db_url):
session.configure(bind=get_engine(db_url))
def get_engine(db_url: str) -> sqlalchemy.engine.base.Engine: def get_engine(db_url):
"""Create an engine object."""
return create_engine(db_url, pool_recycle=3600, pool_size=20, max_overflow=40) return create_engine(db_url, pool_recycle=3600, pool_size=20, max_overflow=40)
def init_app(app, echo=False):
db_url = app.config['DB_URL']
session.configure(bind=get_engine(db_url, echo=echo))
def init_db(db_url: str) -> None: @app.teardown_appcontext
"""Initialise database.""" def shutdown_session(exception=None):
session.configure(bind=get_engine(db_url)) # type:ignore session.remove()
def now_utc():
def init_app(app: flask.app.Flask) -> None: return func.timezone('utc', func.now())
"""Initialise database connection within flask app."""
db_url = app.config["DB_URL"]
session.configure(bind=get_engine(db_url)) # type: ignore
@app.teardown_appcontext # type: ignore
def shutdown_session(exception: Exception | None = None) -> None:
session.remove() # type: ignore
def now_utc() -> typing.Any:
"""Get current time in UTC."""
return func.timezone("utc", func.now())

View file

@ -1,55 +1,55 @@
"""Detroit Institute of Arts (Q1201549) - art museum in Detroit, Michigan.""" import requests
import lxml.html
import os import os
import re import re
import lxml.html re_url = re.compile(r'https?://www.dia.org/art/collection/object/(.+)$')
import requests
from .type import CatalogDict def get_html(url):
re_url = re.compile(r"https?://www.dia.org/art/collection/object/(.+)$")
def get_html(url: str) -> str | None:
"""Get HTML from web catalog."""
m = re_url.search(url) m = re_url.search(url)
if not m: if not m:
return None return
catalog_id = m.group(1).replace("/", "_") catalog_id = m.group(1).replace('/', '_')
filename = f"cache/dia_{catalog_id}.html" filename = f'cache/dia_{catalog_id}.html'
if os.path.exists(filename): if os.path.exists(filename):
html = open(filename).read() html = open(filename).read()
else: else:
r = requests.get(url) r = requests.get(url)
html = r.text html = r.text
open(filename, "w").write(html) open(filename, 'w').write(html)
return html return html
def parse_html(html):
def parse_html(html: str) -> CatalogDict:
"""Parse HTML and extract keywords."""
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
keywords = [] keywords = []
for a in root.findall(".//a[@href]"): for a in root.findall('.//a[@href]'):
href = a.get("href") href = a.get('href')
assert href is not None if not href.startswith('/art/collection?keys='):
if not href.startswith("/art/collection?keys="): continue
keywords.append(a.text)
if False:
sidebar = root.find('.//aside[@id="sidebar"]')
h2_list = sidebar.findall('.//h2')
h2_keyword = next((h2 for h2 in h2_list if h2.text == 'Keywords'), None)
if not h2_keyword:
return {}
keyword_div = h2_keyword.getparent()
for a in keyword_div:
if a.tag != 'a':
continue continue
assert a.text
keywords.append(a.text) keywords.append(a.text)
return { return {
"institution": "Detroit Institute of Arts", 'institution': 'Detroit Institute of Arts',
"keywords": keywords, 'keywords': keywords,
} }
def get_catalog(url):
def get_catalog(url: str) -> CatalogDict | None:
"""Get catalog web page and extract keywords."""
html = get_html(url) html = get_html(url)
return parse_html(html) if html else None if html:
return parse_html(html)

View file

@ -1,48 +1,32 @@
"""Send mail to admin when an error happens."""
import logging import logging
from logging import Formatter
from logging.handlers import SMTPHandler from logging.handlers import SMTPHandler
from logging import Formatter
from flask import request, g
import flask PROJECT = 'depicts'
from flask import g, request
PROJECT = "depicts" class MatcherSMTPHandler(SMTPHandler):
def getSubject(self, record): # noqa: N802
subject = (f'{PROJECT} error: {record.exc_info[0].__name__}'
class MySMTPHandler(SMTPHandler):
"""Custom SMTP handler to change mail subject."""
def getSubject(self, record: logging.LogRecord) -> str:
"""Specify subject line for error mails."""
subject = (
f"{PROJECT} error: {record.exc_info[0].__name__}"
if (record.exc_info and record.exc_info[0]) if (record.exc_info and record.exc_info[0])
else f"{PROJECT} error: {record.pathname}:{record.lineno:d}" else f'{PROJECT} error: {record.pathname}:{record.lineno:d}')
)
if qid := getattr(g, "qid", None): if qid := getattr(g, 'qid', None):
subject += f" {qid}" subject += f' {qid}'
if label := getattr(g, "label", None): if label := getattr(g, 'label', None):
subject += f": {label}" subject += f': {label}'
return subject return subject
class RequestFormatter(Formatter): class RequestFormatter(Formatter):
"""Custom logging formatter to include request."""
def format(self, record): def format(self, record):
"""Record includes request."""
record.request = request record.request = request
return super().format(record) return super().format(record)
def setup_error_mail(app: flask.Flask) -> None: def setup_error_mail(app):
"""Send mail to admins when an error happens.""" formatter = RequestFormatter('''
formatter = RequestFormatter(
"""
Message type: {levelname} Message type: {levelname}
Location: {pathname:s}:{lineno:d} Location: {pathname:s}:{lineno:d}
Module: {module:s} Module: {module:s}
@ -54,17 +38,13 @@ def setup_error_mail(app: flask.Flask) -> None:
Message: Message:
{message:s} {message:s}
""", ''', style='{')
style="{",
)
mail_handler = MySMTPHandler( mail_handler = MatcherSMTPHandler(app.config['SMTP_HOST'],
app.config["SMTP_HOST"], app.config['MAIL_FROM'],
app.config["MAIL_FROM"], app.config['ADMINS'],
app.config["ADMINS"], app.name + ' error',
app.name + " error", timeout=30)
timeout=30,
)
mail_handler.setFormatter(formatter) mail_handler.setFormatter(formatter)
mail_handler.setLevel(logging.ERROR) mail_handler.setLevel(logging.ERROR)

View file

@ -1,40 +1,3 @@
"""Fixtures.""" def save_error():
return {"error":{"code":"failed-save","info":"The save has failed.","messages":[{"name":"wikibase-api-failed-save","parameters":[],"html":"The save has failed."},{"name":"wikimedia-globalblocking-ipblocked-range","parameters":["[//meta.wikimedia.org/wiki/User:Jon_Kolbert Jon Kolbert]","meta.wikimedia.org","[[m:NOP|Open Proxy]]: Colocation webhost, Contact [[m:Special:Contact/stewards|stewards]] if you are affected","04:21, 8 April 2020","04:21, 8 April 2023","78.129.222.14","78.129.128.0/17"],"html":"<p><b>Your IP address is in a range that has been <a href=\"https://meta.wikimedia.org/wiki/Special:MyLanguage/Global_blocks\" class=\"extiw\" title=\"m:Special:MyLanguage/Global blocks\">blocked on all Wikimedia Foundation wikis</a>.</b>\n</p><p>The block was made by <a class=\"external text\" href=\"https://meta.wikimedia.org/wiki/User:Jon_Kolbert\">Jon Kolbert</a> (meta.wikimedia.org).\nThe reason given is <i><a href=\"https://meta.wikimedia.org/wiki/NOP\" class=\"extiw\" title=\"m:NOP\">Open Proxy</a>: Colocation webhost, Contact <a href=\"https://meta.wikimedia.org/wiki/Special:Contact/stewards\" class=\"extiw\" title=\"m:Special:Contact/stewards\">stewards</a> if you are affected</i>.\n</p>\n<ul><li>Start of block: 04:21, 8 April 2020</li>\n<li>Expiry of block: 04:21, 8 April 2023</li></ul>\n<p>Your current IP address is 78.129.222.14 and the blocked range is 78.129.128.0/17.\nPlease include all above details in any queries you make.\n</p><p>If you believe you were blocked by mistake, you can find additional information and instructions in the <a href=\"https://meta.wikimedia.org/wiki/Special:MyLanguage/No_open_proxies\" class=\"extiw\" title=\"m:Special:MyLanguage/No open proxies\">No open proxies</a> global policy.\nOtherwise, to discuss the block please <a href=\"https://meta.wikimedia.org/wiki/Steward_requests/Global\" class=\"extiw\" title=\"m:Steward requests/Global\">post a request for review on Meta-Wiki</a> or send an email to the <a 
href=\"https://meta.wikimedia.org/wiki/Special:MyLanguage/Stewards\" class=\"extiw\" title=\"m:Special:MyLanguage/Stewards\">stewards</a> <a href=\"https://meta.wikimedia.org/wiki/Special:MyLanguage/OTRS\" class=\"extiw\" title=\"m:Special:MyLanguage/OTRS\">OTRS</a> queue at <kbd>stewards@wikimedia.org</kbd> including all above details.\n</p>"},{"name":"permissionserrors","parameters":[],"html":"Permission error"}],"docref":"See https://www.wikidata.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes."},"servedby":"mw1315"}
import typing
def save_error() -> dict[str, str | dict[str, typing.Any]]:
"""Save error reply."""
return {
"error": {
"code": "failed-save",
"info": "The save has failed.",
"messages": [
{
"name": "wikibase-api-failed-save",
"parameters": [],
"html": "The save has failed.",
},
{
"name": "wikimedia-globalblocking-ipblocked-range",
"parameters": [
"[//meta.wikimedia.org/wiki/User:Jon_Kolbert Jon Kolbert]",
"meta.wikimedia.org",
"[[m:NOP|Open Proxy]]: Colocation webhost, Contact [[m:Special:Contact/stewards|stewards]] if you are affected",
"04:21, 8 April 2020",
"04:21, 8 April 2023",
"78.129.222.14",
"78.129.128.0/17",
],
"html": '<p><b>Your IP address is in a range that has been <a href="https://meta.wikimedia.org/wiki/Special:MyLanguage/Global_blocks" class="extiw" title="m:Special:MyLanguage/Global blocks">blocked on all Wikimedia Foundation wikis</a>.</b>\n</p><p>The block was made by <a class="external text" href="https://meta.wikimedia.org/wiki/User:Jon_Kolbert">Jon Kolbert</a> (meta.wikimedia.org).\nThe reason given is <i><a href="https://meta.wikimedia.org/wiki/NOP" class="extiw" title="m:NOP">Open Proxy</a>: Colocation webhost, Contact <a href="https://meta.wikimedia.org/wiki/Special:Contact/stewards" class="extiw" title="m:Special:Contact/stewards">stewards</a> if you are affected</i>.\n</p>\n<ul><li>Start of block: 04:21, 8 April 2020</li>\n<li>Expiry of block: 04:21, 8 April 2023</li></ul>\n<p>Your current IP address is 78.129.222.14 and the blocked range is 78.129.128.0/17.\nPlease include all above details in any queries you make.\n</p><p>If you believe you were blocked by mistake, you can find additional information and instructions in the <a href="https://meta.wikimedia.org/wiki/Special:MyLanguage/No_open_proxies" class="extiw" title="m:Special:MyLanguage/No open proxies">No open proxies</a> global policy.\nOtherwise, to discuss the block please <a href="https://meta.wikimedia.org/wiki/Steward_requests/Global" class="extiw" title="m:Steward requests/Global">post a request for review on Meta-Wiki</a> or send an email to the <a href="https://meta.wikimedia.org/wiki/Special:MyLanguage/Stewards" class="extiw" title="m:Special:MyLanguage/Stewards">stewards</a> <a href="https://meta.wikimedia.org/wiki/Special:MyLanguage/OTRS" class="extiw" title="m:Special:MyLanguage/OTRS">OTRS</a> queue at <kbd>stewards@wikimedia.org</kbd> including all above details.\n</p>',
},
{
"name": "permissionserrors",
"parameters": [],
"html": "Permission error",
},
],
"docref": "See https://www.wikidata.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.",
},
"servedby": "mw1315",
}

View file

@ -1,30 +1,19 @@
"""Human.""" from .model import HumanItem
from . import mediawiki, wikibase
import re import re
from . import mediawiki, wikibase re_four_digits = re.compile(r'\b\d{4}\b')
from .model import HumanItem
from .type import HumanDict
re_four_digits = re.compile(r"\b\d{4}\b") re_iso_date = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')
re_four_and_two = re.compile(r'\b(\d{2})(\d{2})[-](\d{2})\b')
re_catalog_number = re.compile(r'\b\d{4}[^\d]+\d+[^\d]+\d{4}\b')
re_iso_date = re.compile(r"\b\d{4}-\d{2}-\d{2}\b") def query(yob, yod):
re_four_and_two = re.compile(r"\b(\d{2})(\d{2})[-](\d{2})\b")
re_catalog_number = re.compile(r"\b\d{4}[^\d]+\d+[^\d]+\d{4}\b")
def query(yob: int, yod: int) -> list[HumanItem]:
"""Search for people with given birth and death years."""
if yod < yob: if yod < yob:
return [] return []
humans: list[HumanItem] = HumanItem.query.filter_by( # type: ignore return HumanItem.query.filter_by(yob=yob, yod=yod).all()
yob=yob, yod=yod
).all()
return humans
def get_items_from_name(name):
def get_items_from_name(name: str) -> list[HumanItem]:
"""Get people with name."""
found = [] found = []
m = re_four_and_two.search(name) m = re_four_and_two.search(name)
@ -43,29 +32,27 @@ def get_items_from_name(name: str) -> list[HumanItem]:
return found return found
def from_name(name):
def from_name(name: str) -> list[HumanDict]:
"""Find candidate items from name."""
candidates = get_items_from_name(name) candidates = get_items_from_name(name)
lookup = {str(item.qid): item for item in candidates} lookup = {item.qid: item for item in candidates}
qids = list(lookup.keys()) qids = list(lookup.keys())
found = [] found = []
for entity in mediawiki.get_entities_with_cache(qids, props="labels|descriptions"): for entity in mediawiki.get_entities_with_cache(qids, props='labels|descriptions'):
if "redirects" in entity or "missing" in entity: if 'redirects' in entity or 'missing' in entity:
continue continue
qid = entity["id"] qid = entity['id']
item = lookup[qid] item = lookup[qid]
i: HumanDict = { i = {
"qid": entity["id"], 'qid': entity['id'],
"year_of_birth": item.year_of_birth, 'year_of_birth': item.year_of_birth,
"year_of_death": item.year_of_death, 'year_of_death': item.year_of_death,
} }
label = wikibase.get_entity_label(entity) label = wikibase.get_entity_label(entity)
if label: if label:
i["label"] = label i['label'] = label
if "en" in entity.get("descriptions", {}): if 'en' in entity.get('descriptions', {}):
i["description"] = entity["descriptions"]["en"]["value"] i['description'] = entity['descriptions']['en']['value']
found.append(i) found.append(i)
found.sort(key=lambda i: i.get("label", "")) found.sort(key=lambda i: i.get('label', ''))
return found return found

View file

@ -1,25 +1,20 @@
"""Send email.""" from flask import current_app
import smtplib
from email.mime.text import MIMEText from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid from email.utils import formatdate, make_msgid
import smtplib
from flask import current_app def send_mail(subject, body):
def send_mail(subject: str, body: str) -> None:
"""Send email to site admin."""
app = current_app app = current_app
mail_to = app.config["ADMIN_EMAIL"] mail_to = app.config['ADMIN_EMAIL']
mail_from = app.config["MAIL_FROM"] mail_from = app.config['MAIL_FROM']
msg = MIMEText(body, "plain", "UTF-8") msg = MIMEText(body, 'plain', 'UTF-8')
msg["Subject"] = subject msg['Subject'] = subject
msg["To"] = mail_to msg['To'] = mail_to
msg["From"] = mail_from msg['From'] = mail_from
msg["Date"] = formatdate() msg['Date'] = formatdate()
msg["Message-ID"] = make_msgid() msg['Message-ID'] = make_msgid()
s = smtplib.SMTP(app.config["SMTP_HOST"]) s = smtplib.SMTP(app.config['SMTP_HOST'])
s.sendmail(mail_from, [mail_to], msg.as_string()) s.sendmail(mail_from, [mail_to], msg.as_string())
s.quit() s.quit()

View file

@ -1,149 +1,114 @@
"""Access MediaWiki API."""
import hashlib
import json
import os
import typing
import requests import requests
import os
from . import utils import json
import hashlib
from .category import Category from .category import Category
from .type import CallParams, Entity from . import utils
wikidata_url = "https://www.wikidata.org/w/api.php" wikidata_url = 'https://www.wikidata.org/w/api.php'
page_size = 50 page_size = 50
hosts = { hosts = {
"commons": "commons.wikimedia.org", 'commons': 'commons.wikimedia.org',
"enwiki": "en.wikipedia.org", 'enwiki': 'en.wikipedia.org',
"wikidata": "www.wikidata.org", 'wikidata': 'www.wikidata.org',
} }
def api_call(params, api_url=wikidata_url):
def api_call(params: CallParams, api_url: str = wikidata_url) -> requests.Response: call_params = {
"""Mediawiki API call.""" 'format': 'json',
call_params: CallParams = { 'formatversion': 2,
"format": "json",
"formatversion": 2,
**params, **params,
} }
r = requests.get(api_url, params=call_params, timeout=5) r = requests.get(api_url, params=call_params, timeout=5)
return r return r
def api_post(params, api_url=wikidata_url):
def api_post(params: CallParams, api_url: str = wikidata_url) -> requests.Response: call_params = {
call_params: CallParams = { 'format': 'json',
"format": "json", 'formatversion': 2,
"formatversion": 2,
**params, **params,
} }
r = requests.post(api_url, data=call_params, timeout=5) r = requests.post(api_url, data=call_params, timeout=5)
return r return r
def get_list(list_name, **params):
r = api_call({'action': 'query', 'list': list_name, **params})
return r.json()['query'][list_name]
def get_list(list_name: str, **params: str | int) -> list[dict[str, typing.Any]]: def get_entity(qid, redirects=False):
r = api_call({"action": "query", "list": list_name, **params}) json_data = api_call({'action': 'wbgetentities',
list_contents: list[dict[str, typing.Any]] = r.json()["query"][list_name] 'ids': qid,
return list_contents 'redirects': {True: 'yes', False: 'no'}[redirects]}).json()
def get_entity(qid: str, redirects: bool = False) -> Entity | None:
"""Get entity from wikibase."""
json_data = api_call(
{
"action": "wbgetentities",
"ids": qid,
"redirects": {True: "yes", False: "no"}[redirects],
}
).json()
try: try:
entity = list(json_data["entities"].values())[0] entity = list(json_data['entities'].values())[0]
except KeyError: except KeyError:
return None return
if "missing" not in entity: if 'missing' not in entity:
return typing.cast(Entity, entity) return entity
return None
def wbgetentities(ids, **params):
def wbgetentities(ids: typing.Iterable[str], **params: str | int) -> dict[str, Entity]:
"""Get entities from wikibase."""
if not ids: if not ids:
return {} return []
params = { params = {
"action": "wbgetentities", 'action': 'wbgetentities',
"ids": "|".join(ids), 'ids': '|'.join(ids),
**params, **params,
} }
ret: dict[str, Entity] = api_call(params).json()["entities"] return api_call(params).json()['entities']
return ret
def get_entities(ids, **params):
def get_entities(ids: typing.Iterable[str], **params: str | int) -> list[Entity]: entity_list = []
entity_list: list[Entity] = []
for cur in utils.chunk(ids, page_size): for cur in utils.chunk(ids, page_size):
entity_list += wbgetentities(cur, **params).values() entity_list += wbgetentities(cur, **params).values()
return entity_list return entity_list
def get_entities_dict(ids, **params):
def get_entities_dict(ids: str, **params: str | int) -> dict[str, Entity]:
entities = {} entities = {}
for cur in utils.chunk(ids, page_size): for cur in utils.chunk(ids, page_size):
entities.update(wbgetentities(cur, **params)) entities.update(wbgetentities(cur, **params))
return entities return entities
def get_entity_with_cache(qid, refresh=False):
def get_entity_with_cache(qid: str, refresh: bool = False) -> Entity | None: filename = f'cache/{qid}.json'
filename = f"cache/{qid}.json"
entity: Entity | None
if not refresh and os.path.exists(filename): if not refresh and os.path.exists(filename):
entity = json.load(open(filename)) entity = json.load(open(filename))
else: else:
entity = get_entity(qid, redirects=True) entity = get_entity(qid, redirects=True)
json.dump(entity, open(filename, "w"), indent=2) json.dump(entity, open(filename, 'w'), indent=2)
return entity return entity
def get_entities_with_cache(ids, **params):
md5 = hashlib.md5(' '.join(ids).encode('utf-8')).hexdigest()
def get_entities_with_cache(ids: list[str], **params: typing.Any) -> list[Entity]: filename = f'cache/entities_{md5}.json'
md5 = hashlib.md5(" ".join(ids).encode("utf-8")).hexdigest()
entity_list: list[Entity]
filename = f"cache/entities_{md5}.json"
if os.path.exists(filename): if os.path.exists(filename):
entity_list = json.load(open(filename)) entity_list = json.load(open(filename))
else: else:
entity_list = get_entities(ids, **params) entity_list = get_entities(ids, **params)
json.dump(entity_list, open(filename, "w"), indent=2) json.dump(entity_list, open(filename, 'w'), indent=2)
return entity_list return entity_list
def get_entities_dict_with_cache(all_ids, **params):
def get_entities_dict_with_cache(
all_ids: list[str], **params: typing.Any
) -> dict[str, Entity]:
entities = {} entities = {}
for ids in utils.chunk(all_ids, page_size): for ids in utils.chunk(all_ids, page_size):
md5 = hashlib.md5(" ".join(ids).encode("utf-8")).hexdigest() md5 = hashlib.md5(' '.join(ids).encode('utf-8')).hexdigest()
filename = f"cache/entities_dict_{md5}.json" filename = f'cache/entities_dict_{md5}.json'
if os.path.exists(filename): if os.path.exists(filename):
entities.update(json.load(open(filename))) entities.update(json.load(open(filename)))
continue continue
cur = wbgetentities(ids, **params) cur = wbgetentities(ids, **params)
json.dump(cur, open(filename, "w"), indent=2) json.dump(cur, open(filename, 'w'), indent=2)
entities.update(cur) entities.update(cur)
return entities return entities
def mediawiki_query(titles, params, site):
Page = dict[str, typing.Any]
def mediawiki_query(titles: list[str], params: CallParams, site: str) -> list[Page]:
"""Mediawiki query."""
if not titles: if not titles:
return [] return []
@ -151,85 +116,74 @@ def mediawiki_query(titles: list[str], params: CallParams, site: str) -> list[Pa
# FIXME: switch to utils.chunk # FIXME: switch to utils.chunk
if len(titles) > page_size: if len(titles) > page_size:
titles = titles[:page_size] titles = titles[:page_size]
base: CallParams = { base = {
"format": "json", 'format': 'json',
"formatversion": 2, 'formatversion': 2,
"action": "query", 'action': 'query',
"continue": "", 'continue': '',
"titles": "|".join(titles), 'titles': '|'.join(titles),
} }
p = base.copy() p = base.copy()
p.update(params) p.update(params)
query_url = f"https://{hosts[site]}/w/api.php" query_url = f'https://{hosts[site]}/w/api.php'
r = requests.get(query_url, params=p) r = requests.get(query_url, params=p)
expect = "application/json; charset=utf-8" expect = 'application/json; charset=utf-8'
success = True success = True
if r.status_code != 200: if r.status_code != 200:
print("status code: {r.status_code}".format(r=r)) print('status code: {r.status_code}'.format(r=r))
success = False success = False
if r.headers["content-type"] != expect: if r.headers['content-type'] != expect:
print(f'content-type: {r.headers["content-type"]}') print('content-type: {r.headers[content-type]}'.format(r=r))
success = False success = False
assert success assert success
json_reply = r.json() json_reply = r.json()
if "query" not in json_reply: if 'query' not in json_reply:
print(r.url) print(r.url)
print(r.text) print(r.text)
pages: list[Page] = json_reply["query"]["pages"] return json_reply['query']['pages']
return pages
def get_content_and_categories(title, site):
def get_content_and_categories(title: str, site: str) -> tuple[str, list[str]]: params = {
"""Get article contents and categories.""" 'prop': 'revisions|categories',
params: CallParams = { 'clshow': '!hidden',
"prop": "revisions|categories", 'cllimit': 'max',
"clshow": "!hidden", 'rvprop': 'content',
"cllimit": "max",
"rvprop": "content",
} }
pages = mediawiki_query([title], params, site) pages = mediawiki_query([title], params, site)
assert len(pages) == 1 assert len(pages) == 1
page = pages[0] page = pages[0]
return (page["revisions"][0]["content"], page.get("categories", [])) return (page['revisions'][0]['content'], page.get('categories', []))
def host_from_site(site):
def host_from_site(site: str) -> str:
"""Host from site."""
return hosts[site] return hosts[site]
def process_cats(cats, site):
return [Category(cat['title'], site) for cat in cats]
def process_cats(cats: list[dict[str, str]], site: str) -> list[Category]: def get_categories(titles, site):
"""Process categories.""" params = {
return [Category(cat["title"], site) for cat in cats] 'prop': 'categories',
'clshow': '!hidden',
'cllimit': 'max',
def get_categories(titles: list[str], site: str) -> list[tuple[str, list[Category]]]:
"""Get categories for pages with given titles."""
params: CallParams = {
"prop": "categories",
"clshow": "!hidden",
"cllimit": "max",
} }
from_wiki = mediawiki_query(titles, params, site) from_wiki = mediawiki_query(titles, params, site)
title_and_cats = [] title_and_cats = []
for i in from_wiki: for i in from_wiki:
if "categories" not in i: if 'categories' not in i:
continue continue
cats = process_cats(i["categories"], site) cats = process_cats(i['categories'], site)
if not cats: if not cats:
continue continue
title_and_cats.append((i["title"], cats)) title_and_cats.append((i['title'], cats))
return title_and_cats return title_and_cats
def get_history(title, site):
def get_history(title: str, site: str) -> list[Page]: params = {
"""Get history of a page.""" 'prop': 'revisions',
params: CallParams = { 'rvlimit': 'max',
"prop": "revisions", 'rvprop': 'timestamp|user|comment|ids|content',
"rvlimit": "max", 'rvslots': 'main',
"rvprop": "timestamp|user|comment|ids|content",
"rvslots": "main",
} }
return mediawiki_query([title], params, site) return mediawiki_query([title], params, site)

View file

@ -1,27 +0,0 @@
from .category import Category
from .mediawiki import mediawiki_query
from .type import CallParams
def process_cats(cats: list[dict[str, str]], site: str) -> list[Category]:
"""Process categories."""
return [Category(cat["title"], site) for cat in cats]
def get_categories(titles: list[str], site: str) -> list[tuple[str, list[Category]]]:
"""Get categories for pages with given titles."""
params: CallParams = {
"prop": "categories",
"clshow": "!hidden",
"cllimit": "max",
}
from_wiki = mediawiki_query(titles, params, site)
title_and_cats = []
for i in from_wiki:
if "categories" not in i:
continue
cats = process_cats(i["categories"], site)
if not cats:
continue
title_and_cats.append((i["title"], cats))
return title_and_cats

View file

@ -1,181 +1,163 @@
import typing from sqlalchemy.ext.declarative import declarative_base
from datetime import timedelta from .database import session, now_utc
from . import wikibase, utils
from sqlalchemy.schema import Column, ForeignKey
from sqlalchemy.types import Integer, String, DateTime, Boolean
from sqlalchemy.orm import column_property, relationship, synonym
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.sql.expression import cast
from sqlalchemy.dialects import postgresql
from urllib.parse import quote from urllib.parse import quote
from sqlalchemy.dialects import postgresql
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import column_property, relationship, synonym
from sqlalchemy.schema import Column, ForeignKey
from sqlalchemy.sql.expression import cast
from sqlalchemy.types import Boolean, DateTime, Integer, String
from . import utils, wikibase
from .database import now_utc, session
from .type import Claims, Entity
Base = declarative_base() Base = declarative_base()
Base.query = session.query_property() # type: ignore Base.query = session.query_property()
class User(Base): class User(Base):
__tablename__ = "user" __tablename__ = 'user'
id = Column(Integer, primary_key=True, autoincrement=False) id = Column(Integer, primary_key=True, autoincrement=False)
username = Column(String, unique=True) username = Column(String, unique=True)
options = Column(postgresql.JSON) options = Column(postgresql.JSON)
first_seen = Column(DateTime, default=now_utc()) first_seen = Column(DateTime, default=now_utc())
is_admin = Column(Boolean, default=False) is_admin = Column(Boolean, default=False)
class DepictsItem(Base): class DepictsItem(Base):
__tablename__ = "depicts" __tablename__ = 'depicts'
item_id = Column(Integer, primary_key=True, autoincrement=False) item_id = Column(Integer, primary_key=True, autoincrement=False)
label = Column(String) label = Column(String)
description = Column(String) description = Column(String)
count = Column(Integer) count = Column(Integer)
qid = column_property("Q" + cast(item_id, String)) qid = column_property('Q' + cast(item_id, String))
db_alt_labels = relationship( db_alt_labels = relationship('DepictsItemAltLabel',
"DepictsItemAltLabel",
collection_class=set, collection_class=set,
cascade="save-update, merge, delete, delete-orphan", cascade='save-update, merge, delete, delete-orphan',
backref="item", backref='item')
) alt_labels = association_proxy('db_alt_labels', 'alt_label')
alt_labels = association_proxy("db_alt_labels", "alt_label") # type: ignore
class DepictsItemAltLabel(Base): class DepictsItemAltLabel(Base):
__tablename__ = "depicts_alt_label" __tablename__ = 'depicts_alt_label'
item_id = Column( item_id = Column(Integer,
Integer, ForeignKey("depicts.item_id"), primary_key=True, autoincrement=False ForeignKey('depicts.item_id'),
) primary_key=True,
autoincrement=False)
alt_label = Column(String, primary_key=True) alt_label = Column(String, primary_key=True)
def __init__(self, alt_label: str) -> None: def __init__(self, alt_label):
self.alt_label = alt_label self.alt_label = alt_label
class Item(Base): class Item(Base):
__tablename__ = "item" __tablename__ = 'item'
item_id = Column(Integer, primary_key=True, autoincrement=False) item_id = Column(Integer, primary_key=True, autoincrement=False)
# label = Column(String) # column removed 2019-12-18 # label = Column(String) # column removed 2019-12-18
entity = Column(postgresql.JSON) entity = Column(postgresql.JSON)
lastrevid = Column(Integer, nullable=True, unique=True) lastrevid = Column(Integer, nullable=True, unique=True)
modified = Column(DateTime, nullable=True) modified = Column(DateTime, nullable=True)
is_artwork = Column(Boolean, nullable=False, default=False) is_artwork = Column(Boolean, nullable=False, default=False)
qid = column_property("Q" + cast(item_id, String)) qid = column_property('Q' + cast(item_id, String))
@property def image_count(self):
def claims(self) -> Claims: p18 = self.entity['claims'].get('P18')
return typing.cast(Entity, self.entity)["claims"]
def image_count(self) -> int:
p18 = self.claims.get("P18")
return len(p18) if p18 else 0 return len(p18) if p18 else 0
def image_filename(self) -> str | None: def image_filename(self):
p18 = self.claims.get("P18") p18 = self.entity['claims'].get('P18')
if not p18: if not p18:
return None return
try: try:
filename: str = p18[0]["mainsnak"]["datavalue"]["value"] return p18[0]['mainsnak']['datavalue']['value']
return filename
except KeyError: except KeyError:
return None return
@property @property
def label(self) -> str | None: def label(self):
return wikibase.get_entity_label(typing.cast(Entity, self.entity)) return wikibase.get_entity_label(self.entity)
@property @property
def artist(self) -> str | None: def artist(self):
v = wikibase.first_datavalue(typing.cast(Entity, self.entity), "P170") v = wikibase.first_datavalue(self.entity, 'P170')
assert isinstance(v, dict) if not v:
return v["id"] if v else None return
return v['id']
@property @property
def depicts(self) -> list[str]: def depicts(self):
return self.linked_qids("P180") return self.linked_qids('P180')
@property @property
def instance_of(self) -> list[str]: def instance_of(self):
return self.linked_qids("P31") return self.linked_qids('P31')
def linked_qids(self, prop: str) -> list[str]: def linked_qids(self, prop):
values = typing.cast(Entity, self.entity)["claims"].get(prop) or [] values = self.entity['claims'].get(prop) or []
return [ return [v['mainsnak']['datavalue']['value']['id']
v["mainsnak"]["datavalue"]["value"]["id"]
for v in values for v in values
if "datavalue" in v["mainsnak"] if 'datavalue' in v['mainsnak']]
]
@property @property
def date(self) -> str | None: def date(self):
v = wikibase.first_datavalue(typing.cast(Entity, self.entity), "P571") v = wikibase.first_datavalue(self.entity, 'P571')
assert isinstance(v, dict) if v:
return utils.format_time(v["time"], v["precision"]) if v else None return utils.format_time(v['time'], v['precision'])
class Triple(Base): class Triple(Base):
__tablename__ = "triple" __tablename__ = 'triple'
subject_id = Column(Integer, ForeignKey("item.item_id"), primary_key=True) subject_id = Column(Integer,
ForeignKey('item.item_id'),
primary_key=True)
predicate_id = Column(Integer, primary_key=True, index=True) predicate_id = Column(Integer, primary_key=True, index=True)
object_id = Column(Integer, primary_key=True, index=True) object_id = Column(Integer, primary_key=True, index=True)
subject = relationship("Item", backref="triples") subject = relationship('Item', backref='triples')
class HumanItem(Base): class HumanItem(Base):
__tablename__ = "human" __tablename__ = 'human'
item_id = Column(Integer, primary_key=True, autoincrement=False) item_id = Column(Integer, primary_key=True, autoincrement=False)
year_of_birth = Column(Integer, nullable=False) year_of_birth = Column(Integer, nullable=False)
year_of_death = Column(Integer, nullable=False) year_of_death = Column(Integer, nullable=False)
age_at_death = column_property(year_of_death - year_of_birth) age_at_death = column_property(year_of_death - year_of_birth)
qid = column_property("Q" + cast(item_id, String)) qid = column_property('Q' + cast(item_id, String))
yob = synonym("year_of_birth")
yod = synonym("year_of_death")
yob = synonym('year_of_birth')
yod = synonym('year_of_death')
class Language(Base): class Language(Base):
__tablename__ = "language" __tablename__ = 'language'
item_id = Column(Integer, primary_key=True, autoincrement=False) item_id = Column(Integer, primary_key=True, autoincrement=False)
wikimedia_language_code = Column(String, index=True, unique=True) wikimedia_language_code = Column(String, index=True, unique=True)
en_label = Column(String, nullable=False) en_label = Column(String, nullable=False)
code = synonym("wikimedia_language_code") code = synonym('wikimedia_language_code')
label = synonym("en_label") label = synonym('en_label')
@classmethod @classmethod
def get_by_code(cls, code: str) -> "Language": def get_by_code(cls, code):
return cls.query.filter_by(wikimedia_language_code=code).one() # type: ignore return cls.query.filter_by(wikimedia_language_code=code).one()
class Edit(Base): class Edit(Base):
__tablename__ = "edit" __tablename__ = 'edit'
username = Column(String, primary_key=True) username = Column(String, primary_key=True)
artwork_id = Column(Integer, ForeignKey("item.item_id"), primary_key=True) artwork_id = Column(Integer, ForeignKey('item.item_id'), primary_key=True)
depicts_id = Column(Integer, ForeignKey("depicts.item_id"), primary_key=True) depicts_id = Column(Integer, ForeignKey('depicts.item_id'), primary_key=True)
timestamp = Column(DateTime, default=now_utc()) timestamp = Column(DateTime, default=now_utc())
lastrevid = Column(Integer, nullable=True) lastrevid = Column(Integer, nullable=True)
artwork_qid = column_property("Q" + cast(artwork_id, String)) artwork_qid = column_property('Q' + cast(artwork_id, String))
depicts_qid = column_property("Q" + cast(depicts_id, String)) depicts_qid = column_property('Q' + cast(depicts_id, String))
artwork = relationship("Item") artwork = relationship('Item')
depicts = relationship("DepictsItem") depicts = relationship('DepictsItem')
@property @property
def url_norm_username(self) -> str: def url_norm_username(self):
return quote(self.username.replace(" ", "_")) return quote(self.username.replace(' ', '_'))
@property @property
def user_wikidata_url(self) -> str: def user_wikidata_url(self):
return "https://www.wikidata.org/wiki/User:" + self.url_norm_username return 'https://www.wikidata.org/wiki/User:' + self.url_norm_username
class WikidataQuery(Base): class WikidataQuery(Base):
__tablename__ = "wikidata_query" __tablename__ = 'wikidata_query'
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
start_time = Column(DateTime) start_time = Column(DateTime)
end_time = Column(DateTime) end_time = Column(DateTime)
@ -189,27 +171,27 @@ class WikidataQuery(Base):
endpoint = Column(String) endpoint = Column(String)
@property @property
def duration(self) -> timedelta: def duration(self):
assert self.start_time and self.end_time if self.end_time:
return self.end_time - self.start_time return self.end_time - self.start_time
@property @property
def display_seconds(self) -> str: def display_seconds(self):
return f"{self.duration.total_seconds():.1f}" return f'{self.duration.total_seconds():.1f}'
@property @property
def template(self) -> str | None: def template(self):
if not self.query_template: if not self.query_template:
return None return
t = self.query_template t = self.query_template
if t.startswith("query/"): if t.startswith('query/'):
t = t[6:] t = t[6:]
if t.endswith(".sparql"): if t.endswith('.sparql'):
t = t[:-7] t = t[:-7]
return t return t
@property @property
def bad(self) -> bool: def bad(self):
return bool(self.status_code and self.status_code != 200) return self.status_code and self.status_code != 200

View file

@ -1,57 +1,42 @@
"""Museo del Prado (Q160112) - Spanish national art museum in Madrid, Spain.""" import requests
import lxml.html
import os import os
import re import re
import typing
import lxml.html re_url = re.compile(r'www.museodelprado.es/(.+)$')
import requests
from .type import CatalogDict, EmptyDict def get_html(url):
catalog_id = re_url.search(url).group(1).replace('/', '_')
re_url = re.compile(r"www.museodelprado.es/(.+)$") filename = f'cache/museodelprado_{catalog_id}.html'
def get_html(url: str) -> str:
"""Get HTML from web catalog."""
assert (m := re_url.search(url))
catalog_id = m.group(1).replace("/", "_")
filename = f"cache/museodelprado_{catalog_id}.html"
if os.path.exists(filename): if os.path.exists(filename):
html = open(filename).read() html = open(filename).read()
else: else:
r = requests.get(url) r = requests.get(url)
html = r.text html = r.text
open(filename, "w").write(html) open(filename, 'w').write(html)
return html return html
def parse_html(html):
def parse_html(html: str) -> CatalogDict | EmptyDict:
"""Parse HTML and extract keywords."""
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
keywords = [] keywords = []
for h2 in root.findall(".//h2"): for h2 in root.findall('.//h2'):
if not h2.text or h2.text.strip() != "Displayed objects": if not h2.text or h2.text.strip() != 'Displayed objects':
continue continue
div = h2.getparent() div = h2.getparent()
assert div is not None for keyword_span in div.findall('.//span[@property]'):
for keyword_span in div.findall(".//span[@property]"):
assert isinstance(keyword_span.text, str)
keywords.append(keyword_span.text) keywords.append(keyword_span.text)
if not keywords: if not keywords:
return typing.cast(EmptyDict, {}) return {}
return { return {
"institution": "Museo del Prado", 'institution': 'Museo del Prado',
"keywords": keywords, 'keywords': keywords,
} }
def get_catalog(url):
def get_catalog(url: str) -> CatalogDict | EmptyDict:
"""Get catalog web page and extract keywords."""
return parse_html(get_html(url)) return parse_html(get_html(url))

View file

@ -1,52 +1,37 @@
"""National Portrait Gallery (Q238587) - art museum in London, England.""" import requests
import lxml.html
import os import os
import re import re
import lxml.html re_url = re.compile(r'www.npg.org.uk/collections/search/(.+)$')
import requests
from .type import CatalogDict def get_html(url):
catalog_id = re_url.search(url).group(1).replace('/', '_')
re_url = re.compile(r"www.npg.org.uk/collections/search/(.+)$") filename = f'cache/npg_{catalog_id}.html'
def get_html(url: str) -> str:
"""Get HTML from web catalog."""
assert (m := re_url.search(url))
catalog_id = m.group(1).replace("/", "_")
filename = f"cache/npg_{catalog_id}.html"
if os.path.exists(filename): if os.path.exists(filename):
html = open(filename).read() html = open(filename).read()
else: else:
r = requests.get(url) r = requests.get(url)
html = r.text html = r.text
open(filename, "w").write(html) open(filename, 'w').write(html)
return html return html
def parse_html(html):
def parse_html(html: str) -> CatalogDict:
"""Parse HTML and extract keywords."""
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
keywords = [ keywords = [a.text for a in root.findall('.//a[@href]')
a.text if 'subj=' in a.get('href')]
for a in root.findall(".//a[@href]")
if "subj=" in a.get("href") # type: ignore
]
skip = {"oil", "painting"} skip = {'oil', 'painting'}
keywords = [k for k in keywords if k.lower() not in skip] # type: ignore keywords = [k for k in keywords if k.lower() not in skip]
return { return {
"institution": "National Portrait Gallery", 'institution': 'National Portrait Gallery',
"keywords": keywords, # type: ignore 'keywords': keywords,
} }
def get_catalog(url):
def get_catalog(url: str) -> CatalogDict:
"""Get catalog web page and extract keywords."""
return parse_html(get_html(url)) return parse_html(get_html(url))

View file

@ -1,80 +1,47 @@
"""Pagination."""
import typing
from math import ceil from math import ceil
from flask import request, url_for
from flask import Flask, request, url_for
T = typing.TypeVar("T")
class Pagination(object): class Pagination(object):
"""Pagination.""" def __init__(self, page, per_page, total_count):
page: int
per_page: int
total_count: int
def __init__(self, page: int, per_page: int, total_count: int) -> None:
"""Init."""
self.page = page self.page = page
self.per_page = per_page self.per_page = per_page
self.total_count = total_count self.total_count = total_count
@property @property
def pages(self) -> int: def pages(self):
"""Page count."""
return int(ceil(self.total_count / float(self.per_page))) return int(ceil(self.total_count / float(self.per_page)))
@property @property
def has_prev(self) -> bool: def has_prev(self):
"""Has previous page."""
return self.page > 1 return self.page > 1
@property @property
def has_next(self) -> bool: def has_next(self):
"""Has next page."""
return self.page < self.pages return self.page < self.pages
def slice(self, items: list[T]) -> list[T]: def slice(self, items):
"""Slice of items for the current page.""" first = ((self.page - 1) * self.per_page)
first = (self.page - 1) * self.per_page
last = self.page * self.per_page last = self.page * self.per_page
return items[first:last] return items[first:last]
def iter_pages( def iter_pages(self, left_edge=2, left_current=6,
self, right_current=6, right_edge=2):
left_edge: int = 2,
left_current: int = 6,
right_current: int = 6,
right_edge: int = 2,
) -> typing.Iterator[int | None]:
"""Iterate page numbers."""
last = 0 last = 0
for num in range(1, self.pages + 1): for num in range(1, self.pages + 1):
if ( if num <= left_edge or \
num <= left_edge (num > self.page - left_current - 1 and \
or ( num < self.page + right_current) or \
num > self.page - left_current - 1 num > self.pages - right_edge:
and num < self.page + right_current
)
or num > self.pages - right_edge
):
if last + 1 != num: if last + 1 != num:
yield None yield None
yield num yield num
last = num last = num
def url_for_other_page(page):
def url_for_other_page(page: int) -> str:
"""Make URL for other page."""
assert request.view_args is not None and request.endpoint
args = request.view_args.copy() args = request.view_args.copy()
args.update(request.args) args.update(request.args)
args["page"] = page args['page'] = page
return url_for(request.endpoint, **args) return url_for(request.endpoint, **args)
def init_pager(app):
def init_pager(app: Flask) -> None: app.jinja_env.globals['url_for_other_page'] = url_for_other_page
"""Initialise pager."""
app.jinja_env.globals["url_for_other_page"] = url_for_other_page

View file

@ -1,25 +1,16 @@
"""Enable Python requests that ignores bad HTTPS certificates."""
import typing
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context from requests.packages.urllib3.util.ssl_ import create_urllib3_context
CIPHERS = "DEFAULT@SECLEVEL=1" CIPHERS = 'DEFAULT@SECLEVEL=1'
class HTTPSAdapter(HTTPAdapter): class HTTPSAdapter(HTTPAdapter):
"""HTTPS Adapter subclass.""" def init_poolmanager(self, *args, **kwargs):
def init_poolmanager(self, *args: typing.Any, **kwargs: typing.Any) -> None:
"""Init pool manager."""
context = create_urllib3_context(ciphers=CIPHERS) context = create_urllib3_context(ciphers=CIPHERS)
kwargs["ssl_context"] = context kwargs['ssl_context'] = context
return super().init_poolmanager(*args, **kwargs) # type: ignore return super().init_poolmanager(*args, **kwargs)
def get(*args, **kwargs):
def get(*args: typing.Any, **kwargs: typing.Any) -> requests.Response:
s = requests.Session() s = requests.Session()
s.mount("https://", HTTPSAdapter()) s.mount('https://', HTTPSAdapter())
return s.get(*args, **kwargs, verify=False) return s.get(*args, **kwargs, verify=False)

View file

@ -1,51 +1,34 @@
"""Rijksmuseuma (Q190804) - museum in Amsterdam, Netherlands.""" import requests
import lxml.html
import os import os
import re import re
import lxml.html re_url = re.compile(r'^https://www.rijksmuseum.nl/(?:nl/collectie|en/collection)/([^/]+)$')
import requests
from .type import CatalogDict def get_html(catalog_id):
filename = f'cache/rijksmuseum_{catalog_id}.html'
re_url = re.compile( en_url = 'https://www.rijksmuseum.nl/en/collection/' + catalog_id
r"^https://www.rijksmuseum.nl/(?:nl/collectie|en/collection)/([^/]+)$"
)
def get_html(catalog_id: str) -> str:
"""Get HTML from web catalog."""
filename = f"cache/rijksmuseum_{catalog_id}.html"
en_url = "https://www.rijksmuseum.nl/en/collection/" + catalog_id
if os.path.exists(filename): if os.path.exists(filename):
html = open(filename).read() html = open(filename).read()
else: else:
r = requests.get(en_url) r = requests.get(en_url)
html = r.text html = r.text
open(filename, "w").write(html) open(filename, 'w').write(html)
return html return html
def parse_html(html):
def parse_html(html: str) -> CatalogDict:
"""Parse HTML and extract keywords."""
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
keywords = [ keywords = [a.text for a in root.findall('.//a[@href]')
a.text if 'f.classification.iconClassDescription.sort' in a.get('href')]
for a in root.findall(".//a[@href]")
if "f.classification.iconClassDescription.sort" in a.get("href") # type: ignore
]
return { return {
"institution": "Rijksmuseum", 'institution': 'Rijksmuseum',
"keywords": keywords, # type: ignore 'keywords': keywords,
} }
def get_catalog(url):
def get_catalog(url: str) -> CatalogDict: catalog_id = re_url.search(url).group(1)
"""Get catalog web page and extract keywords."""
assert (m := re_url.search(url))
catalog_id = m.group(1)
return parse_html(get_html(catalog_id)) return parse_html(get_html(catalog_id))

View file

@ -1,59 +1,45 @@
"""Smithsonian American Art Museum (Q1192305) - fine arts museum in Washington, D.C.""" import requests
import lxml.html
import json import json
import os import os
import typing
import lxml.html def get_html(saam_id):
import requests filename = f'cache/saam_{saam_id}.html'
url = 'http://americanart.si.edu/collections/search/artwork/'
from .type import CatalogDict, EmptyDict
def get_html(saam_id: str | int) -> str:
"""Get HTML from web catalog."""
filename = f"cache/saam_{saam_id}.html"
url = "http://americanart.si.edu/collections/search/artwork/"
if os.path.exists(filename): if os.path.exists(filename):
html = open(filename).read() html = open(filename).read()
else: else:
r = requests.get(url, params={"id": saam_id}) r = requests.get(url, params={'id': saam_id})
html = r.text html = r.text
open(filename, "w").write(html) open(filename, 'w').write(html)
return html return html
def parse_html(html):
def parse_html(html: str) -> dict[str, typing.Any] | None:
"""Parse HTML and extract keywords."""
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
ld_json = root.findtext('.//script[@type="application/ld+json"]') ld_json = root.findtext('.//script[@type="application/ld+json"]')
if ld_json is None: if ld_json is None:
return {"ld": {}, "keywords": []} return {'ld': {}, 'keywords': []}
ld = json.loads(ld_json) ld = json.loads(ld_json)
ul = root.find('.//ul[@class="ontology-list"]') ul = root.find('.//ul[@class="ontology-list"]')
if ul is None: if ul is None:
return None return
assert ul.tag == "ul" assert ul.tag == 'ul'
keywords = [li.text for li in ul] keywords = [li.text for li in ul]
return {"ld": ld, "keywords": keywords} return {'ld': ld, 'keywords': keywords}
def get_catalog(saam_id: int | str) -> CatalogDict | EmptyDict:
    """Build a catalog record for a SAAM artwork.

    Returns an empty dict when the page yields neither keywords nor a
    description, matching the pre-refactor behavior.
    """
    data = parse_html(get_html(saam_id))
    empty: EmptyDict = {}
    if not data:
        return empty
    # Bug fix: the previous version seeded ret with "keywords": [], which made
    # the final '"keywords" in ret' guard always true, so the empty-result
    # fallback below was dead code. Only add keys when there is real data.
    ret: CatalogDict = {"institution": "Smithsonian American Art Museum"}
    if data["keywords"]:
        ret["keywords"] = data["keywords"]
    if "description" in data["ld"]:
        ret["description"] = data["ld"]["description"]
    return ret if "description" in ret or "keywords" in ret else empty

View file

@ -1,50 +0,0 @@
import typing
from typing import Required, TypedDict

# Shape of a Wikidata entity's "claims" mapping: property ID -> statements.
Claims = dict[str, list[dict[str, typing.Any]]]
# Shape of a Wikidata entity's "sitelinks" mapping: site code -> link record.
Sitelinks = dict[str, dict[str, typing.Any]]


class EmptyDict(TypedDict):
    """A TypedDict with no allowed keys: the canonical empty result."""

    pass


class Entity(TypedDict, total=False):
    """Wikidata Entity record, as returned by the Wikidata API."""

    id: str  # e.g. "Q12345"
    labels: dict[str, typing.Any]
    descriptions: dict[str, typing.Any]
    claims: Claims
    lastrevid: int
    sitelinks: Sitelinks
    modified: str
    redirects: dict[str, typing.Any]
    aliases: dict[str, list[dict[str, typing.Any]]]


class CatalogDict(TypedDict, total=False):
    """Catalog record scraped from an institution's web site."""

    institution: str
    url: str
    ids: set[str]  # catalog property IDs found on the entity
    detail: list[dict[str, str]]
    description: str
    keywords: list[str]


# Parameters passed to a MediaWiki/Wikidata API call.
CallParams = dict[str, str | int]


class HumanDict(TypedDict, total=False):
    """Human (person depicted in an artwork)."""

    qid: Required[str]
    year_of_birth: Required[int]
    year_of_death: Required[int]
    label: str
    description: str

View file

@ -1,121 +1,91 @@
"""Various utility functions."""
import typing
import urllib.parse
from datetime import datetime
from itertools import islice
import inflect
from flask import request from flask import request
from itertools import islice
from datetime import datetime
import urllib.parse
import inflect
hosts = { hosts = {
"commons": "commons.wikimedia.org", 'commons': 'commons.wikimedia.org',
"enwiki": "en.wikipedia.org", 'enwiki': 'en.wikipedia.org',
"wikidata": "www.wikidata.org", 'wikidata': 'www.wikidata.org',
} }
engine = inflect.engine() engine = inflect.engine()
skip_names = {"National Gallery"} skip_names = {
'National Gallery'
}
T = typing.TypeVar("T") def ordinal(n):
return "%d%s" % (n, 'tsnrhtdd'[(n / 10 % 10 != 1) * (n % 10 < 4) * n % 10::4])
def chunk(it, size):
def ordinal(n: int) -> str:
    """Return *n* with its English ordinal suffix, e.g. 1 -> '1st', 11 -> '11th'.

    The slice trick picks the suffix from "tsnrhtdd": the teens (11-13) always
    take 'th'; otherwise the last digit selects 'st'/'nd'/'rd'/'th'.
    """
    # Bug fix: the original used float division (n / 10 % 10 != 1), so the
    # "is it a teen?" test misfired and e.g. ordinal(11) returned "11st".
    return "%d%s" % (n, "tsnrhtdd"[(n // 10 % 10 != 1) * (n % 10 < 4) * n % 10 :: 4])
def chunk(it: typing.Iterable[T], size: int) -> typing.Iterator[tuple[T, ...]]:
    """Yield successive tuples of at most *size* items from *it*."""
    iterator = iter(it)
    while piece := tuple(islice(iterator, size)):
        yield piece
def drop_start(s: str, start: str) -> str:
    """Strip the prefix *start* from *s*; fail if the prefix is absent."""
    assert s.startswith(start)
    return s.removeprefix(start)
def drop_category_ns(s: str) -> str:
    """Strip a leading 'Category:' namespace from *s*; fail if it is absent."""
    prefix = "Category:"
    assert s.startswith(prefix)
    return s[len(prefix):]
def word_contains_letter(word):
def parse_sitelink(s: str, start: str) -> str:
    """Extract a human-readable page title from a sitelink path.

    Drops the *start* prefix, URL-decodes, and turns underscores into spaces.
    """
    assert s.startswith(start)
    return urllib.parse.unquote(s[len(start):]).replace("_", " ")
def word_contains_letter(word: str) -> bool:
    """Return True if any character in *word* is alphabetic."""
    for ch in word:
        if ch.isalpha():
            return True
    return False
def also_singular(name: str) -> list[str]:
    """Return name variants: singular forms plus gendered keywords.

    Adds 'girl'/'boy'/'woman'/'man' when the variants mention them, then
    filters out names in the skip list.
    """
    names = also_singular_main(name)
    extra: list[str] = []
    for variant in names:
        words = set(variant.lower().split())
        extra.extend(w for w in ("girl", "boy") if w in words)
        if words & {"female", "females", "women"}:
            extra.append("woman")
        if words & {"male", "males", "men"}:
            extra.append("man")
    return [n for n in names + extra if n not in skip_names]
def also_singular_main(name: str) -> list[str]:
    """Return [name] plus its singular form, or just [name] if not plural.

    given a singular name return a list of both the plural and singular versions
    just return the name if it isn't singular
    """
    singular = engine.singular_noun(name.strip("|"))
    if not singular:
        return [name]
    lower_name, lower_singular = name.lower(), singular.lower()
    # Treat some pairs as equivalent: identical forms, 'paintings'/'painting'
    # variants, and the false singular 'venus' -> 'venu'.
    equivalent = (
        lower_name == lower_singular
        or lower_name.replace("paintings", "") == lower_singular.replace("painting", "")
        or (lower_name == "venus" and lower_singular == "venu")
    )
    return [name] if equivalent else [name, singular]
def wiki_url(title: str, site: str, ns: str | None = None) -> str:
    """Build a MediaWiki URL for *title* on *site*, optionally in namespace *ns*."""
    assert title
    host = hosts[site]
    prefix = f"{ns}:" if ns else ""
    # MediaWiki titles always start with an uppercase letter.
    page = title[0].upper() + title[1:] if title[0].islower() else title
    return f"https://{host}/wiki/" + prefix + urllib.parse.quote(page.replace(" ", "_"))
def get_int_arg(name: str) -> int | None:
    """Return request arg *name* as an int, or None if absent or non-numeric."""
    value = request.args.get(name)
    if value is None or not value.isdigit():
        return None
    return int(value)
def format_time(time_value, precision):
def format_time(time_value: str, precision: int) -> str:
"""Format time with given precision."""
# FIXME handle dates like '1965-04-00T00:00:00Z' # FIXME handle dates like '1965-04-00T00:00:00Z'
# FIXME handle BC dates properly, "120 B.C." instead of "-120" # FIXME handle BC dates properly, "120 B.C." instead of "-120"
year = None year = None
if "-00" in time_value: if '-00' in time_value:
# can't be represented as python datetime # can't be represented as python datetime
year = int(time_value[: time_value.find("-", 1)]) year = int(time_value[:time_value.find('-', 1)])
else: else:
try: try:
t = datetime.strptime(time_value[1:], "%Y-%m-%dT%H:%M:%SZ") t = datetime.strptime(time_value[1:], "%Y-%m-%dT%H:%M:%SZ")
@ -126,10 +96,10 @@ def format_time(time_value: str, precision: int) -> str:
if precision == 9: if precision == 9:
return str(year) return str(year)
if precision == 8: if precision == 8:
return f"{year}s" return f'{year}s'
if precision == 7: if precision == 7:
return f"{ordinal((year // 100) + 1)} century" return f'{ordinal((year // 100) + 1)} century'
if precision == 6: if precision == 6:
return f"{ordinal((year // 1000) + 1)} millennium" return f'{ordinal((year // 1000) + 1)} millennium'
return time_value return time_value

View file

@ -1,345 +1,147 @@
import hashlib from depicts import (wikibase, relaxed_ssl, saam, dia, rijksmuseum, npg,
import os.path museodelprado, barnesfoundation)
import lxml.html
import requests import requests
import requests.exceptions import requests.exceptions
import lxml.html
import os.path
import hashlib
from depicts import ( user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0'
barnesfoundation,
dia,
museodelprado,
npg,
relaxed_ssl,
rijksmuseum,
saam,
wikibase,
)
from .type import CatalogDict, Entity
user_agent = "Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0"
table = { table = {
"P347": ("Joconde ID", "https://www.pop.culture.gouv.fr/notice/joconde/$1"), 'P347': ('Joconde ID', 'https://www.pop.culture.gouv.fr/notice/joconde/$1'),
"P350": ("RKDimages ID", "https://rkd.nl/explore/images/$1"), 'P350': ('RKDimages ID', 'https://rkd.nl/explore/images/$1'),
"P1212": ( 'P1212': ('Atlas ID', 'http://cartelen.louvre.fr/cartelen/visite?srv=car_not_frame&idNotice=$1'),
"Atlas ID", 'P1428': ('Lost Art ID', 'http://www.lostart.de/EN/Verlust/$1'),
"http://cartelen.louvre.fr/cartelen/visite?srv=car_not_frame&idNotice=$1", 'P1679': ('Art UK artwork ID', 'https://artuk.org/discover/artworks/$1'),
), 'P1726': ('Florentine musea Inventario 1890 ID', 'http://www.polomuseale.firenze.it/inv1890/scheda.asp?position=1&ninv=$1'),
"P1428": ("Lost Art ID", "http://www.lostart.de/EN/Verlust/$1"), 'P2014': ('Museum of Modern Art work ID', 'http://www.moma.org/collection/works/$1'),
"P1679": ("Art UK artwork ID", "https://artuk.org/discover/artworks/$1"), 'P2092': ('Bildindex der Kunst und Architektur ID', 'https://www.bildindex.de/document/obj$1'),
"P1726": ( 'P2108': ('Kunstindeks Danmark artwork ID', 'https://www.kulturarv.dk/kid/VisVaerk.do?vaerkId=$1'),
"Florentine musea Inventario 1890 ID", 'P2242': ('Florentine musea catalogue ID', 'http://www.polomuseale.firenze.it/catalogo/scheda.asp?nctn=$1&value=1'),
"http://www.polomuseale.firenze.it/inv1890/scheda.asp?position=1&ninv=$1", 'P2282': ('Groeningemuseum work PID', 'http://groeningemuseum.be/collection/work/id/$1'),
), 'P2344': ('AGORHA work ID', 'http://www.purl.org/inha/agorha/003/$1'),
"P2014": ( 'P2511': ('MSK Gent work PID', 'http://mskgent.be/collection/work/id/$1'),
"Museum of Modern Art work ID", 'P2539': ('Nationalmuseum Sweden artwork ID', 'http://collection.nationalmuseum.se/eMuseumPlus?service=ExternalInterface&module=collection&objectId=$1&viewType=detailView'),
"http://www.moma.org/collection/works/$1", 'P2582': ('J. Paul Getty Museum object ID', 'http://www.getty.edu/art/collection/objects/$1'),
), 'P3272': ('Zeri image ID', 'http://catalogo.fondazionezeri.unibo.it/scheda/opera/$1/'),
"P2092": ( 'P3293': ('BALaT object ID', 'http://balat.kikirpa.be/object/$1'),
"Bildindex der Kunst und Architektur ID", 'P3386': ('French Sculpture Census work ID', 'https://frenchsculpture.org/en/sculpture/$1'),
"https://www.bildindex.de/document/obj$1", 'P3467': ('Inventario Sculture - Polo Museale Fiorentino', 'http://www.polomuseale.firenze.it/invSculture/scheda.asp?position=1&ninv=$1'),
), 'P3504': ('Florentine Inventario Palatina art ID', 'http://www.polomuseale.firenze.it/invpalatina/scheda.asp?position=1&ninv=$1'),
"P2108": ( 'P3634': ('The Met object ID', 'http://www.metmuseum.org/art/collection/search/$1'),
"Kunstindeks Danmark artwork ID", 'P3711': ('Vanderkrogt.net Statues ID', 'http://vanderkrogt.net/statues/object.php?record=$1'),
"https://www.kulturarv.dk/kid/VisVaerk.do?vaerkId=$1", 'P3855': ('LombardiaBeniCulturali artwork ID', 'http://www.lombardiabeniculturali.it/opere-arte/schede/$1/'),
), 'P3929': ('V&A item ID', 'http://collections.vam.ac.uk/item/$1'),
"P2242": ( 'P4144': ('Athenaeum artwork ID', 'http://www.the-athenaeum.org/art/detail.php?id=$1'),
"Florentine musea catalogue ID", 'P4257': ('National Museums of Japan e-museum ID', 'http://www.emuseum.jp/detail/$1'),
"http://www.polomuseale.firenze.it/catalogo/scheda.asp?nctn=$1&value=1", 'P4373': ('National Trust Collections ID', 'http://www.nationaltrustcollections.org.uk/object/$1'),
), 'P4380': ('Sandrart.net artwork ID', 'http://ta.sandrart.net/-artwork-$1'),
"P2282": ( 'P4399': ('Enciclopédia Itaú Cultural ID', 'http://enciclopedia.itaucultural.org.br/$1'),
"Groeningemuseum work PID", 'P4525': ('MuIS object ID', 'http://opendata.muis.ee/object/$1'),
"http://groeningemuseum.be/collection/work/id/$1", 'P4564': ('Art Museum of Estonia artwork ID', 'https://digikogu.ekm.ee/oid-$1'),
), 'P4582': ('Kulturelles Erbe Köln object ID', 'https://www.kulturelles-erbe-koeln.de/documents/obj/$1'),
"P2344": ("AGORHA work ID", "http://www.purl.org/inha/agorha/003/$1"), 'P4610': ('ARTIC artwork ID', 'https://www.artic.edu/artworks/$1'),
"P2511": ("MSK Gent work PID", "http://mskgent.be/collection/work/id/$1"), 'P4611': ('LACMA ID', 'https://collections.lacma.org/node/$1'),
"P2539": ( 'P4625': ('Museum of Fine Arts, Boston object ID', 'https://www.mfa.org/collections/object/$1'),
"Nationalmuseum Sweden artwork ID", 'P4643': ('Philadelphia Museum of Art ID', 'http://www.philamuseum.org/collections/permanent/$1.html'),
"http://collection.nationalmuseum.se/eMuseumPlus?service=ExternalInterface&module=collection&objectId=$1&viewType=detailView", 'P4659': ("Musée d'Orsay artwork ID", 'http://www.musee-orsay.fr/en/collections/index-of-works/notice.html?nnumid=$1'),
), 'P4673': ('Museum of Fine Arts, Houston object ID', 'https://www.mfah.org/art/detail/$1'),
"P2582": ( 'P4674': ('Indianapolis Museum of Art artwork ID', 'http://collection.imamuseum.org/artwork/$1/'),
"J. Paul Getty Museum object ID", 'P4683': ('National Gallery of Art artwork ID', 'https://www.nga.gov/content/ngaweb/Collection/art-object-page.$1.html'),
"http://www.getty.edu/art/collection/objects/$1", 'P4684': ('National Gallery of Victoria artwork ID', 'https://www.ngv.vic.gov.au/explore/collection/work/$1/'),
), 'P4686': ('Carnegie Museum of Art ID', 'https://collection.cmoa.org/objects/$1'),
"P3272": ( 'P4692': ('American Art Collaborative object ID', 'http://browse.americanartcollaborative.org/object/$1.html'),
"Zeri image ID", 'P4701': ('Google Arts & Culture asset ID', 'https://artsandculture.google.com/asset/wd/$1'),
"http://catalogo.fondazionezeri.unibo.it/scheda/opera/$1/", 'P4704': ('Smithsonian American Art Museum ID', 'https://americanart.si.edu/collections/search/artwork/?id=$1'),
), 'P4709': ('Barnes Foundation ID', 'https://collection.barnesfoundation.org/objects/$1/details'),
"P3293": ("BALaT object ID", "http://balat.kikirpa.be/object/$1"), 'P4712': ('Minneapolis Institute of Art artwork ID', 'https://collections.artsmia.org/art/$1'),
"P3386": ( 'P4713': ('Walters Art Museum ID', 'http://art.thewalters.org/detail/$1'),
"French Sculpture Census work ID", 'P4721': ('MuBE Virtual ID', 'http://mubevirtual.com.br/pt_br?Dados&area=ver&id=$1'),
"https://frenchsculpture.org/en/sculpture/$1", 'P4737': ('Solomon R. Guggenheim Foundation artwork ID', 'https://www.guggenheim.org/artwork/$1'),
), 'P4738': ('Yale Center for British Art artwork ID', 'http://collections.britishart.yale.edu/vufind/Record/$1'),
"P3467": ( 'P4739': ('Musée des Augustins artwork ID', 'https://www.augustins.org/fr/oeuvre/-/oeuvre/$1'),
"Inventario Sculture - Polo Museale Fiorentino", 'P4740': ('Brooklyn Museum artwork ID', 'https://www.brooklynmuseum.org/opencollection/objects/$1'),
"http://www.polomuseale.firenze.it/invSculture/scheda.asp?position=1&ninv=$1", 'P4761': ("Images d'Art artwork ID", 'http://art.rmngp.fr/en/library/artworks/$1'),
), 'P4764': ('Arcade artwork ID', 'http://www.culture.gouv.fr/public/mistral/arcade_fr?ACTION=CHERCHER&FIELD_1=REF&VALUE_1=$1'),
"P3504": ( 'P4814': ('Inventories of American Painting and Sculpture control number', 'https://siris-artinventories.si.edu/ipac20/ipac.jsp?&menu=search&index=.NW&term=$1'),
"Florentine Inventario Palatina art ID", 'P4905': ('KMSKA work PID', 'http://kmska.be/collection/work/id/$1'),
"http://www.polomuseale.firenze.it/invpalatina/scheda.asp?position=1&ninv=$1", 'P5210': ('National Gallery of Armenia work ID', 'http://www.gallery.am/en/database/item/$1/'),
), 'P5223': ('Information Center for Israeli Art artwork ID', 'http://museum.imj.org.il/artcenter/includes/item.asp?id=$1'),
"P3634": ("The Met object ID", "http://www.metmuseum.org/art/collection/search/$1"), 'P5265': ('Dordrechts Museum artwork ID', 'https://www.dordrechtsmuseum.nl/objecten/id/$1'),
"P3711": ( 'P5268': ('MNAV work ID', 'http://acervo.mnav.gub.uy/obras.php?q=ni:$1'),
"Vanderkrogt.net Statues ID", 'P5269': ('Web umenia work ID', 'https://www.webumenia.sk/dielo/$1'),
"http://vanderkrogt.net/statues/object.php?record=$1", 'P5407': ('MHK object ID', 'http://datenbank.museum-kassel.de/$1'),
), 'P5499': ('Boijmans work ID', 'https://www.boijmans.nl/en/collection/artworks/$1'),
"P3855": ( 'P5783': ('Cranach Digital Archive artwork ID', 'http://lucascranach.org/$1'),
"LombardiaBeniCulturali artwork ID", 'P5823': ('Belvedere object ID', 'https://digital.belvedere.at/objects/$1/'),
"http://www.lombardiabeniculturali.it/opere-arte/schede/$1/", 'P5891': ('Bpk-ID', 'http://www.bpk-images.de/id/$1'),
), 'P6004': ('Brasiliana Iconográfica ID', 'https://www.brasilianaiconografica.art.br/obras/$1/wd'),
"P3929": ("V&A item ID", "http://collections.vam.ac.uk/item/$1"), 'P6007': ('Salons ID', 'http://salons.musee-orsay.fr/index/notice/$1'),
"P4144": ( 'P6020': ("d'Art d'Art ! ID", 'https://www.france.tv/france-2/d-art-d-art/$1.html'),
"Athenaeum artwork ID", 'P6141': ('À nos grands hommes ID', 'https://anosgrandshommes.musee-orsay.fr/index.php/Detail/objects/$1'),
"http://www.the-athenaeum.org/art/detail.php?id=$1", 'P6152': ('National Portrait Gallery (United States) object ID', 'http://npg.si.edu/object/npg_$1'),
), 'P6238': ('Monument aux morts ID', 'https://monumentsmorts.univ-lille.fr/monument/$1/wd/'),
"P4257": ( 'P6239': ('IEC commemorative monument of Catalonia ID', 'https://monuments.iec.cat/fitxa.asp?id=$1'),
"National Museums of Japan e-museum ID", 'P6246': ('Paris Musées work ID', 'http://parismuseescollections.paris.fr/en/node/$1'),
"http://www.emuseum.jp/detail/$1", 'P6310': ('Muséosphère work ID', 'http://museosphere.paris.fr/oeuvres/$1'),
), 'P6332': ("Panorama de l'art ID", 'https://www.panoramadelart.com/$1'),
"P4373": ( 'P6355': ('MNAM artwork ID', 'https://collection.centrepompidou.fr/#/artwork/$1'),
"National Trust Collections ID", 'P6356': ('IHOI work ID', 'http://www.ihoi.org/app/photopro.sk/ihoi_icono/detail?docid=$1&lang=eng'),
"http://www.nationaltrustcollections.org.uk/object/$1", 'P6358': ('Musée Picasso artwork ID', 'https://www.navigart.fr/picassoparis/#/artwork/$1'),
), 'P6372': ('Interpol WOA artwork ID (OBSOLETE)', 'https://www.interpol.int/notice/search/woa/$1'),
"P4380": ("Sandrart.net artwork ID", "http://ta.sandrart.net/-artwork-$1"), 'P6374': ('MAMVP artwork ID', 'http://www.mam.paris.fr/en/online-collections#/artwork/$1'),
"P4399": ( 'P6489': ('Joan Miró Online Image Bank ID', 'https://www.successiomiro.com/catalogue/object/$1'),
"Enciclopédia Itaú Cultural ID", 'P6506': ('Eliseu Visconti Project ID', 'https://eliseuvisconti.com.br/obra/$1'),
"http://enciclopedia.itaucultural.org.br/$1", 'P6565': ('Musenor artwork ID', 'https://webmuseo.com/ws/musenor/app/collection/record/$1'),
), 'P6576': ('Art Fund artwork ID', 'https://www.artfund.org/supporting-museums/art-weve-helped-buy/artwork/$1/wd'),
"P4525": ("MuIS object ID", "http://opendata.muis.ee/object/$1"), 'P6595': ('Paintings by Salvador Dalí ID', 'https://www.salvador-dali.org/en/artwork/catalogue-raisonne/obra/$1/'),
"P4564": ("Art Museum of Estonia artwork ID", "https://digikogu.ekm.ee/oid-$1"), 'P6610': ('Ashmolean museum ID', 'http://collections.ashmolean.org/object/$1'),
"P4582": ( 'P6625': ('Salvador Dali Museum ID', 'http://archive.thedali.org/mwebcgi/mweb.exe?request=record;id=$1;type=101'),
"Kulturelles Erbe Köln object ID", 'P6629': ('Artcurial lot ID', 'https://www.artcurial.com/en/$1'),
"https://www.kulturelles-erbe-koeln.de/documents/obj/$1", 'P6631': ('Tainacan MHN ID', 'http://mhn.acervos.museus.gov.br/reserva-tecnica/$1'),
), 'P6633': ('Cini Foundation ID', 'http://arte.cini.it/Opere/$1'),
"P4610": ("ARTIC artwork ID", "https://www.artic.edu/artworks/$1"), 'P6643': ('TV Spielfilm series ID', 'https://www.tvspielfilm.de/serien/$1'),
"P4611": ("LACMA ID", "https://collections.lacma.org/node/$1"), 'P6738': ('Whitney Museum of American Art artwork ID', 'https://whitney.org/collection/works/$1'),
"P4625": ( 'P7229': ('Fundación Goya en Aragón ID', 'https://fundaciongoyaenaragon.es/obra/wd/$1'),
"Museum of Fine Arts, Boston object ID",
"https://www.mfa.org/collections/object/$1",
),
"P4643": (
"Philadelphia Museum of Art ID",
"http://www.philamuseum.org/collections/permanent/$1.html",
),
"P4659": (
"Musée d'Orsay artwork ID",
"http://www.musee-orsay.fr/en/collections/index-of-works/notice.html?nnumid=$1",
),
"P4673": (
"Museum of Fine Arts, Houston object ID",
"https://www.mfah.org/art/detail/$1",
),
"P4674": (
"Indianapolis Museum of Art artwork ID",
"http://collection.imamuseum.org/artwork/$1/",
),
"P4683": (
"National Gallery of Art artwork ID",
"https://www.nga.gov/content/ngaweb/Collection/art-object-page.$1.html",
),
"P4684": (
"National Gallery of Victoria artwork ID",
"https://www.ngv.vic.gov.au/explore/collection/work/$1/",
),
"P4686": ("Carnegie Museum of Art ID", "https://collection.cmoa.org/objects/$1"),
"P4692": (
"American Art Collaborative object ID",
"http://browse.americanartcollaborative.org/object/$1.html",
),
"P4701": (
"Google Arts & Culture asset ID",
"https://artsandculture.google.com/asset/wd/$1",
),
"P4704": (
"Smithsonian American Art Museum ID",
"https://americanart.si.edu/collections/search/artwork/?id=$1",
),
"P4709": (
"Barnes Foundation ID",
"https://collection.barnesfoundation.org/objects/$1/details",
),
"P4712": (
"Minneapolis Institute of Art artwork ID",
"https://collections.artsmia.org/art/$1",
),
"P4713": ("Walters Art Museum ID", "http://art.thewalters.org/detail/$1"),
"P4721": (
"MuBE Virtual ID",
"http://mubevirtual.com.br/pt_br?Dados&area=ver&id=$1",
),
"P4737": (
"Solomon R. Guggenheim Foundation artwork ID",
"https://www.guggenheim.org/artwork/$1",
),
"P4738": (
"Yale Center for British Art artwork ID",
"http://collections.britishart.yale.edu/vufind/Record/$1",
),
"P4739": (
"Musée des Augustins artwork ID",
"https://www.augustins.org/fr/oeuvre/-/oeuvre/$1",
),
"P4740": (
"Brooklyn Museum artwork ID",
"https://www.brooklynmuseum.org/opencollection/objects/$1",
),
"P4761": ("Images d'Art artwork ID", "http://art.rmngp.fr/en/library/artworks/$1"),
"P4764": (
"Arcade artwork ID",
"http://www.culture.gouv.fr/public/mistral/arcade_fr?ACTION=CHERCHER&FIELD_1=REF&VALUE_1=$1",
),
"P4814": (
"Inventories of American Painting and Sculpture control number",
"https://siris-artinventories.si.edu/ipac20/ipac.jsp?&menu=search&index=.NW&term=$1",
),
"P4905": ("KMSKA work PID", "http://kmska.be/collection/work/id/$1"),
"P5210": (
"National Gallery of Armenia work ID",
"http://www.gallery.am/en/database/item/$1/",
),
"P5223": (
"Information Center for Israeli Art artwork ID",
"http://museum.imj.org.il/artcenter/includes/item.asp?id=$1",
),
"P5265": (
"Dordrechts Museum artwork ID",
"https://www.dordrechtsmuseum.nl/objecten/id/$1",
),
"P5268": ("MNAV work ID", "http://acervo.mnav.gub.uy/obras.php?q=ni:$1"),
"P5269": ("Web umenia work ID", "https://www.webumenia.sk/dielo/$1"),
"P5407": ("MHK object ID", "http://datenbank.museum-kassel.de/$1"),
"P5499": ("Boijmans work ID", "https://www.boijmans.nl/en/collection/artworks/$1"),
"P5783": ("Cranach Digital Archive artwork ID", "http://lucascranach.org/$1"),
"P5823": ("Belvedere object ID", "https://digital.belvedere.at/objects/$1/"),
"P5891": ("Bpk-ID", "http://www.bpk-images.de/id/$1"),
"P6004": (
"Brasiliana Iconográfica ID",
"https://www.brasilianaiconografica.art.br/obras/$1/wd",
),
"P6007": ("Salons ID", "http://salons.musee-orsay.fr/index/notice/$1"),
"P6020": ("d'Art d'Art ! ID", "https://www.france.tv/france-2/d-art-d-art/$1.html"),
"P6141": (
"À nos grands hommes ID",
"https://anosgrandshommes.musee-orsay.fr/index.php/Detail/objects/$1",
),
"P6152": (
"National Portrait Gallery (United States) object ID",
"http://npg.si.edu/object/npg_$1",
),
"P6238": (
"Monument aux morts ID",
"https://monumentsmorts.univ-lille.fr/monument/$1/wd/",
),
"P6239": (
"IEC commemorative monument of Catalonia ID",
"https://monuments.iec.cat/fitxa.asp?id=$1",
),
"P6246": (
"Paris Musées work ID",
"http://parismuseescollections.paris.fr/en/node/$1",
),
"P6310": ("Muséosphère work ID", "http://museosphere.paris.fr/oeuvres/$1"),
"P6332": ("Panorama de l'art ID", "https://www.panoramadelart.com/$1"),
"P6355": ("MNAM artwork ID", "https://collection.centrepompidou.fr/#/artwork/$1"),
"P6356": (
"IHOI work ID",
"http://www.ihoi.org/app/photopro.sk/ihoi_icono/detail?docid=$1&lang=eng",
),
"P6358": (
"Musée Picasso artwork ID",
"https://www.navigart.fr/picassoparis/#/artwork/$1",
),
"P6372": (
"Interpol WOA artwork ID (OBSOLETE)",
"https://www.interpol.int/notice/search/woa/$1",
),
"P6374": (
"MAMVP artwork ID",
"http://www.mam.paris.fr/en/online-collections#/artwork/$1",
),
"P6489": (
"Joan Miró Online Image Bank ID",
"https://www.successiomiro.com/catalogue/object/$1",
),
"P6506": ("Eliseu Visconti Project ID", "https://eliseuvisconti.com.br/obra/$1"),
"P6565": (
"Musenor artwork ID",
"https://webmuseo.com/ws/musenor/app/collection/record/$1",
),
"P6576": (
"Art Fund artwork ID",
"https://www.artfund.org/supporting-museums/art-weve-helped-buy/artwork/$1/wd",
),
"P6595": (
"Paintings by Salvador Dalí ID",
"https://www.salvador-dali.org/en/artwork/catalogue-raisonne/obra/$1/",
),
"P6610": ("Ashmolean museum ID", "http://collections.ashmolean.org/object/$1"),
"P6625": (
"Salvador Dali Museum ID",
"http://archive.thedali.org/mwebcgi/mweb.exe?request=record;id=$1;type=101",
),
"P6629": ("Artcurial lot ID", "https://www.artcurial.com/en/$1"),
"P6631": ("Tainacan MHN ID", "http://mhn.acervos.museus.gov.br/reserva-tecnica/$1"),
"P6633": ("Cini Foundation ID", "http://arte.cini.it/Opere/$1"),
"P6643": ("TV Spielfilm series ID", "https://www.tvspielfilm.de/serien/$1"),
"P6738": (
"Whitney Museum of American Art artwork ID",
"https://whitney.org/collection/works/$1",
),
"P7229": (
"Fundación Goya en Aragón ID",
"https://fundaciongoyaenaragon.es/obra/wd/$1",
),
} }
def lookup(property_id: str, value: str) -> dict[str, str]:
    """Resolve a catalog property ID and value to its label and catalog URL."""
    label, formatter = table[property_id]
    return {
        "label": label,
        "url": formatter.replace("$1", value),
        "value": value,
    }
def find_catalog_id(entity: Entity) -> set[str]:
    """Return the catalog property IDs that appear in the entity's claims."""
    matching: set[str] = set(table) & set(entity["claims"])
    return matching
if 'P4704' in entity['claims']:
def check_catalog(entity: Entity, catalog: CatalogDict) -> None: saam_id = wikibase.first_datavalue(entity, 'P4704')
"""Check catalog."""
catalog_url = catalog["url"]
catalog_ids = catalog["ids"]
if "P4704" in entity["claims"]:
saam_id = wikibase.first_datavalue(entity, "P4704")
assert saam_id and isinstance(saam_id, (str, int))
cat = saam.get_catalog(saam_id) cat = saam.get_catalog(saam_id)
if cat: if cat:
catalog.update(cat) catalog.update(cat)
return None return
if "P4709" in entity["claims"]: if 'P4709' in entity['claims']:
catalog_id = wikibase.first_datavalue(entity, "P4709") catalog_id = wikibase.first_datavalue(entity, 'P4709')
assert catalog_id and isinstance(catalog_id, (str, int))
cat = barnesfoundation.get_catalog(catalog_id) cat = barnesfoundation.get_catalog(catalog_id)
if cat: if cat:
catalog.update(cat) catalog.update(cat)
return None return
institutions = [ institutions = [
("www.dia.org", dia), ('www.dia.org', dia),
("www.rijksmuseum.nl", rijksmuseum), ('www.rijksmuseum.nl', rijksmuseum),
("www.npg.org.uk", npg), ('www.npg.org.uk', npg),
("www.museodelprado.es", museodelprado), ('www.museodelprado.es', museodelprado),
] ]
if catalog_url: if catalog_url:
@ -349,26 +151,25 @@ def check_catalog(entity: Entity, catalog: CatalogDict) -> None:
if not cat: if not cat:
continue continue
catalog.update(cat) catalog.update(cat)
return None return
try: try:
html = get_catalog_url(catalog_url) html = get_catalog_url(catalog_url)
if html: if html:
description = get_description_from_page(html) description = get_description_from_page(html)
if description: if description:
catalog["description"] = description catalog['description'] = description
return None return
except UnicodeDecodeError: except UnicodeDecodeError:
return None return
for property_id in sorted(catalog_ids): for property_id in sorted(catalog_ids):
if property_id == "P350": if property_id == 'P350':
continue # RKDimages ID continue # RKDimages ID
value = wikibase.first_datavalue(entity, property_id) value = wikibase.first_datavalue(entity, property_id)
# identifier can be 'no value', example: Q26754456 # identifier can be 'no value', example: Q26754456
if value is None: if value is None:
continue continue
assert isinstance(value, str)
detail = lookup(property_id, value) detail = lookup(property_id, value)
try: try:
html = get_catalog_page(property_id, value) html = get_catalog_page(property_id, value)
@ -380,13 +181,11 @@ def check_catalog(entity: Entity, catalog: CatalogDict) -> None:
if not description: if not description:
continue continue
catalog = { catalog = {
"institution": detail["label"], 'institution': detail['label'],
"description": description, 'description': description,
} }
def get_catalog_from_artwork(entity):
def get_catalog_from_artwork(entity: Entity) -> CatalogDict:
"""Get catalog from artwork."""
catalog_ids = find_catalog_id(entity) catalog_ids = find_catalog_id(entity)
catalog_detail = [] catalog_detail = []
for property_id in sorted(catalog_ids): for property_id in sorted(catalog_ids):
@ -394,54 +193,47 @@ def get_catalog_from_artwork(entity: Entity) -> CatalogDict:
# identifier can be 'no value', example: Q26754456 # identifier can be 'no value', example: Q26754456
if value is None: if value is None:
continue continue
assert isinstance(value, str)
detail = lookup(property_id, value) detail = lookup(property_id, value)
catalog_detail.append(detail) catalog_detail.append(detail)
url = wikibase.first_datavalue(entity, "P973") catalog = {
assert isinstance(url, str) 'url': wikibase.first_datavalue(entity, 'P973'),
catalog: CatalogDict = { 'detail': catalog_detail,
"url": url, 'ids': catalog_ids,
"detail": catalog_detail,
"ids": catalog_ids,
} }
try: try:
check_catalog(entity, catalog) check_catalog(entity, catalog)
except ( except (requests.exceptions.ReadTimeout,
requests.exceptions.ReadTimeout,
requests.exceptions.ConnectTimeout, requests.exceptions.ConnectTimeout,
requests.exceptions.ConnectionError, requests.exceptions.ConnectionError,
requests.exceptions.TooManyRedirects, requests.exceptions.TooManyRedirects):
):
pass pass
return catalog return catalog
def get_description_from_page(html):
def get_description_from_page(html: bytes) -> str | None:
"""Check HTML for description of artwork."""
if not html: if not html:
return None return
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
div = root.find('.//div[@itemprop="description"]') div = root.find('.//div[@itemprop="description"]')
if div is not None: if div is not None:
return div.text return div.text
div_list = root.find_class("item-description") div_list = root.find_class('item-description')
if len(div_list): if len(div_list):
return div_list[0].text_content() return div_list[0].text_content()
meta_twitter_description = root.find('.//meta[@name="twitter:description"]') meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
if meta_twitter_description is None: if meta_twitter_description is None:
return None return
twitter_description = meta_twitter_description.get("content") twitter_description = meta_twitter_description.get('content')
if not twitter_description: if not twitter_description:
return None return
twitter_description = twitter_description.strip() twitter_description = twitter_description.strip()
if not twitter_description: if not twitter_description:
return None return
for element in root.getiterator(): for element in root.getiterator():
if not element.text: if not element.text:
@ -454,35 +246,33 @@ def get_description_from_page(html: bytes) -> str | None:
return twitter_description return twitter_description
def get_catalog_page(property_id, value):
def get_catalog_page(property_id: str, value: str) -> bytes:
"""Get catalog page."""
detail = lookup(property_id, value) detail = lookup(property_id, value)
url = detail["url"] url = detail['url']
catalog_id = value.replace("/", "_") catalog_id = value.replace('/', '_')
filename = f"cache/{property_id}_{catalog_id}.html" filename = f'cache/{property_id}_{catalog_id}.html'
if os.path.exists(filename): if os.path.exists(filename):
html = open(filename, "rb").read() html = open(filename, 'rb').read()
else: else:
r = requests.get(url, headers={"User-Agent": user_agent}, timeout=2) r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
html = r.content html = r.content
open(filename, "wb").write(html) open(filename, 'wb').write(html)
return html return html
def get_catalog_url(url):
def get_catalog_url(url: str) -> bytes: md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
"""Get catalog URL and cache.""" filename = 'cache/' + md5_filename
md5_filename = hashlib.md5(url.encode("utf-8")).hexdigest() + ".html"
filename = "cache/" + md5_filename
if os.path.exists(filename): if os.path.exists(filename):
html = open(filename, "rb").read() html = open(filename, 'rb').read()
else: else:
r = relaxed_ssl.get(url, headers={"User-Agent": user_agent}, timeout=2) r = relaxed_ssl.get(url,
headers={'User-Agent': user_agent},
timeout=2)
html = r.content html = r.content
open(filename, "wb").write(html) open(filename, 'wb').write(html)
return html return html

View file

@ -1,93 +1,59 @@
"""Wikidata query service.""" import requests
import hashlib
import json import json
import os
import typing
import urllib.parse import urllib.parse
import os
import dateutil.parser
import hashlib
from flask import request, render_template, g
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
import dateutil.parser
import requests
from flask import g, render_template, request
from . import database, utils
from .model import WikidataQuery from .model import WikidataQuery
from . import utils, database
query_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql" query_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
url_start = "http://www.wikidata.org/entity/Q" url_start = 'http://www.wikidata.org/entity/Q'
commons_start = "http://commons.wikimedia.org/wiki/Special:FilePath/" commons_start = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
Row = dict[str, dict[str, typing.Any]]
class QueryError(Exception): class QueryError(Exception):
"""Query error.""" def __init__(self, query, r):
query: str
r: requests.Response
def __init__(self, query: str, r: requests.Response) -> None:
"""Init."""
self.query = query self.query = query
self.r = r self.r = r
class QueryTimeout(QueryError): class QueryTimeout(QueryError):
"""Query timeout.""" def __init__(self, query, r):
def __init__(self, query: str, r: requests.Response) -> None:
"""Init."""
self.query = query self.query = query
self.r = r self.r = r
def row_id(row, field='item'):
return int(utils.drop_start(row[field]['value'], url_start))
def row_id(row: Row, field: str = "item") -> int: def get_row_value(row, field):
"""Get item_id for row.""" return row[field]['value'] if field in row else None
return int(utils.drop_start(row[field]["value"], url_start))
def get_row_text(row, field):
if field in row and 'xml:lang' in row[field]:
return row[field]['value']
def get_row_value(row: Row, field: str) -> str | None: def commons_uri_to_filename(uri):
"""Get value from row field."""
return typing.cast(str, row[field]["value"]) if field in row else None
def get_row_text(row: Row, field: str) -> str | None:
"""Get text from row field."""
return row[field]["value"] if field in row and "xml:lang" in row[field] else None
def commons_uri_to_filename(uri: str) -> str:
"""Commons URI to filename."""
return urllib.parse.unquote(utils.drop_start(uri, commons_start)) return urllib.parse.unquote(utils.drop_start(uri, commons_start))
def run_from_template(template_name, **context):
def run_from_template(template_name: str, **context: typing.Any) -> requests.Response:
query = render_template(template_name, **context) query = render_template(template_name, **context)
return run_query(query, query_template=template_name) return run_query(query, query_template=template_name)
def run_from_template_with_cache(template_name, cache_name=None, **context):
def run_from_template_with_cache(
template_name: str, cache_name: str | None = None, **context: typing.Any
) -> list[Row]:
query = render_template(template_name, **context) query = render_template(template_name, **context)
return run_query_with_cache(query, name=cache_name, query_template=template_name) return run_query_with_cache(query, name=cache_name, query_template=template_name)
def run_query(query, **kwargs):
def run_query(query: str, **kwargs: typing.Any) -> requests.Response:
"""Run WDQS query."""
r, db_query = record_query(query, **kwargs) r, db_query = record_query(query, **kwargs)
return r return r
def record_query(query, query_template=None):
def record_query( params = {'query': query, 'format': 'json'}
query: str, query_template: str | None = None
) -> tuple[requests.Response, WikidataQuery]:
params = {"query": query, "format": "json"}
start = datetime.utcnow() start = datetime.utcnow()
path = request.full_path.rstrip("?") if request else None path = request.full_path.rstrip('?') if request else None
endpoint = request.endpoint if request else None endpoint = request.endpoint if request else None
db_query = WikidataQuery( db_query = WikidataQuery(
@ -95,9 +61,8 @@ def record_query(
sparql_query=query, sparql_query=query,
path=path, path=path,
query_template=query_template, query_template=query_template,
page_title=getattr(g, "title", None), page_title=getattr(g, 'title', None),
endpoint=endpoint, endpoint=endpoint)
)
database.session.add(db_query) database.session.add(db_query)
database.session.commit() database.session.commit()
@ -109,84 +74,78 @@ def record_query(
db_query.error_text = r.text db_query.error_text = r.text
database.session.commit() database.session.commit()
if "java.util.concurrent.TimeoutException" in r.text: if 'java.util.concurrent.TimeoutException' in r.text:
raise QueryTimeout(query, r) raise QueryTimeout(params, r)
else: else:
raise QueryError(query, r) raise QueryError(params, r)
database.session.commit() database.session.commit()
return r, db_query return r, db_query
def md5_query(query):
''' generate the md5 hexdigest of a SPARQL query '''
return hashlib.md5(query.encode('utf-8')).hexdigest()
def md5_query(query: str) -> str: def run_query_with_cache(q, name=None, query_template=None):
"""generate the md5 hexdigest of a SPARQL query."""
return hashlib.md5(query.encode("utf-8")).hexdigest()
def run_query_with_cache(
q: str, name: str | None = None, query_template: str | None = None
) -> list[Row]:
if name is None: if name is None:
name = md5_query(q) name = md5_query(q)
filename = f"cache/{name}.json" filename = f'cache/{name}.json'
if os.path.exists(filename): if os.path.exists(filename):
from_cache = json.load(open(filename)) from_cache = json.load(open(filename))
if isinstance(from_cache, dict) and from_cache.get("query") == q: if isinstance(from_cache, dict) and from_cache.get('query') == q:
return typing.cast(list[Row], from_cache["bindings"]) return from_cache['bindings']
r, db_query = record_query(q, query_template=query_template) r, db_query = record_query(q, query_template=query_template)
bindings: list[Row] = r.json()["results"]["bindings"] bindings = r.json()['results']['bindings']
json.dump({"query": q, "bindings": bindings}, open(filename, "w"), indent=2) json.dump({'query': q, 'bindings': bindings},
open(filename, 'w'), indent=2)
db_query.row_count = len(bindings) db_query.row_count = len(bindings)
database.session.commit() database.session.commit()
return bindings return bindings
def format_time(row_time, row_timeprecision):
def format_time(row_time: dict[str, str], row_timeprecision: dict[str, int]) -> str: t = dateutil.parser.parse(row_time['value'])
"""Format time with given precision.""" precision = int(row_timeprecision['value'])
t = dateutil.parser.parse(row_time["value"])
precision = int(row_timeprecision["value"])
if precision == 9: if precision == 9:
return str(t.year) return t.year
if precision == 8: if precision == 8:
return f"{t.year}s" return f'{t.year}s'
if precision == 7: if precision == 7:
return f"{utils.ordinal((t.year // 100) + 1)} century" return f'{utils.ordinal((t.year // 100) + 1)} century'
if precision == 6: if precision == 6:
return f"{utils.ordinal((t.year // 1000) + 1)} millennium" return f'{utils.ordinal((t.year // 1000) + 1)} millennium'
return row_time["value"] return row_time['value']
def build_browse_item_map(bindings):
def build_browse_item_map(bindings: list[Row]) -> dict[int, dict[str, typing.Any]]:
row_map = defaultdict(list) row_map = defaultdict(list)
for row in bindings: for row in bindings:
item_id = row_id(row) item_id = row_id(row)
label = row["itemLabel"]["value"] label = row['itemLabel']['value']
image_filename = commons_uri_to_filename(row["image"]["value"]) image_filename = commons_uri_to_filename(row['image']['value'])
artist_name = get_row_value(row, "artistLabel") artist_name = get_row_value(row, 'artistLabel')
d = format_time(row["time"], row["timeprecision"]) if "time" in row else None d = format_time(row['time'], row['timeprecision']) if 'time' in row else None
row_qid = f"Q{item_id}" row_qid = f'Q{item_id}'
item = { item = {
"image_filename": image_filename, 'image_filename': image_filename,
"date": d, 'date': d,
"depicts": row["depictsList"]["value"].split("|"), 'depicts': row['depictsList']['value'].split('|'),
} }
if artist_name: if artist_name:
item["artist_name"] = artist_name item['artist_name'] = artist_name
if label != row_qid: if label != row_qid:
item["label"] = label item['label'] = label
title = get_row_value(row, "title") title = get_row_value(row, 'title')
if title: if title:
lang = get_row_value(row, "titleLang") lang = get_row_value(row, 'titleLang')
item["title"] = (lang, title) item['title'] = (lang, title)
row_map[item_id].append(item) row_map[item_id].append(item)
@ -199,59 +158,53 @@ def build_browse_item_map(bindings: list[Row]) -> dict[int, dict[str, typing.Any
when = None when = None
depicts = [] depicts = []
for item in items: for item in items:
if "title" in item: if 'title' in item:
lang, title = item["title"] lang, title = item['title']
titles[lang] = title titles[lang] = title
filenames.add(item["image_filename"]) filenames.add(item['image_filename'])
artist_name = item.get("artist_name") artist_name = item.get('artist_name')
if artist_name and artist_name not in artist_names: if artist_name and artist_name not in artist_names:
artist_names.append(artist_name) artist_names.append(artist_name)
if "label" in item: if 'label' in item:
labels.add(item["label"]) labels.add(item['label'])
if when is None and item.get("date"): if when is None and item.get('date'):
when = item["date"] when = item['date']
for d in item["depicts"]: for d in item['depicts']:
if d not in depicts: if d not in depicts:
depicts.append(d) depicts.append(d)
item = { item = {
"qid": f"Q{item_id}", 'qid': f'Q{item_id}',
"item_id": item_id, 'item_id': item_id,
"image_filename": list(filenames), 'image_filename': list(filenames),
"artist_name": ", ".join(artist_names), 'artist_name': ', '.join(artist_names),
"date": when, 'date': when,
"depicts": depicts, 'depicts': depicts,
} }
if artist_names: if artist_names:
item["artist_name"] = ", ".join(artist_names) item['artist_name'] = ', '.join(artist_names)
if labels: if labels:
assert len(labels) == 1 assert len(labels) == 1
item["label"] = list(labels)[0] item['label'] = list(labels)[0]
elif "en" in titles: elif 'en' in titles:
item["label"] = titles["en"] item['label'] = titles['en']
else: else:
item["label"] = "[ label missing ]" item['label'] = '[ label missing ]'
item_map[item_id] = item item_map[item_id] = item
return item_map return item_map
def quote_list(l):
no_dups = list(dict.fromkeys(l)) # remove duplicates
return ' '.join('("' + s.replace('"', '\\"') + '")' for s in no_dups)
def quote_list(list_of_strings: list[str]) -> str: def url_list(l):
"""Quote strings and combine into list for SPARQL query.""" no_dups = list(dict.fromkeys(l)) # remove duplicates
no_dups = list(dict.fromkeys(list_of_strings)) # remove duplicates return ' '.join(f'(<{s}>)' for s in no_dups)
return " ".join('("' + s.replace('"', '\\"') + '")' for s in no_dups)
def is_artificial_physical_object(qid):
def url_list(urls: list[str]) -> str: bindings = run_from_template_with_cache('query/item_type.sparql', qid=qid)
"""Combine URLs into list for SPARQL query.""" types = {row_id(row, field='item') for row in bindings}
no_dups = list(dict.fromkeys(urls)) # remove duplicates
return " ".join(f"(<{s}>)" for s in no_dups)
def is_artificial_physical_object(qid: str) -> bool:
"""Item is artificial physical object."""
bindings = run_from_template_with_cache("query/item_type.sparql", qid=qid)
types = {row_id(row, field="item") for row in bindings}
# Q8205328 == artificial physical object # Q8205328 == artificial physical object
return 8205328 in types return 8205328 in types

View file

@ -1,51 +1,25 @@
"""Wikibase functions.""" def first_datavalue(entity, pid):
if pid in entity['claims']:
mainsnak = entity['claims'][pid][0]['mainsnak']
if 'datavalue' in mainsnak:
return mainsnak['datavalue']['value']
import typing def get_entity_label(entity):
if 'labels' not in entity:
from .type import Entity
def first_datavalue(
entity: Entity, pid: str
) -> str | int | None | dict[str, typing.Any]:
"""Get first datavalue from claim."""
if pid in entity["claims"]:
mainsnak = entity["claims"][pid][0]["mainsnak"]
if "datavalue" in mainsnak:
v = mainsnak["datavalue"]["value"]
assert isinstance(v, str | int)
return v
return None return None
if 'en' in entity['labels']:
return entity['labels']['en']['value']
label_values = {l['value'] for l in entity['labels'].values()}
def get_entity_label(entity: Entity) -> str | None:
"""Get entity label."""
if "labels" not in entity:
return None
label: str
if "en" in entity["labels"]:
label = entity["labels"]["en"]["value"]
assert isinstance(label, str)
return label
label_values = {lang["value"] for lang in entity["labels"].values()}
if len(label_values) == 1: if len(label_values) == 1:
label = list(label_values)[0] return list(label_values)[0]
assert isinstance(label, str)
return label
return None
def get_en_value(entity, key):
if 'en' in entity[key]:
return entity[key]['en']['value']
def get_en_value(entity: Entity, key: str) -> str | None: def get_en_label(entity):
"""Get English value from label or description.""" return get_en_value(entity, 'labels')
return entity[key]["en"]["value"] if "en" in entity[key] else None # type: ignore
def get_en_description(entity):
def get_en_label(entity: Entity) -> str | None: return get_en_value(entity, 'descriptions')
"""Get English label."""
return get_en_value(entity, "labels")
def get_en_description(entity: Entity) -> str | None:
"""Get English description of enttity."""
return get_en_value(entity, "descriptions")

View file

@ -1,24 +1,17 @@
"""Make an edit to Wikidata."""
from depicts import mediawiki, wikibase from depicts import mediawiki, wikibase
from depicts.model import DepictsItem from depicts.model import DepictsItem
def create_depicts_item(item_id):
def create_depicts_item(item_id: int) -> DepictsItem: qid = f'Q{item_id}'
"""Create depicts item."""
qid = f"Q{item_id}"
entity = mediawiki.get_entity(qid) entity = mediawiki.get_entity(qid)
assert entity
if "en" in entity["aliases"]: if 'en' in entity['aliases']:
alt_labels = {alt["value"] for alt in entity["aliases"]["en"]} alt_labels = {alt['value'] for alt in entity['aliases']['en']}
else: else:
alt_labels = set() alt_labels = set()
return DepictsItem( return DepictsItem(item_id=item_id,
item_id=item_id,
label=wikibase.get_en_label(entity), label=wikibase.get_en_label(entity),
description=wikibase.get_en_description(entity), description=wikibase.get_en_description(entity),
alt_labels=alt_labels, alt_labels=alt_labels,
count=0, count=0)
)

View file

@ -1,93 +1,67 @@
import typing
from urllib.parse import urlencode
import requests
from flask import current_app, session from flask import current_app, session
from requests_oauthlib import OAuth1Session from requests_oauthlib import OAuth1Session
from urllib.parse import urlencode
from .type import CallParams def get_edit_proxy():
edit_proxy = current_app.config.get('EDIT_PROXY')
def get_edit_proxy() -> dict[str, str]:
edit_proxy = current_app.config.get("EDIT_PROXY")
if edit_proxy: if edit_proxy:
return {"http": edit_proxy, "https": edit_proxy} return {'http': edit_proxy, 'https': edit_proxy}
else: else:
return {} return {}
def api_post_request(params):
def api_post_request(params: dict[str, str | int]) -> requests.Response:
app = current_app app = current_app
url = "https://www.wikidata.org/w/api.php" url = 'https://www.wikidata.org/w/api.php'
client_key = app.config["CLIENT_KEY"] client_key = app.config['CLIENT_KEY']
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config['CLIENT_SECRET']
oauth = OAuth1Session( oauth = OAuth1Session(client_key,
client_key,
client_secret=client_secret, client_secret=client_secret,
resource_owner_key=session["owner_key"], resource_owner_key=session['owner_key'],
resource_owner_secret=session["owner_secret"], resource_owner_secret=session['owner_secret'])
)
proxies = get_edit_proxy() proxies = get_edit_proxy()
r: requests.Response = oauth.post(url, data=params, timeout=4, proxies=proxies) return oauth.post(url, data=params, timeout=4, proxies=proxies)
return r
def raw_request(params):
def raw_request(params: dict[str, str | int]) -> requests.Response:
app = current_app app = current_app
url = "https://www.wikidata.org/w/api.php?" + urlencode(params) url = 'https://www.wikidata.org/w/api.php?' + urlencode(params)
client_key = app.config["CLIENT_KEY"] client_key = app.config['CLIENT_KEY']
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config['CLIENT_SECRET']
oauth = OAuth1Session( oauth = OAuth1Session(client_key,
client_key,
client_secret=client_secret, client_secret=client_secret,
resource_owner_key=session["owner_key"], resource_owner_key=session['owner_key'],
resource_owner_secret=session["owner_secret"], resource_owner_secret=session['owner_secret'])
)
proxies = get_edit_proxy() proxies = get_edit_proxy()
r: requests.Response = oauth.get(url, timeout=4, proxies=proxies) return oauth.get(url, timeout=4, proxies=proxies)
return r
def api_request(params):
return raw_request(params).json()
def api_request(params: dict[str, str | int]) -> dict[str, typing.Any]: def get_token():
data: dict[str, typing.Any] = raw_request(params).json() params = {
return data 'action': 'query',
'meta': 'tokens',
'format': 'json',
def get_token() -> str: 'formatversion': 2,
params: CallParams = {
"action": "query",
"meta": "tokens",
"format": "json",
"formatversion": 2,
} }
reply = api_request(params) reply = api_request(params)
token: str = reply["query"]["tokens"]["csrftoken"] token = reply['query']['tokens']['csrftoken']
return token return token
def userinfo_call():
def userinfo_call() -> dict[str, typing.Any]: params = {'action': 'query', 'meta': 'userinfo', 'format': 'json'}
params: dict[str, str | int] = {
"action": "query",
"meta": "userinfo",
"format": "json",
}
return api_request(params) return api_request(params)
def get_username():
if 'owner_key' not in session:
return # not authorized
def get_username() -> str | None: if 'username' in session:
if "owner_key" not in session: return session['username']
return None # not authorized
username: str
if "username" in session:
username = session["username"]
return username
reply = userinfo_call() reply = userinfo_call()
if "query" not in reply: if 'query' not in reply:
return None return
username = reply["query"]["userinfo"]["name"] session['username'] = reply['query']['userinfo']['name']
session["username"] = username
return username return session['username']

View file

@ -2,7 +2,7 @@
<html lang="en"> <html lang="en">
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN" crossorigin="anonymous"> <link rel="stylesheet" href="{{ url_for('static', filename='javascript/bootstrap4/css/bootstrap.min.css') }}">
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title> <title>
@ -27,7 +27,8 @@
{% block content %}{% endblock %} {% block content %}{% endblock %}
<script src="{{ url_for('static', filename='javascript/jquery/jquery.min.js') }}"></script> <script src="{{ url_for('static', filename='javascript/jquery/jquery.min.js') }}"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-C6RzsynM9kWDrMNeT87bh95OGNyZPhcTNXj1NW7RuBCsyN/o0jlpcV8Qyq46cDfL" crossorigin="anonymous"></script> {# <script src="{{ url_for('static', filename='javascript/popper.js/popper.min.js') }}"></script> #}
<script src="{{ url_for('static', filename='javascript/bootstrap4/js/bootstrap.min.js') }}"></script>
{% block script %}{% endblock %} {% block script %}{% endblock %}
</body> </body>

View file

@ -6,40 +6,16 @@
<style> <style>
div.description { margin-left: 2em; color: rgb(96, 96, 96); } div.description { margin-left: 2em; color: rgb(96, 96, 96); }
span.description { color: rgb(96, 96, 96); } span.description { color: rgb(96, 96, 96); }
#artwork {
position: fixed; /* This keeps the map in place when the page is scrolled */
top: 56px;
left: 0; /* Positioned on the right side */
width: 50%; /* Half the screen width */
bottom: 0px;
z-index: -1;
}
#artwork img {
object-fit: contain; /* The image will be scaled to maintain its aspect ratio */
width: 100%;
height: 100%;
}
#main {
float: right; /* Floats the main content to the right */
width: 48%; /* Adjusted width of the main content */
height: auto; /* Height is set to auto, allowing it to expand naturally */
margin-right: 1%;
}
</style> </style>
{% endblock %} {% endblock %}
{% block content %} {% block content %}
<div id="artwork"> <div class="container-fluid mt-2">
<div class="w-100 h-100"> <div class="row">
<img src="{{ image.thumburl }}" /> <div class="col-md">
<img src="{{ image.thumburl }}" class="w-100" />
</div> </div>
</div> <div class="col-md">
<div id="main">
<h1>{{ self.title() }}</h1> <h1>{{ self.title() }}</h1>
{% if label_languages %} {% if label_languages %}
<p>Label from: <p>Label from:
@ -203,6 +179,8 @@ span.description { color: rgb(96, 96, 96); }
</form> </form>
{% endif %} {% endif %}
</div> </div>
</div>
</div>
{% endblock %} {% endblock %}
{% block script %} {% block script %}

View file

@ -5,17 +5,19 @@
{% endmacro %} {% endmacro %}
{% macro navbar_inner(name) %} {% macro navbar_inner(name) %}
<div class="container-fluid"> <button class="navbar-toggler navbar-toggler-right" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
<a class="navbar-brand" href="{{ url_for('browse_page') }}">Wikidata Art Depiction Explorer</a>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span> <span class="navbar-toggler-icon"></span>
</button> </button>
<a class="navbar-brand" href="{{ url_for('browse_page') }}">Wikidata Art Depiction Explorer</a>
<div class="collapse navbar-collapse" id="navbarSupportedContent"> <div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav me-auto"> <ul class="navbar-nav mr-auto">
<ul class="navbar-nav mr-auto">
{{ nav_item('browse_page', 'Browse') }} {{ nav_item('browse_page', 'Browse') }}
{{ nav_item('list_edits', 'Recent changes') }} {{ nav_item('list_edits', 'Recent changes') }}
{{ nav_item('random_artwork', 'Random artwork') }} {{ nav_item('random_artwork', 'Random artwork') }}
</ul> </ul>
</ul>
<ul class="navbar-nav"> <ul class="navbar-nav">
<li class="nav-item"> <li class="nav-item">
{% if g.user %} {% if g.user %}
@ -33,11 +35,10 @@
{% endif %} {% endif %}
</ul> </ul>
</div> </div>
</div>
{% endmacro %} {% endmacro %}
{% macro navbar() %} {% macro navbar() %}
<nav class="navbar navbar-expand-lg bg-dark" data-bs-theme="dark"> <nav class="navbar navbar-toggleable-md navbar-expand-lg navbar-dark bg-dark">
{{ navbar_inner() }} {{ navbar_inner() }}
</nav> </nav>
{% endmacro %} {% endmacro %}