def get_labels_db(keys):
    ''' Return a dict mapping each QID in keys to its label.

    Labels come from the local ``Item`` table when the item is already
    stored. Any QID that is not stored is fetched with
    ``mediawiki.get_entities`` in pages of 50, saved as a new ``Item`` row
    and committed page by page.

    :param keys: iterable of QID strings such as ``'Q42'``; duplicates are
        ignored.
    :returns: dict of QID to label. QIDs whose entity is a redirect or is
        unknown to the API are left out, so callers should use ``.get()``.
    '''
    labels = {}
    missing = set()
    for qid in set(keys):
        # The primary key is the numeric part of the QID.
        item = Item.query.get(qid[1:])
        if item:
            labels[qid] = item.label
        else:
            missing.add(qid)

    if not missing:
        # Everything was found locally, no API round trip needed.
        return labels

    page_size = 50
    for cur in utils.chunk(missing, page_size):
        for entity in mediawiki.get_entities(cur):
            # Redirects are not stored. Entities flagged as missing carry no
            # 'modified'/'lastrevid' to store, so they are skipped as well.
            # NOTE(review): assumes get_entities passes the API's 'missing'
            # marker through unchanged - confirm.
            if 'redirects' in entity or 'missing' in entity:
                continue

            qid = entity['id']
            modified = datetime.strptime(entity['modified'],
                                         "%Y-%m-%dT%H:%M:%SZ")
            # FIXME: check if the item is an artwork and set is_artwork correctly
            item = Item(item_id=qid[1:],
                        entity=entity,
                        lastrevid=entity['lastrevid'],
                        modified=modified,
                        is_artwork=False)
            database.session.add(item)
            labels[qid] = item.label
        # Commit once per page so fetched work is kept if a later page fails.
        database.session.commit()

    return labels
the end - no_label = [row for row in rows if 'objectLabel' not in row] - has_label = sorted((row for row in rows if 'objectLabel' in row), - key=lambda row: locale.strxfrm(row['objectLabel']['value'])) - rows = has_label + no_label + hits = [] + for object_id, count in q: + qid = f'Q{object_id}' + hits.append({'qid': qid, + 'label': labels.get(qid) or '[item missing]', + 'count': count}) return render_template('property.html', label=g.title, order=('name' if sort_by_name else 'count'), pid=pid, - rows=rows) + hits=hits) @app.route('/') def start(): @@ -446,7 +440,9 @@ def get_labels(keys, name=None): if isinstance(from_cache, dict) and from_cache.get('keys') == keys: labels = from_cache['labels'] if not labels: - for cur in utils.chunk(keys, 50): + print(len(keys)) + for num, cur in enumerate(utils.chunk(keys, 50)): + print(f'{num * 50} / {len(keys)}') labels += mediawiki.get_entities(cur, props='labels') json.dump({'keys': keys, 'labels': labels}, @@ -454,6 +450,40 @@ def get_labels(keys, name=None): return {entity['id']: wikibase.get_entity_label(entity) for entity in labels} +def get_labels_db(keys): + keys = set(keys) + labels = {} + missing = set() + for qid in keys: + item = Item.query.get(qid[1:]) + if item: + labels[qid] = item.label + else: + missing.add(qid) + + print(len(missing)) + page_size = 50 + for num, cur in enumerate(utils.chunk(missing, page_size)): + print(f'{num * page_size} / {len(missing)}') + for entity in mediawiki.get_entities(cur): + if 'redirects' in entity: + continue + + qid = entity['id'] + + modified = datetime.strptime(entity['modified'], "%Y-%m-%dT%H:%M:%SZ") + # FIXME: check if the item is an artwork and set is_artwork correctly + item = Item(item_id=qid[1:], + entity=entity, + lastrevid=entity['lastrevid'], + modified=modified, + is_artwork=False) + database.session.add(item) + labels[qid] = item.label + database.session.commit() + + return labels + def build_other_set(entity): other_items = set() for key in 
def browse_index():
    ''' Render the browse index page.

    Passes the template a ``counts`` dict keyed by property ID (``'P…'``)
    holding the number of distinct object values found on triples whose
    subject joins to an ``Item`` row.
    '''
    # NOTE(review): filter_by() is expected to apply to the joined Item
    # entity, restricting the count to artworks - confirm.
    q = (database.session.query(Triple.predicate_id,
                                func.count(func.distinct(Triple.object_id)))
                         .join(Item, Triple.subject_id == Item.item_id)
                         .filter_by(is_artwork=True)
                         .group_by(Triple.predicate_id))

    counts = {f'P{predicate_id}': count for predicate_id, count in q}

    return render_template('browse_index.html',
                           props=find_more_props,
                           counts=counts)


def get_db_items(params):
    ''' Get items for browse page based on criteria.

    :param params: iterable of ``(pid, qid)`` string pairs such as
        ``('P180', 'Q5')``.
    :returns: an ``Item`` query with one aliased ``Triple`` join per pair,
        so an item must match every pair. The query is not executed here.
    '''
    q = Item.query
    for pid, qid in params:
        # Strip the leading 'P'/'Q'; the triple table stores numeric IDs.
        q = (q.join(Triple, Item.item_id == Triple.subject_id, aliased=True)
              .filter(Triple.predicate_id == pid[1:],
                      Triple.object_id == qid[1:]))

    return q


def get_db_facets(params):
    ''' Build the facet lists shown alongside the browse results.

    :param params: iterable of ``(pid, qid)`` string pairs that are the
        currently active filters.
    :returns: dict keyed by property ID (``'P…'``). Each value is a list of
        at most 15 dicts with the keys ``count``, ``qid`` and ``label``,
        ordered by descending count.
    '''
    facet_limit = 15

    t = aliased(Triple)
    q = database.session.query(t.predicate_id,
                               func.count().label('count'),
                               t.object_id)

    for pid, qid in params:
        # One aliased join per active filter. The two != conditions drop
        # rows that reuse the filter's own property or value.
        q = (q.join(Triple, t.subject_id == Triple.subject_id, aliased=True)
              .filter(Triple.predicate_id == pid[1:],
                      Triple.object_id == qid[1:],
                      t.predicate_id != pid[1:],
                      t.object_id != qid[1:]))

    q = q.group_by(t.predicate_id, t.object_id)

    # groupby() only merges adjacent rows, so sort by predicate first.
    results = sorted(tuple(row) for row in q.all())

    facet_list = {}
    value_qids = set()
    for predicate_id, group in itertools.groupby(results, lambda row: row[0]):
        hits = sorted(group, key=lambda row: row[1], reverse=True)
        values = [{'count': count, 'qid': f'Q{value}'}
                  for _, count, value in hits[:facet_limit]]
        facet_list[f'P{predicate_id}'] = values
        value_qids.update(v['qid'] for v in values)

    labels = get_labels_db(value_qids)

    for values in facet_list.values():
        for v in values:
            # get_labels_db leaves out items it cannot resolve (for example
            # redirects), so fall back to a placeholder instead of raising.
            v['label'] = labels.get(v['qid']) or '[item missing]'

    return facet_list
item_map.values(): - if len(item['image_filename']) != 1: - continue - item['image_filename'] = item['image_filename'][0] - all_items.append(item) + all_items = q_items.all() page = utils.get_int_arg('page') or 1 - pager = Pagination(page, page_size, len(all_items)) + total = q_items.count() + pager = Pagination(page, page_size, total) items = pager.slice(all_items) @@ -747,29 +816,38 @@ def browse_page(): detail = get_image_detail_with_cache(items, cache_name) cache_refreshed = False + linked_qids = {qid for pid, qid in params} for item in items: - item['url'] = url_for('item_page', item_id=item['item_id']) - image_filename = item['image_filename'] + artist_qid = item.artist + if artist_qid: + linked_qids.add(artist_qid) + for prop in 'P31', 'P180': + linked_qids.update(item.linked_qids(prop)) + + linked_labels = get_labels_db(linked_qids) + + for item in items: + image_filename = item.image_filename() if not cache_refreshed and image_filename not in detail: detail = get_image_detail_with_cache(items, cache_name, refresh=True) cache_refreshed = True - item['image'] = detail[image_filename] + item.image = detail[image_filename] - catalog_url = url_for('catalog_page', **dict(params)) - - return render_template('find_more.html', - facets=facets, - prop_labels=find_more_props, + return render_template('new_find_more.html', + page=page, label=g.title, pager=pager, - params=params, - item_map=item_map, - catalog_url=catalog_url, - page=page, + prop_labels=find_more_props, labels=find_more_props, - bindings=bindings, - total=len(item_map), - items=items) + linked_labels=linked_labels, + items=items, + total=total, + params=params, + facets=facets) + + return jsonify(params=params, + items=items.count(), + facets=facets) @app.route('/find_more.json') def find_more_json(): diff --git a/depicts/model.py b/depicts/model.py index c1d5f97..b23c3c1 100644 --- a/depicts/model.py +++ b/depicts/model.py @@ -1,5 +1,6 @@ from sqlalchemy.ext.declarative import declarative_base from 
class Item(Base):
    ''' A Wikidata item stored locally together with its entity JSON.

    Label, artist, date and linked items are all derived from ``entity``
    on access rather than stored in their own columns.
    '''
    __tablename__ = 'item'
    item_id = Column(Integer, primary_key=True, autoincrement=False)
    # The `label` column was removed on 2019-12-18; `label` is now a
    # property computed from `entity` (see below in this class).
    entity = Column(postgresql.JSON)
    lastrevid = Column(Integer, nullable=True, unique=True)
    modified = Column(DateTime, nullable=True)
    is_artwork = Column(Boolean, nullable=False, default=False)
    qid = column_property('Q' + cast(item_id, String))

    def _claims(self, prop):
        ''' Return the list of claims for prop, or an empty list.

        Tolerates an entity that is unset or has no usable 'claims' value,
        so the accessors below never raise on such rows.
        '''
        entity = self.entity or {}
        claims = entity.get('claims') or {}
        return claims.get(prop) or []

    def image_count(self):
        ''' Return the number of P18 (image) claims on the item. '''
        return len(self._claims('P18'))

    def image_filename(self):
        ''' Return the value of the first P18 claim, or None.

        None is returned when there is no P18 claim or when the first
        claim has no datavalue.
        '''
        p18 = self._claims('P18')
        if not p18:
            return None

        try:
            return p18[0]['mainsnak']['datavalue']['value']
        except KeyError:
            return None

    @property
    def label(self):
        ''' Label taken from the entity by wikibase.get_entity_label. '''
        return wikibase.get_entity_label(self.entity)

    @property
    def artist(self):
        ''' QID from the first P170 datavalue, or None if there is none. '''
        v = wikibase.first_datavalue(self.entity, 'P170')
        if not v:
            return None
        return v['id']

    @property
    def depicts(self):
        ''' QIDs linked through P180. '''
        return self.linked_qids('P180')

    @property
    def instance_of(self):
        ''' QIDs linked through P31. '''
        return self.linked_qids('P31')

    def linked_qids(self, prop):
        ''' Return the QIDs that the claims for prop point at.

        Claims whose mainsnak has no datavalue are skipped.
        '''
        return [v['mainsnak']['datavalue']['value']['id']
                for v in self._claims(prop)
                if 'datavalue' in v['mainsnak']]

    @property
    def date(self):
        ''' Formatted first P571 value, or None when the item has none. '''
        v = wikibase.first_datavalue(self.entity, 'P571')
        if v:
            return utils.format_time(v['time'], v['precision'])
        return None
def format_time(time_value, precision):
    ''' Format a Wikibase time string for display.

    :param time_value: signed timestamp such as ``'+1965-04-12T00:00:00Z'``;
        month and day may be ``00``.
    :param precision: precision code; 9 is rendered as a year, 8 as a
        decade, 7 as a century and 6 as a millennium.
    :returns: the formatted string, or ``time_value`` unchanged for any
        other precision.
    :raises ValueError: if the year cannot be read for precisions 6 to 9.
    '''
    if precision not in (6, 7, 8, 9):
        # TODO: month and day precision (e.g. '1965-04-00T00:00:00Z') are
        # returned raw for now.
        return time_value

    # Read the year straight from the text, up to the first '-' after the
    # sign. This keeps the sign of BC years and copes with '-00' month/day
    # parts and years longer than four digits, none of which
    # datetime.strptime can represent.
    year = int(time_value[:time_value.find('-', 1)])

    # FIXME handle BC dates properly, "120 B.C." instead of "-120"
    if precision == 9:
        return str(year)
    if precision == 8:
        # Round down to the start of the decade: 1965 -> '1960s'.
        decade = abs(year) // 10 * 10
        return f'{-decade if year < 0 else decade}s'
    if precision == 7:
        # NOTE(review): a year ending in 00 (e.g. 1900) lands in the next
        # century with this formula - confirm the intended convention.
        return f'{ordinal((year // 100) + 1)} century'
    return f'{ordinal((year // 1000) + 1)} millennium'

+ Current filter – + {% for pid, qid in params %} + {{ prop_labels[pid] }}: {{ linked_labels[qid] }} + [remove filter] + {% endfor %} +

+

browse index {% for pid, qid in params %} @@ -31,34 +39,48 @@

toggle filters + {# catalog artwork + #}

{% for key, values in facets.items() %}

{{ prop_labels[key] }}: {% for v in values %} - {{ v.label }} ({{ v.count }}) + {{ v.label }} + ({{ '{:,d}'.format(v.count) }}) {% if not loop.last %}|{% endif %} {% endfor %}

{% endfor %}
+{{ render_pagination(pager) }} +
{% for item in items %} {% set image = item.image %}
- + {# #}
{{ item.label }}
-

by {{ item.artist_name }} +

+

+ {% for qid in item.instance_of %} + {% if not loop.first %} / {% endif %} + {{ linked_labels[qid] }} + {% endfor %} +
+ {% if item.artist %} + by {{ linked_labels[item.artist] }} + {% endif %} {% if item.date %}({{ item.date }}){% endif %}
- {% for depicts_label in item.depicts %} - {{ depicts_label }} + {% for depicts_qid in item.depicts %} + {{ linked_labels[depicts_qid] }} {% endfor %}

diff --git a/templates/property.html b/templates/property.html index 40ecbcc..19208d3 100644 --- a/templates/property.html +++ b/templates/property.html @@ -16,19 +16,17 @@ {% endif %}

-
    - {% for row in rows if '/' in row.object.value %} - {% set qid = row.object.value.rpartition('/')[2] %} - {% set row_label = row.objectLabel.value if 'objectLabel' in row else '[ label missing ]' %} -
  • - {{ row_label }} - {% if 'objectDescription' in row %} - — {{ row.objectDescription.value }} - {% endif %} +

    Total: {{ hits | length }}

    - ({{ '{:,d}'.format(row.count.value | int) }} artworks) - {% if 'objectLabel' not in row %} - view in Wikidata +
      + {% for hit in hits %} +
    • + {{ hit.label }} + + ({{ '{:,d}'.format(hit.count) }} artworks) + + {% if not hit.label %} + view in Wikidata {% endif %}
    • {% endfor %}