From b952213b23971d069bb9766de2eaa90bb4bea6b5 Mon Sep 17 00:00:00 2001
From: Edward Betts <edward@4angle.com>
Date: Wed, 18 Dec 2019 15:06:24 +0000
Subject: [PATCH] Switch to using database for browse pages.

---
 app.py                      | 210 ++++++++++++++++++++++++------------
 depicts/model.py            |  49 ++++++++-
 depicts/utils.py            |  23 ++++
 templates/browse_index.html |   2 +-
 templates/find_more.html    |  32 +++++-
 templates/property.html     |  22 ++--
 6 files changed, 253 insertions(+), 85 deletions(-)

diff --git a/app.py b/app.py
index 45781da..fff4ecc 100755
--- a/app.py
+++ b/app.py
@@ -5,13 +5,17 @@ from depicts import (utils, wdqs, commons, mediawiki, artwork, database,
                      wd_catalog, human, wikibase, wikidata_oauth, wikidata_edit)
 from depicts.pager import Pagination, init_pager
 from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, Item,
-                           Language, WikidataQuery)
+                           Language, WikidataQuery, Triple)
 from depicts.error_mail import setup_error_mail
 from requests_oauthlib import OAuth1Session
 from werkzeug.exceptions import InternalServerError
 from werkzeug.debug.tbtools import get_current_traceback
 from sqlalchemy import func, distinct
+from sqlalchemy.orm import aliased
+from sqlalchemy.sql.expression import desc
 from collections import defaultdict
+from datetime import datetime
+import itertools
 import hashlib
 import json
 import os
@@ -175,38 +179,38 @@ def property_query_page(property_id):
     sort = request.args.get('sort')
     sort_by_name = sort and sort.lower().strip() == 'name'
 
-    rows = wdqs.run_from_template_with_cache('query/property.sparql',
-                                             cache_name=pid,
-                                             pid=pid,
-                                             isa_list=isa_list)
+    # For each value of this property, count the distinct artworks that use it,
+    # most used first.
+    q = (database.session.query(Triple.object_id,
+                                func.count(func.distinct(Triple.subject_id)).label('c'))
+                         .filter_by(predicate_id=property_id)
+                         .join(Item, Item.item_id == Triple.subject_id)
+                         .filter_by(is_artwork=True)
+                         .group_by(Triple.object_id)
+                         .order_by(desc('c')))
 
-    no_label_qid = [row['object']['value'].rpartition('/')[2]
-                    for row in rows
-                    if 'objectLabel' not in row and '/' in row['object']['value']]
+    # Run the query once; iterating q a second time would execute it again.
+    rows = q.all()
+    labels = get_labels_db({f'Q{object_id}' for object_id, c in rows})
 
-    if no_label_qid:
-        extra_label = get_labels(no_label_qid, name=f'{pid}_extra_labels')
-        if extra_label:
-            for row in rows:
-                item = row['object']['value']
-                if 'objectLabel' in row or '/' not in item:
-                    continue
-                qid = item.rpartition('/')[2]
-                if extra_label.get(qid):
-                    row['objectLabel'] = {'value': extra_label[qid]}
-
-    if sort_by_name:
-        # put rows with no English label at the end
-        no_label = [row for row in rows if 'objectLabel' not in row]
-        has_label = sorted((row for row in rows if 'objectLabel' in row),
-                            key=lambda row: locale.strxfrm(row['objectLabel']['value']))
-        rows = has_label + no_label
+    hits = []
+    for object_id, count in rows:
+        qid = f'Q{object_id}'
+        hits.append({'qid': qid,
+                     'label': labels.get(qid) or '[item missing]',
+                     'count': count})
+
+    if sort_by_name:
+        # The sort=name option used to be handled here and the template is still
+        # told order='name': sort by label, items with no label at the end.
+        hits.sort(key=lambda hit: (not labels.get(hit['qid']),
+                                   locale.strxfrm(hit['label'])))
 
     return render_template('property.html',
                            label=g.title,
                            order=('name' if sort_by_name else 'count'),
                            pid=pid,
-                           rows=rows)
+                           hits=hits)
 
 @app.route('/')
 def start():
@@ -446,7 +440,9 @@ def get_labels(keys, name=None):
         if isinstance(from_cache, dict) and from_cache.get('keys') == keys:
             labels = from_cache['labels']
     if not labels:
-        for cur in utils.chunk(keys, 50):
+        print(len(keys))
+        for num, cur in enumerate(utils.chunk(keys, 50)):
+            print(f'{num * 50} / {len(keys)}')
             labels += mediawiki.get_entities(cur, props='labels')
 
         json.dump({'keys': keys, 'labels': labels},
@@ -454,6 +450,45 @@ def get_labels(keys, name=None):
 
     return {entity['id']: wikibase.get_entity_label(entity) for entity in labels}
 
+def get_labels_db(keys):
+    ''' Look up labels for a collection of QIDs, using the item table as a cache.
+
+    Items missing from the database are fetched with mediawiki.get_entities
+    and stored (with is_artwork=False) so the next lookup is local. Entities
+    returned with a 'redirects' key are skipped and get no entry in the result.
+
+    Returns a dict that maps QID to label. '''
+    keys = set(keys)
+    if not keys:
+        return {}
+
+    # one query for all the keys instead of one query per key
+    item_ids = [int(qid[1:]) for qid in keys]
+    labels = {f'Q{item.item_id}': item.label
+              for item in Item.query.filter(Item.item_id.in_(item_ids))}
+    missing = keys - labels.keys()
+
+    page_size = 50
+    for cur in utils.chunk(missing, page_size):
+        for entity in mediawiki.get_entities(cur):
+            if 'redirects' in entity:
+                continue
+
+            qid = entity['id']
+
+            modified = datetime.strptime(entity['modified'], "%Y-%m-%dT%H:%M:%SZ")
+            # FIXME: check if the item is an artwork and set is_artwork correctly
+            item = Item(item_id=qid[1:],
+                        entity=entity,
+                        lastrevid=entity['lastrevid'],
+                        modified=modified,
+                        is_artwork=False)
+            database.session.add(item)
+            labels[qid] = item.label
+        database.session.commit()
+
+    return labels
+
 def build_other_set(entity):
     other_items = set()
     for key in find_more_props.keys():
@@ -667,7 +697,7 @@ def catalog_page():
                            title=title)
 
 def get_image_detail_with_cache(items, cache_name, thumbwidth=None, refresh=False):
-    filenames = [cur['image_filename'] for cur in items]
+    filenames = [cur.image_filename() for cur in items]
 
     if thumbwidth is None:
         thumbwidth = app.config['THUMBWIDTH']
@@ -682,7 +712,22 @@ def get_image_detail_with_cache(items, cache_name, thumbwidth=None, refresh=Fals
     return detail
 
 def browse_index():
-    return render_template('browse_index.html', props=find_more_props)
+    ''' Show the browse index: each browsable property with the number of
+    distinct values it has across artworks. '''
+    q = (database.session.query(Triple.predicate_id,
+                                func.count(func.distinct(Triple.object_id)))
+                         .join(Item, Triple.subject_id == Item.item_id)
+                         .filter_by(is_artwork=True)
+                         .group_by(Triple.predicate_id))
+
+    # The template formats counts[pid] for every property in find_more_props,
+    # so properties with no triples need an explicit zero.
+    counts = dict.fromkeys(find_more_props, 0)
+    counts.update((f'P{predicate_id}', count) for predicate_id, count in q)
+
+    return render_template('browse_index.html',
+                           props=find_more_props,
+                           counts=counts)
 
 @app.route('/debug/show_user')
 def debug_show_user():
@@ -705,41 +745,70 @@ def browse_facets():
                    facets=facets,
                    prop_labels=find_more_props)
 
+def get_db_items(params):
+    ''' Build the Item query for the browse page from (pid, qid) filter pairs. '''
+    query = Item.query
+    for pid, qid in params:
+        # aliased=True joins a fresh alias of Triple, so each filter gets its own join
+        query = (query.join(Triple, Item.item_id == Triple.subject_id, aliased=True)
+                      .filter(Triple.predicate_id == pid[1:], Triple.object_id == qid[1:]))
+    return query
+
+def get_db_facets(params):
+    ''' Count the other property values shared by items matching params.
+
+    Returns {'P123': [{'qid', 'count', 'label'}, ...]}, at most facet_limit
+    values per property, most frequent first. '''
+    t = aliased(Triple)
+    q = database.session.query(t.predicate_id, func.count().label('count'), t.object_id)
+    facet_limit = 15
+
+    for pid, qid in params:
+        q = (q.join(Triple, t.subject_id == Triple.subject_id, aliased=True)
+              .filter(Triple.predicate_id == pid[1:],
+                      Triple.object_id == qid[1:],
+                      t.predicate_id != pid[1:],
+                      t.object_id != qid[1:]))
+
+    # groupby needs its input sorted by the grouping key (predicate_id)
+    results = sorted(tuple(row) for row in q.group_by(t.predicate_id, t.object_id))
+
+    facet_list = {}
+    for predicate_id, x in itertools.groupby(results, lambda row: row[0]):
+        hits = sorted(x, key=lambda row: row[1], reverse=True)
+        facet_list[f'P{predicate_id}'] = [{'count': count, 'qid': f'Q{value}'}
+                                          for _, count, value in hits[:facet_limit]]
+
+    labels = get_labels_db({v['qid'] for values in facet_list.values() for v in values})
+
+    for values in facet_list.values():
+        for v in values:
+            # get_labels_db has no entry for redirected items; don't fail the page
+            v['label'] = labels.get(v['qid']) or '[item missing]'
+
+    return facet_list
+
 @app.route('/browse')
 def browse_page():
+    page_size = 45
     params = get_artwork_params()
 
     if not params:
         return browse_index()
 
     flat = '_'.join(f'{pid}={qid}' for pid, qid in params)
-
-    item_labels = get_labels(qid for pid, qid in params)
-
-    g.title = ' / '.join(find_more_props[pid] + ': ' + item_labels[qid]
+    item_labels = get_labels_db(qid for pid, qid in params)
+    g.title = ' / '.join(find_more_props[pid] + ': ' + (item_labels.get(qid) or qid)
                          for pid, qid in params)
 
-    bindings = filter_artwork(params)
+    q_items = get_db_items(params)
+    facets = get_db_facets(params)
 
-    try:
-        facets = get_facets(params)
-    except wdqs.QueryError:
-        facets = {}
-
-
-    page_size = 45
-
-    item_map = wdqs.build_browse_item_map(bindings)
-
-    all_items = []
-    for item in item_map.values():
-        if len(item['image_filename']) != 1:
-            continue
-        item['image_filename'] = item['image_filename'][0]
-        all_items.append(item)
+    all_items = q_items.all()
 
     page = utils.get_int_arg('page') or 1
-    pager = Pagination(page, page_size, len(all_items))
+    total = q_items.count()
+    pager = Pagination(page, page_size, total)
 
     items = pager.slice(all_items)
 
@@ -747,29 +816,38 @@ def browse_page():
     detail = get_image_detail_with_cache(items, cache_name)
     cache_refreshed = False
 
+    # Collect the QIDs that need a label: the active filters plus each item's
+    # artist, P31 and P180 values.
+    linked_qids = {qid for pid, qid in params}
     for item in items:
-        item['url'] = url_for('item_page', item_id=item['item_id'])
-        image_filename = item['image_filename']
+        artist_qid = item.artist
+        if artist_qid:
+            linked_qids.add(artist_qid)
+        for prop in 'P31', 'P180':
+            linked_qids.update(item.linked_qids(prop))
+
+    linked_labels = get_labels_db(linked_qids)
+
+    for item in items:
+        image_filename = item.image_filename()
         if not cache_refreshed and image_filename not in detail:
             detail = get_image_detail_with_cache(items, cache_name, refresh=True)
             cache_refreshed = True
-        item['image'] = detail[image_filename]
+        item.image = detail[image_filename]
 
-    catalog_url = url_for('catalog_page', **dict(params))
-
-    return render_template('find_more.html',
-                           facets=facets,
-                           prop_labels=find_more_props,
+    # NOTE(review): this commit edits templates/find_more.html but renders
+    # new_find_more.html - confirm that template exists before merging.
+    return render_template('new_find_more.html',
+                           page=page,
                            label=g.title,
                            pager=pager,
-                           params=params,
-                           item_map=item_map,
-                           catalog_url=catalog_url,
-                           page=page,
+                           prop_labels=find_more_props,
                            labels=find_more_props,
-                           bindings=bindings,
-                           total=len(item_map),
-                           items=items)
+                           linked_labels=linked_labels,
+                           items=items,
+                           total=total,
+                           params=params,
+                           facets=facets)
 
 @app.route('/find_more.json')
 def find_more_json():
diff --git a/depicts/model.py b/depicts/model.py
index c1d5f97..b23c3c1 100644
--- a/depicts/model.py
+++ b/depicts/model.py
@@ -1,5 +1,6 @@
 from sqlalchemy.ext.declarative import declarative_base
 from .database import session, now_utc
+from . import wikibase, utils
 from sqlalchemy.schema import Column, ForeignKey
 from sqlalchemy.types import Integer, String, DateTime, Boolean
 from sqlalchemy.orm import column_property, relationship, synonym
@@ -49,13 +50,69 @@ class DepictsItemAltLabel(Base):
 class Item(Base):
     __tablename__ = 'item'
     item_id = Column(Integer, primary_key=True, autoincrement=False)
-    label = Column(String)
+    # label = Column(String)  # column removed 2019-12-18
     entity = Column(postgresql.JSON)
     lastrevid = Column(Integer, nullable=True, unique=True)
     modified = Column(DateTime, nullable=True)
     is_artwork = Column(Boolean, nullable=False, default=False)
     qid = column_property('Q' + cast(item_id, String))
 
+    def image_count(self):
+        ''' Number of P18 (image) claims in the entity. '''
+        p18 = self.entity['claims'].get('P18')
+        return len(p18) if p18 else 0
+
+    def image_filename(self):
+        ''' Value of the first P18 (image) claim, or None when there is no
+        P18 claim or the first claim has no value. '''
+        p18 = self.entity['claims'].get('P18')
+        if not p18:
+            return
+
+        try:
+            return p18[0]['mainsnak']['datavalue']['value']
+        except KeyError:
+            return
+
+    @property
+    def label(self):
+        ''' Label taken from the entity JSON by wikibase.get_entity_label. '''
+        return wikibase.get_entity_label(self.entity)
+
+    @property
+    def artist(self):
+        ''' The 'id' of the first P170 datavalue, or None if there isn't one. '''
+        v = wikibase.first_datavalue(self.entity, 'P170')
+        if not v:
+            return
+        return v['id']
+
+    @property
+    def depicts(self):
+        ''' Ids of the items linked by the P180 claims. '''
+        return self.linked_qids('P180')
+
+    @property
+    def instance_of(self):
+        ''' Ids of the items linked by the P31 claims. '''
+        return self.linked_qids('P31')
+
+    def linked_qids(self, prop):
+        ''' List the 'id' of each value in the claims for prop, skipping
+        claims whose mainsnak has no datavalue. '''
+        values = self.entity['claims'].get(prop) or []
+        return [v['mainsnak']['datavalue']['value']['id']
+                for v in values
+                if 'datavalue' in v['mainsnak']]
+
+
+    @property
+    def date(self):
+        ''' First P571 datavalue formatted by utils.format_time, or None. '''
+        v = wikibase.first_datavalue(self.entity, 'P571')
+        if v:
+            return utils.format_time(v['time'], v['precision'])
+
 class Triple(Base):
     __tablename__ = 'triple'
     subject_id = Column(Integer, primary_key=True)
diff --git a/depicts/utils.py b/depicts/utils.py
index 132386f..48c3ae7 100644
--- a/depicts/utils.py
+++ b/depicts/utils.py
@@ -1,5 +1,6 @@
 from flask import request
 from itertools import islice
+from datetime import datetime
 import urllib.parse
 import inflect
 
@@ -74,3 +75,25 @@ def wiki_url(title, site, ns=None):
 def get_int_arg(name):
     if name in request.args and request.args[name].isdigit():
         return int(request.args[name])
+
+def format_time(time_value, precision):
+    ''' Format a Wikidata time value for display, based on its precision.
+
+    Precision 9 gives the year, 8 a decade, 7 a century and 6 a millennium;
+    any other precision returns time_value unchanged. '''
+    # FIXME handle dates like '1965-04-00T00:00:00Z'
+    # FIXME handle BC dates properly, "120 B.C." instead of "-120"
+    # Read the signed year straight from the string: strptime can't parse a
+    # month or day of '00', and slicing off the sign made BC years positive.
+    year = int(time_value[:time_value.find('-', 1)])
+
+    if precision == 9:
+        return str(year)
+    if precision == 8:
+        return f'{year}s'
+    if precision == 7:
+        return f'{ordinal((year // 100) + 1)} century'
+    if precision == 6:
+        return f'{ordinal((year // 1000) + 1)} millennium'
+
+    return time_value
diff --git a/templates/browse_index.html b/templates/browse_index.html
index 98c87b7..019fb3d 100644
--- a/templates/browse_index.html
+++ b/templates/browse_index.html
@@ -19,7 +19,7 @@
   <ul>
   {% for pid, label in props.items() %}
     <li><a href="{{ url_for('property_query_page', property_id=pid[1:]) }}">{{ label }}</a>
-    ({{ pid }})
+      ({{ pid }}) &ndash; {{ '{:,d}'.format(counts[pid]) }}
     </li>
   {% endfor %}
   </ul>
diff --git a/templates/find_more.html b/templates/find_more.html
index fc5bcb4..5d23b89 100644
--- a/templates/find_more.html
+++ b/templates/find_more.html
@@ -20,6 +20,14 @@
   {% endfor %}
   #}
 
+  <p>
+    Current filter &ndash;
+    {% for pid, qid in params %}
+      <span>{{ prop_labels[pid] }}: {{ linked_labels[qid] }}
+        <a href="{{ set_url_args(page=None, **{pid: None}) }}">[remove filter]</a></span>
+    {% endfor %}
+  </p>
+
   <p>
     <a href="{{ url_for('browse_page') }}">browse index</a>
     {% for pid, qid in params %}
@@ -31,34 +39,48 @@
 
 <p class="mb-3">
   <a href="#" id="toggle-filters" class="btn btn-primary">toggle filters</a>
+  {#
   <a href="{{ catalog_url }}" class="btn btn-primary">catalog artwork</a>
+  #}
 </p>
 
   <div id="filters">
   {% for key, values in facets.items() %}
     <p>{{ prop_labels[key] }}:
       {% for v in values %}
-        <a href="{{ set_url_args(**{key: v.qid}) }}">{{ v.label }}</a> ({{ v.count }})
+        <a href="{{ set_url_args(page=None, **{key: v.qid}) }}">{{ v.label }}</a>
+        ({{ '{:,d}'.format(v.count) }})
         {% if not loop.last %}|{% endif %}
       {% endfor %}
     </p>
   {% endfor %}
   </div>
 
+{{ render_pagination(pager) }}
+
 <div class="card-columns">
   {% for item in items %}
   {% set image = item.image %}
   <div class="card">
-    <a href="{{ item.url }}">
+    <a href="{{ url_for('item_page', item_id=item.item_id) }}">
       {# <img src="{{ image.thumburl }}" height="{{ image.thumbheight }}" width="{{ image.thumbwidth }}" class="card-img-top"></a> #}
       <img src="{{ image.thumburl }}" class="card-img-top"></a>
     <div class="card-body">
       <h5 class="card-title">{{ item.label }}</h5>
-      <p class="card-text">by {{ item.artist_name }}
+      <p class="card-text">
+       <div>
+       {% for qid in item.instance_of %}
+         {% if not loop.first %} / {% endif %}
+         <span>{{ linked_labels[qid] }}</span>
+       {% endfor %}
+       </div>
+      {% if item.artist %}
+        by {{ linked_labels[item.artist] }}
+      {% endif %}
       {% if item.date %}({{ item.date }}){% endif %}
         <div>
-        {% for depicts_label in item.depicts %}
-          <span class="badge badge-primary">{{ depicts_label }}</span>
+        {% for depicts_qid in item.depicts %}
+          <span class="badge badge-primary">{{ linked_labels[depicts_qid] }}</span>
         {% endfor %}
         </div>
       </p>
diff --git a/templates/property.html b/templates/property.html
index 40ecbcc..19208d3 100644
--- a/templates/property.html
+++ b/templates/property.html
@@ -16,19 +16,17 @@
   {% endif %}
   </p>
 
-  <ul>
-  {% for row in rows if '/' in row.object.value %}
-    {% set qid = row.object.value.rpartition('/')[2] %}
-    {% set row_label = row.objectLabel.value if 'objectLabel' in row else '[ label missing ]' %}
-    <li>
-      <a href="{{ url_for('browse_page', **{pid: qid}) }}">{{ row_label }}</a>
-    {% if 'objectDescription' in row %}
-    &mdash; {{ row.objectDescription.value }}
-    {% endif %}
+  <p>Total: {{ hits | length }}</p>
 
-    ({{ '{:,d}'.format(row.count.value | int) }} artworks)
-    {% if 'objectLabel' not in row %}
-      <a href="https://wikidata.org/wiki/{{ qid }}">view in Wikidata</a>
+  <ul>
+  {% for hit in hits %}
+    <li>
+      <a href="{{ url_for('browse_page', **{pid: hit.qid}) }}">{{ hit.label }}</a>
+
+    ({{ '{:,d}'.format(hit.count) }} artworks)
+
+    {% if not hit.label %}
+      <a href="https://wikidata.org/wiki/{{ hit.qid }}">view in Wikidata</a>
     {% endif %}
     </li>
   {% endfor %}