Switch to using database for browse pages.

This commit is contained in:
Edward Betts 2019-12-18 15:06:24 +00:00
parent 7313df54f7
commit b952213b23
6 changed files with 253 additions and 85 deletions

210
app.py
View file

@ -5,13 +5,17 @@ from depicts import (utils, wdqs, commons, mediawiki, artwork, database,
wd_catalog, human, wikibase, wikidata_oauth, wikidata_edit)
from depicts.pager import Pagination, init_pager
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, Item,
Language, WikidataQuery)
Language, WikidataQuery, Triple)
from depicts.error_mail import setup_error_mail
from requests_oauthlib import OAuth1Session
from werkzeug.exceptions import InternalServerError
from werkzeug.debug.tbtools import get_current_traceback
from sqlalchemy import func, distinct
from sqlalchemy.orm import aliased
from sqlalchemy.sql.expression import desc
from collections import defaultdict
from datetime import datetime
import itertools
import hashlib
import json
import os
@ -175,38 +179,28 @@ def property_query_page(property_id):
sort = request.args.get('sort')
sort_by_name = sort and sort.lower().strip() == 'name'
rows = wdqs.run_from_template_with_cache('query/property.sparql',
cache_name=pid,
pid=pid,
isa_list=isa_list)
q = (database.session.query(Triple.object_id,
func.count(func.distinct(Triple.subject_id)).label('c'))
.filter_by(predicate_id=property_id)
.join(Item, Item.item_id == Triple.subject_id)
.filter_by(is_artwork=True)
.group_by(Triple.object_id)
.order_by(desc('c')))
no_label_qid = [row['object']['value'].rpartition('/')[2]
for row in rows
if 'objectLabel' not in row and '/' in row['object']['value']]
labels = get_labels_db({f'Q{object_id}' for object_id, c in q})
if no_label_qid:
extra_label = get_labels(no_label_qid, name=f'{pid}_extra_labels')
if extra_label:
for row in rows:
item = row['object']['value']
if 'objectLabel' in row or '/' not in item:
continue
qid = item.rpartition('/')[2]
if extra_label.get(qid):
row['objectLabel'] = {'value': extra_label[qid]}
if sort_by_name:
# put rows with no English label at the end
no_label = [row for row in rows if 'objectLabel' not in row]
has_label = sorted((row for row in rows if 'objectLabel' in row),
key=lambda row: locale.strxfrm(row['objectLabel']['value']))
rows = has_label + no_label
hits = []
for object_id, count in q:
qid = f'Q{object_id}'
hits.append({'qid': qid,
'label': labels.get(qid) or '[item missing]',
'count': count})
return render_template('property.html',
label=g.title,
order=('name' if sort_by_name else 'count'),
pid=pid,
rows=rows)
hits=hits)
@app.route('/')
def start():
@ -446,7 +440,9 @@ def get_labels(keys, name=None):
if isinstance(from_cache, dict) and from_cache.get('keys') == keys:
labels = from_cache['labels']
if not labels:
for cur in utils.chunk(keys, 50):
print(len(keys))
for num, cur in enumerate(utils.chunk(keys, 50)):
print(f'{num * 50} / {len(keys)}')
labels += mediawiki.get_entities(cur, props='labels')
json.dump({'keys': keys, 'labels': labels},
@ -454,6 +450,40 @@ def get_labels(keys, name=None):
return {entity['id']: wikibase.get_entity_label(entity) for entity in labels}
def get_labels_db(keys):
    """Return a ``{qid: label}`` dict for *keys*, preferring the local database.

    Every QID in *keys* is first looked up in the ``Item`` table by its
    numeric ID. QIDs with no stored item are fetched through
    ``mediawiki.get_entities`` in batches of 50 and saved as new ``Item``
    rows (with ``is_artwork=False``), so later calls find them locally.

    A QID whose fetched entity is a redirect, or is reported as missing, gets
    no entry in the result; callers should read it with ``labels.get(qid)``.
    """
    keys = set(keys)
    labels = {}
    missing = set()
    for qid in keys:
        item = Item.query.get(qid[1:])
        if item:
            labels[qid] = item.label
        else:
            missing.add(qid)

    page_size = 50
    for cur in utils.chunk(missing, page_size):
        for entity in mediawiki.get_entities(cur):
            # Redirects are skipped as before. Entities flagged 'missing' are
            # skipped too: the wbgetentities API returns them without the
            # 'modified'/'lastrevid' fields read below.
            # NOTE(review): assumes get_entities passes such entries through
            # unfiltered -- confirm against depicts.mediawiki.
            if 'redirects' in entity or 'missing' in entity:
                continue

            qid = entity['id']
            modified = datetime.strptime(entity['modified'], "%Y-%m-%dT%H:%M:%SZ")
            # FIXME: check if the item is an artwork and set is_artwork correctly
            item = Item(item_id=qid[1:],
                        entity=entity,
                        lastrevid=entity['lastrevid'],
                        modified=modified,
                        is_artwork=False)
            database.session.add(item)
            labels[qid] = item.label
        # Persist each batch as soon as it has been processed.
        database.session.commit()

    return labels
def build_other_set(entity):
other_items = set()
for key in find_more_props.keys():
@ -667,7 +697,7 @@ def catalog_page():
title=title)
def get_image_detail_with_cache(items, cache_name, thumbwidth=None, refresh=False):
filenames = [cur['image_filename'] for cur in items]
filenames = [cur.image_filename() for cur in items]
if thumbwidth is None:
thumbwidth = app.config['THUMBWIDTH']
@ -682,7 +712,17 @@ def get_image_detail_with_cache(items, cache_name, thumbwidth=None, refresh=Fals
return detail
def browse_index():
    """Render the browse index, showing a value count for every property.

    For each predicate the query counts the distinct objects used on items
    flagged as artworks. The template receives ``props`` (the
    ``find_more_props`` mapping) and ``counts`` keyed by ``'P<id>'``.
    """
    q = (database.session.query(Triple.predicate_id,
                                func.count(func.distinct(Triple.object_id)))
                 .join(Item, Triple.subject_id == Item.item_id)
                 .filter_by(is_artwork=True)
                 .group_by(Triple.predicate_id))

    # Start every browsable property at zero: the template formats
    # counts[pid] for each entry in props, including properties that have no
    # triples yet and so produce no row in the query.
    counts = dict.fromkeys(find_more_props, 0)
    counts.update({f'P{predicate_id}': count for predicate_id, count in q})

    return render_template('browse_index.html',
                           props=find_more_props,
                           counts=counts)
@app.route('/debug/show_user')
def debug_show_user():
@ -705,41 +745,70 @@ def browse_facets():
facets=facets,
prop_labels=find_more_props)
def get_db_items(params):
    """Build the item query for the browse page.

    *params* is an iterable of ``(pid, qid)`` pairs such as
    ``('P180', 'Q42')``. Each pair adds one aliased join on ``Triple`` that
    requires the item to be the subject of a triple with that predicate and
    object. The un-executed query is returned.
    """
    query = Item.query
    for prop, value in params:
        # Strip the leading 'P' / 'Q' to get the numeric IDs stored in Triple.
        predicate_id = prop[1:]
        object_id = value[1:]
        joined = query.join(Triple, Item.item_id == Triple.subject_id, aliased=True)
        query = joined.filter(Triple.predicate_id == predicate_id,
                              Triple.object_id == object_id)
    return query
def get_db_facets(params):
    """Return facet values that can further narrow the current browse filter.

    *params* is an iterable of ``(pid, qid)`` pairs describing the active
    filter. The result maps ``'P<id>'`` to a list of at most 15 dicts with
    the keys ``count``, ``qid`` and ``label``, ordered by descending count.
    A value whose label cannot be resolved falls back to its QID.
    """
    t = aliased(Triple)
    q = database.session.query(t.predicate_id, func.count().label('count'), t.object_id)
    facet_limit = 15

    for pid, qid in params:
        q = (q.join(Triple, t.subject_id == Triple.subject_id, aliased=True)
              .filter(Triple.predicate_id == pid[1:],
                      Triple.object_id == qid[1:],
                      t.predicate_id != pid[1:],
                      t.object_id != qid[1:]))

    q = q.group_by(t.predicate_id, t.object_id)

    # itertools.groupby only groups adjacent rows, so sort by predicate first.
    results = sorted(tuple(row) for row in q.all())

    facet_list = {}
    object_qids = set()
    for predicate_id, group in itertools.groupby(results, key=lambda row: row[0]):
        hits = sorted(group, key=lambda row: row[1], reverse=True)
        values = [{'count': count, 'qid': f'Q{value}'}
                  for _, count, value in hits[:facet_limit]]
        facet_list[f'P{predicate_id}'] = values
        object_qids.update(v['qid'] for v in values)

    labels = get_labels_db(object_qids)

    for values in facet_list.values():
        for v in values:
            # get_labels_db omits QIDs it cannot resolve (e.g. redirects), so
            # fall back to the QID instead of raising KeyError.
            v['label'] = labels.get(v['qid']) or v['qid']

    return facet_list
@app.route('/browse')
def browse_page():
page_size = 45
params = get_artwork_params()
if not params:
return browse_index()
flat = '_'.join(f'{pid}={qid}' for pid, qid in params)
item_labels = get_labels(qid for pid, qid in params)
g.title = ' / '.join(find_more_props[pid] + ': ' + item_labels[qid]
item_labels = get_labels_db(qid for pid, qid in params)
g.title = ' / '.join(find_more_props[pid] + ': ' + (item_labels.get(qid) or qid)
for pid, qid in params)
bindings = filter_artwork(params)
q_items = get_db_items(params)
facets = get_db_facets(params)
try:
facets = get_facets(params)
except wdqs.QueryError:
facets = {}
page_size = 45
item_map = wdqs.build_browse_item_map(bindings)
all_items = []
for item in item_map.values():
if len(item['image_filename']) != 1:
continue
item['image_filename'] = item['image_filename'][0]
all_items.append(item)
all_items = q_items.all()
page = utils.get_int_arg('page') or 1
pager = Pagination(page, page_size, len(all_items))
total = q_items.count()
pager = Pagination(page, page_size, total)
items = pager.slice(all_items)
@ -747,29 +816,38 @@ def browse_page():
detail = get_image_detail_with_cache(items, cache_name)
cache_refreshed = False
linked_qids = {qid for pid, qid in params}
for item in items:
item['url'] = url_for('item_page', item_id=item['item_id'])
image_filename = item['image_filename']
artist_qid = item.artist
if artist_qid:
linked_qids.add(artist_qid)
for prop in 'P31', 'P180':
linked_qids.update(item.linked_qids(prop))
linked_labels = get_labels_db(linked_qids)
for item in items:
image_filename = item.image_filename()
if not cache_refreshed and image_filename not in detail:
detail = get_image_detail_with_cache(items, cache_name, refresh=True)
cache_refreshed = True
item['image'] = detail[image_filename]
item.image = detail[image_filename]
catalog_url = url_for('catalog_page', **dict(params))
return render_template('find_more.html',
facets=facets,
prop_labels=find_more_props,
return render_template('new_find_more.html',
page=page,
label=g.title,
pager=pager,
params=params,
item_map=item_map,
catalog_url=catalog_url,
page=page,
prop_labels=find_more_props,
labels=find_more_props,
bindings=bindings,
total=len(item_map),
items=items)
linked_labels=linked_labels,
items=items,
total=total,
params=params,
facets=facets)
return jsonify(params=params,
items=items.count(),
facets=facets)
@app.route('/find_more.json')
def find_more_json():

View file

@ -1,5 +1,6 @@
from sqlalchemy.ext.declarative import declarative_base
from .database import session, now_utc
from . import wikibase, utils
from sqlalchemy.schema import Column, ForeignKey
from sqlalchemy.types import Integer, String, DateTime, Boolean
from sqlalchemy.orm import column_property, relationship, synonym
@ -49,13 +50,59 @@ class DepictsItemAltLabel(Base):
class Item(Base):
    """A Wikidata item stored together with its entity JSON.

    The label, artist, depicted items, instance-of values, image and
    inception date are not columns; they are read from ``entity`` on demand.
    """
    __tablename__ = 'item'
    item_id = Column(Integer, primary_key=True, autoincrement=False)
    # The ``label`` column was removed on 2019-12-18; ``label`` is now the
    # read-only property defined below, derived from ``entity``.
    entity = Column(postgresql.JSON)
    lastrevid = Column(Integer, nullable=True, unique=True)
    modified = Column(DateTime, nullable=True)
    is_artwork = Column(Boolean, nullable=False, default=False)
    qid = column_property('Q' + cast(item_id, String))

    def _claims(self, prop):
        """Return the list of claims for *prop*, or ``[]`` when there are none."""
        # Tolerate an entity stored without a 'claims' key at all.
        return (self.entity.get('claims') or {}).get(prop) or []

    def image_count(self):
        """Return the number of image (P18) claims on the item."""
        return len(self._claims('P18'))

    def image_filename(self):
        """Return the value of the first image (P18) claim, or None.

        None is returned when the item has no P18 claim or the first claim
        carries no data value.
        """
        p18 = self._claims('P18')
        if not p18:
            return None

        try:
            return p18[0]['mainsnak']['datavalue']['value']
        except KeyError:
            return None

    @property
    def label(self):
        """Label taken from the stored entity via ``wikibase.get_entity_label``."""
        return wikibase.get_entity_label(self.entity)

    @property
    def artist(self):
        """QID from the first creator (P170) data value, or None if absent."""
        v = wikibase.first_datavalue(self.entity, 'P170')
        if not v:
            return None
        return v['id']

    @property
    def depicts(self):
        """QIDs linked through depicts (P180) claims."""
        return self.linked_qids('P180')

    @property
    def instance_of(self):
        """QIDs linked through instance-of (P31) claims."""
        return self.linked_qids('P31')

    def linked_qids(self, prop):
        """Return the QIDs of all *prop* claims that carry a data value."""
        return [v['mainsnak']['datavalue']['value']['id']
                for v in self._claims(prop)
                if 'datavalue' in v['mainsnak']]

    @property
    def date(self):
        """Formatted inception (P571) date, or None when the item has none."""
        v = wikibase.first_datavalue(self.entity, 'P571')
        if v:
            return utils.format_time(v['time'], v['precision'])
        return None
class Triple(Base):
__tablename__ = 'triple'
subject_id = Column(Integer, primary_key=True)

View file

@ -1,5 +1,6 @@
from flask import request
from itertools import islice
from datetime import datetime
import urllib.parse
import inflect
@ -74,3 +75,25 @@ def wiki_url(title, site, ns=None):
def get_int_arg(name):
    """Return the query-string argument *name* as an int, or None.

    None is returned when the argument is absent or is not made up entirely
    of decimal digits.
    """
    value = request.args.get(name)
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() rejects with ValueError.
    if value is not None and value.isdecimal():
        return int(value)
    return None
def format_time(time_value, precision):
    """Format a Wikidata time string for display at the given precision.

    *time_value* is a signed timestamp such as ``'+1965-04-00T00:00:00Z'``.
    Precision 9 gives the year, 8 the decade (``'1960s'``), 7 the century and
    6 the millennium. Any other precision returns *time_value* unchanged.
    """
    # FIXME handle dates like '1965-04-00T00:00:00Z'
    # FIXME handle BC dates properly, "120 B.C." instead of "-120"

    # Read the year directly from the text instead of datetime.strptime.
    # strptime cannot parse a zero month/day or a year that is not four
    # digits, and it forced the sign to be stripped, turning BC years positive.
    is_negative = time_value.startswith('-')
    year = int(time_value.lstrip('+-').partition('-')[0])
    if is_negative:
        year = -year

    if precision == 9:
        return str(year)
    if precision == 8:
        # Round down to the start of the decade, so 1965 gives '1960s'.
        return f'{year // 10 * 10}s'
    if precision == 7:
        return f'{ordinal((year // 100) + 1)} century'
    if precision == 6:
        return f'{ordinal((year // 1000) + 1)} millennium'

    return time_value

View file

@ -19,7 +19,7 @@
<ul>
{% for pid, label in props.items() %}
<li><a href="{{ url_for('property_query_page', property_id=pid[1:]) }}">{{ label }}</a>
({{ pid }})
({{ pid }}) &ndash; {{ '{:,d}'.format(counts[pid]) }}
</li>
{% endfor %}
</ul>

View file

@ -20,6 +20,14 @@
{% endfor %}
#}
<p>
Current filter &ndash;
{% for pid, qid in params %}
<span>{{ prop_labels[pid] }}: {{ linked_labels[qid] }}
<a href="{{ set_url_args(page=None, **{pid: None}) }}">[remove filter]</a></span>
{% endfor %}
</p>
<p>
<a href="{{ url_for('browse_page') }}">browse index</a>
{% for pid, qid in params %}
@ -31,34 +39,48 @@
<p class="mb-3">
<a href="#" id="toggle-filters" class="btn btn-primary">toggle filters</a>
{#
<a href="{{ catalog_url }}" class="btn btn-primary">catalog artwork</a>
#}
</p>
<div id="filters">
{% for key, values in facets.items() %}
<p>{{ prop_labels[key] }}:
{% for v in values %}
<a href="{{ set_url_args(**{key: v.qid}) }}">{{ v.label }}</a> ({{ v.count }})
<a href="{{ set_url_args(page=None, **{key: v.qid}) }}">{{ v.label }}</a>
({{ '{:,d}'.format(v.count) }})
{% if not loop.last %}|{% endif %}
{% endfor %}
</p>
{% endfor %}
</div>
{{ render_pagination(pager) }}
<div class="card-columns">
{% for item in items %}
{% set image = item.image %}
<div class="card">
<a href="{{ item.url }}">
<a href="{{ url_for('item_page', item_id=item.item_id) }}">
{# <img src="{{ image.thumburl }}" height="{{ image.thumbheight }}" width="{{ image.thumbwidth }}" class="card-img-top"></a> #}
<img src="{{ image.thumburl }}" class="card-img-top"></a>
<div class="card-body">
<h5 class="card-title">{{ item.label }}</h5>
<p class="card-text">by {{ item.artist_name }}
<p class="card-text">
<div>
{% for qid in item.instance_of %}
{% if not loop.first %} / {% endif %}
<span>{{ linked_labels[qid] }}</span>
{% endfor %}
</div>
{% if item.artist %}
by {{ linked_labels[item.artist] }}
{% endif %}
{% if item.date %}({{ item.date }}){% endif %}
<div>
{% for depicts_label in item.depicts %}
<span class="badge badge-primary">{{ depicts_label }}</span>
{% for depicts_qid in item.depicts %}
<span class="badge badge-primary">{{ linked_labels[depicts_qid] }}</span>
{% endfor %}
</div>
</p>

View file

@ -16,19 +16,17 @@
{% endif %}
</p>
<ul>
{% for row in rows if '/' in row.object.value %}
{% set qid = row.object.value.rpartition('/')[2] %}
{% set row_label = row.objectLabel.value if 'objectLabel' in row else '[ label missing ]' %}
<li>
<a href="{{ url_for('browse_page', **{pid: qid}) }}">{{ row_label }}</a>
{% if 'objectDescription' in row %}
&mdash; {{ row.objectDescription.value }}
{% endif %}
<p>Total: {{ hits | length }}</p>
({{ '{:,d}'.format(row.count.value | int) }} artworks)
{% if 'objectLabel' not in row %}
<a href="https://wikidata.org/wiki/{{ qid }}">view in Wikidata</a>
<ul>
{% for hit in hits %}
<li>
<a href="{{ url_for('browse_page', **{pid: hit.qid}) }}">{{ hit.label }}</a>
({{ '{:,d}'.format(hit.count) }} artworks)
{% if not hit.label %}
<a href="https://wikidata.org/wiki/{{ hit.qid }}">view in Wikidata</a>
{% endif %}
</li>
{% endfor %}