Switch to using database for browse pages.

This commit is contained in:
Edward Betts 2019-12-18 15:06:24 +00:00
parent 7313df54f7
commit b952213b23
6 changed files with 253 additions and 85 deletions

210
app.py
View file

@ -5,13 +5,17 @@ from depicts import (utils, wdqs, commons, mediawiki, artwork, database,
wd_catalog, human, wikibase, wikidata_oauth, wikidata_edit) wd_catalog, human, wikibase, wikidata_oauth, wikidata_edit)
from depicts.pager import Pagination, init_pager from depicts.pager import Pagination, init_pager
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, Item, from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, Item,
Language, WikidataQuery) Language, WikidataQuery, Triple)
from depicts.error_mail import setup_error_mail from depicts.error_mail import setup_error_mail
from requests_oauthlib import OAuth1Session from requests_oauthlib import OAuth1Session
from werkzeug.exceptions import InternalServerError from werkzeug.exceptions import InternalServerError
from werkzeug.debug.tbtools import get_current_traceback from werkzeug.debug.tbtools import get_current_traceback
from sqlalchemy import func, distinct from sqlalchemy import func, distinct
from sqlalchemy.orm import aliased
from sqlalchemy.sql.expression import desc
from collections import defaultdict from collections import defaultdict
from datetime import datetime
import itertools
import hashlib import hashlib
import json import json
import os import os
@ -175,38 +179,28 @@ def property_query_page(property_id):
sort = request.args.get('sort') sort = request.args.get('sort')
sort_by_name = sort and sort.lower().strip() == 'name' sort_by_name = sort and sort.lower().strip() == 'name'
rows = wdqs.run_from_template_with_cache('query/property.sparql', q = (database.session.query(Triple.object_id,
cache_name=pid, func.count(func.distinct(Triple.subject_id)).label('c'))
pid=pid, .filter_by(predicate_id=property_id)
isa_list=isa_list) .join(Item, Item.item_id == Triple.subject_id)
.filter_by(is_artwork=True)
.group_by(Triple.object_id)
.order_by(desc('c')))
no_label_qid = [row['object']['value'].rpartition('/')[2] labels = get_labels_db({f'Q{object_id}' for object_id, c in q})
for row in rows
if 'objectLabel' not in row and '/' in row['object']['value']]
if no_label_qid: hits = []
extra_label = get_labels(no_label_qid, name=f'{pid}_extra_labels') for object_id, count in q:
if extra_label: qid = f'Q{object_id}'
for row in rows: hits.append({'qid': qid,
item = row['object']['value'] 'label': labels.get(qid) or '[item missing]',
if 'objectLabel' in row or '/' not in item: 'count': count})
continue
qid = item.rpartition('/')[2]
if extra_label.get(qid):
row['objectLabel'] = {'value': extra_label[qid]}
if sort_by_name:
# put rows with no English label at the end
no_label = [row for row in rows if 'objectLabel' not in row]
has_label = sorted((row for row in rows if 'objectLabel' in row),
key=lambda row: locale.strxfrm(row['objectLabel']['value']))
rows = has_label + no_label
return render_template('property.html', return render_template('property.html',
label=g.title, label=g.title,
order=('name' if sort_by_name else 'count'), order=('name' if sort_by_name else 'count'),
pid=pid, pid=pid,
rows=rows) hits=hits)
@app.route('/') @app.route('/')
def start(): def start():
@ -446,7 +440,9 @@ def get_labels(keys, name=None):
if isinstance(from_cache, dict) and from_cache.get('keys') == keys: if isinstance(from_cache, dict) and from_cache.get('keys') == keys:
labels = from_cache['labels'] labels = from_cache['labels']
if not labels: if not labels:
for cur in utils.chunk(keys, 50): print(len(keys))
for num, cur in enumerate(utils.chunk(keys, 50)):
print(f'{num * 50} / {len(keys)}')
labels += mediawiki.get_entities(cur, props='labels') labels += mediawiki.get_entities(cur, props='labels')
json.dump({'keys': keys, 'labels': labels}, json.dump({'keys': keys, 'labels': labels},
@ -454,6 +450,40 @@ def get_labels(keys, name=None):
return {entity['id']: wikibase.get_entity_label(entity) for entity in labels} return {entity['id']: wikibase.get_entity_label(entity) for entity in labels}
def get_labels_db(keys):
    """Return a ``{qid: label}`` dict for *keys*, using the item table first.

    Items already stored in the database are looked up in batches. QIDs that
    are not stored are fetched with ``mediawiki.get_entities``, saved as new
    ``Item`` rows (with ``is_artwork=False``) and their labels included in the
    result. QIDs that come back as redirects are left out of the result, so
    callers should use ``.get()`` rather than indexing.

    :param keys: iterable of QID strings such as ``'Q42'``.
    :returns: dict mapping each resolved QID to ``Item.label``.
    """
    keys = set(keys)

    # Look up stored items with a few IN queries instead of one query per key.
    wanted_ids = {int(qid[1:]) for qid in keys}
    lookup_size = 500
    found = {}
    for batch in utils.chunk(wanted_ids, lookup_size):
        for item in Item.query.filter(Item.item_id.in_(batch)):
            found[item.item_id] = item.label

    labels = {}
    missing = set()
    for qid in keys:
        item_id = int(qid[1:])
        if item_id in found:
            labels[qid] = found[item_id]
        else:
            missing.add(qid)

    page_size = 50
    for cur in utils.chunk(missing, page_size):
        for entity in mediawiki.get_entities(cur):
            # NOTE(review): presumably get_entities can pass through entities
            # the API flags as missing; they have no 'modified'/'lastrevid'
            # to store, so skip them like redirects.
            if 'redirects' in entity or 'missing' in entity:
                continue
            qid = entity['id']

            modified = datetime.strptime(entity['modified'], "%Y-%m-%dT%H:%M:%SZ")
            # FIXME: check if the item is an artwork and set is_artwork correctly
            item = Item(item_id=qid[1:],
                        entity=entity,
                        lastrevid=entity['lastrevid'],
                        modified=modified,
                        is_artwork=False)
            database.session.add(item)
            labels[qid] = item.label
        # Commit after each page so fetched items are kept if a later page fails.
        database.session.commit()

    return labels
def build_other_set(entity): def build_other_set(entity):
other_items = set() other_items = set()
for key in find_more_props.keys(): for key in find_more_props.keys():
@ -667,7 +697,7 @@ def catalog_page():
title=title) title=title)
def get_image_detail_with_cache(items, cache_name, thumbwidth=None, refresh=False): def get_image_detail_with_cache(items, cache_name, thumbwidth=None, refresh=False):
filenames = [cur['image_filename'] for cur in items] filenames = [cur.image_filename() for cur in items]
if thumbwidth is None: if thumbwidth is None:
thumbwidth = app.config['THUMBWIDTH'] thumbwidth = app.config['THUMBWIDTH']
@ -682,7 +712,17 @@ def get_image_detail_with_cache(items, cache_name, thumbwidth=None, refresh=Fals
return detail return detail
def browse_index():
    """Render the browse index with a count of distinct values per property.

    The count for each predicate is the number of distinct objects among
    triples whose subject is an item flagged ``is_artwork``.
    """
    q = (database.session.query(Triple.predicate_id,
                                func.count(func.distinct(Triple.object_id)))
         .join(Item, Triple.subject_id == Item.item_id)
         .filter_by(is_artwork=True)
         .group_by(Triple.predicate_id))

    # Seed every listed property with zero: the template formats counts[pid]
    # with '{:,d}' for each entry of props, which fails for a property that
    # has no row in the query result.
    counts = {pid: 0 for pid in find_more_props}
    counts.update((f'P{predicate_id}', count) for predicate_id, count in q)

    return render_template('browse_index.html',
                           props=find_more_props,
                           counts=counts)
@app.route('/debug/show_user') @app.route('/debug/show_user')
def debug_show_user(): def debug_show_user():
@ -705,41 +745,70 @@ def browse_facets():
facets=facets, facets=facets,
prop_labels=find_more_props) prop_labels=find_more_props)
def get_db_items(params):
    """Build the query selecting items for a browse page.

    :param params: iterable of ``(pid, qid)`` string pairs, e.g. ``('P180', 'Q5')``.
    :returns: an ``Item`` query narrowed by one join on ``Triple`` per pair.
    """
    query = Item.query
    for prop_id, value_id in params:
        # Each criterion gets its own join on Triple; the filter that follows
        # is applied to that join.
        joined = query.join(Triple,
                            Item.item_id == Triple.subject_id,
                            aliased=True)
        query = joined.filter(Triple.predicate_id == prop_id[1:],
                              Triple.object_id == value_id[1:])
    return query
def get_db_facets(params):
    """Return facet values, grouped by property, for the current browse filter.

    :param params: iterable of ``(pid, qid)`` string pairs describing the
        active filter.
    :returns: dict mapping ``'P<id>'`` to a list of at most ``facet_limit``
        dicts with keys ``count``, ``qid`` and ``label``, highest count first.
    """
    t = aliased(Triple)
    q = database.session.query(t.predicate_id, func.count().label('count'), t.object_id)
    facet_limit = 15

    for pid, qid in params:
        q = (q.join(Triple, t.subject_id == Triple.subject_id, aliased=True)
             .filter(Triple.predicate_id == pid[1:],
                     Triple.object_id == qid[1:],
                     t.predicate_id != pid[1:],
                     t.object_id != qid[1:]))

    q = q.group_by(t.predicate_id, t.object_id)

    # groupby needs its input ordered by the grouping key (predicate_id).
    results = sorted(tuple(row) for row in q.all())

    facet_list = {}
    subject_qids = set()
    for predicate_id, rows in itertools.groupby(results, lambda row: row[0]):
        hits = sorted(rows, key=lambda row: row[1], reverse=True)
        values = [{'count': count, 'qid': f'Q{value}'}
                  for _, count, value in hits[:facet_limit]]
        facet_list[f'P{predicate_id}'] = values
        subject_qids.update(i['qid'] for i in values)

    labels = get_labels_db(subject_qids)

    for values in facet_list.values():
        for v in values:
            # get_labels_db omits redirected items, so fall back to the QID
            # instead of raising KeyError.
            v['label'] = labels.get(v['qid']) or v['qid']

    return facet_list
@app.route('/browse') @app.route('/browse')
def browse_page(): def browse_page():
page_size = 45
params = get_artwork_params() params = get_artwork_params()
if not params: if not params:
return browse_index() return browse_index()
flat = '_'.join(f'{pid}={qid}' for pid, qid in params) flat = '_'.join(f'{pid}={qid}' for pid, qid in params)
item_labels = get_labels_db(qid for pid, qid in params)
item_labels = get_labels(qid for pid, qid in params) g.title = ' / '.join(find_more_props[pid] + ': ' + (item_labels.get(qid) or qid)
g.title = ' / '.join(find_more_props[pid] + ': ' + item_labels[qid]
for pid, qid in params) for pid, qid in params)
bindings = filter_artwork(params) q_items = get_db_items(params)
facets = get_db_facets(params)
try: all_items = q_items.all()
facets = get_facets(params)
except wdqs.QueryError:
facets = {}
page_size = 45
item_map = wdqs.build_browse_item_map(bindings)
all_items = []
for item in item_map.values():
if len(item['image_filename']) != 1:
continue
item['image_filename'] = item['image_filename'][0]
all_items.append(item)
page = utils.get_int_arg('page') or 1 page = utils.get_int_arg('page') or 1
pager = Pagination(page, page_size, len(all_items)) total = q_items.count()
pager = Pagination(page, page_size, total)
items = pager.slice(all_items) items = pager.slice(all_items)
@ -747,29 +816,38 @@ def browse_page():
detail = get_image_detail_with_cache(items, cache_name) detail = get_image_detail_with_cache(items, cache_name)
cache_refreshed = False cache_refreshed = False
linked_qids = {qid for pid, qid in params}
for item in items: for item in items:
item['url'] = url_for('item_page', item_id=item['item_id']) artist_qid = item.artist
image_filename = item['image_filename'] if artist_qid:
linked_qids.add(artist_qid)
for prop in 'P31', 'P180':
linked_qids.update(item.linked_qids(prop))
linked_labels = get_labels_db(linked_qids)
for item in items:
image_filename = item.image_filename()
if not cache_refreshed and image_filename not in detail: if not cache_refreshed and image_filename not in detail:
detail = get_image_detail_with_cache(items, cache_name, refresh=True) detail = get_image_detail_with_cache(items, cache_name, refresh=True)
cache_refreshed = True cache_refreshed = True
item['image'] = detail[image_filename] item.image = detail[image_filename]
catalog_url = url_for('catalog_page', **dict(params)) return render_template('new_find_more.html',
page=page,
return render_template('find_more.html',
facets=facets,
prop_labels=find_more_props,
label=g.title, label=g.title,
pager=pager, pager=pager,
params=params, prop_labels=find_more_props,
item_map=item_map,
catalog_url=catalog_url,
page=page,
labels=find_more_props, labels=find_more_props,
bindings=bindings, linked_labels=linked_labels,
total=len(item_map), items=items,
items=items) total=total,
params=params,
facets=facets)
return jsonify(params=params,
items=items.count(),
facets=facets)
@app.route('/find_more.json') @app.route('/find_more.json')
def find_more_json(): def find_more_json():

View file

@ -1,5 +1,6 @@
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from .database import session, now_utc from .database import session, now_utc
from . import wikibase, utils
from sqlalchemy.schema import Column, ForeignKey from sqlalchemy.schema import Column, ForeignKey
from sqlalchemy.types import Integer, String, DateTime, Boolean from sqlalchemy.types import Integer, String, DateTime, Boolean
from sqlalchemy.orm import column_property, relationship, synonym from sqlalchemy.orm import column_property, relationship, synonym
@ -49,13 +50,59 @@ class DepictsItemAltLabel(Base):
class Item(Base): class Item(Base):
__tablename__ = 'item' __tablename__ = 'item'
item_id = Column(Integer, primary_key=True, autoincrement=False) item_id = Column(Integer, primary_key=True, autoincrement=False)
label = Column(String) # label = Column(String) # column removed 2019-12-18
entity = Column(postgresql.JSON) entity = Column(postgresql.JSON)
lastrevid = Column(Integer, nullable=True, unique=True) lastrevid = Column(Integer, nullable=True, unique=True)
modified = Column(DateTime, nullable=True) modified = Column(DateTime, nullable=True)
is_artwork = Column(Boolean, nullable=False, default=False) is_artwork = Column(Boolean, nullable=False, default=False)
qid = column_property('Q' + cast(item_id, String)) qid = column_property('Q' + cast(item_id, String))
def image_count(self):
    """Return the number of P18 claims on the stored entity, 0 if none."""
    images = self.entity['claims'].get('P18')
    if not images:
        return 0
    return len(images)
def image_filename(self):
    """Return the value of the first P18 claim, or None.

    None is returned when the entity has no P18 claims or when the first
    claim lacks one of the nested 'mainsnak' / 'datavalue' / 'value' keys.
    """
    images = self.entity['claims'].get('P18')
    if images:
        try:
            return images[0]['mainsnak']['datavalue']['value']
        except KeyError:
            pass
    return None
@property
def label(self):
    """Label computed from the stored entity by wikibase.get_entity_label."""
    entity = self.entity
    return wikibase.get_entity_label(entity)
@property
def artist(self):
    """The 'id' of the first P170 datavalue on the entity, or None if absent."""
    creator = wikibase.first_datavalue(self.entity, 'P170')
    return creator['id'] if creator else None
@property
def depicts(self):
    """QIDs linked from this item through property P180."""
    prop = 'P180'
    return self.linked_qids(prop)
@property
def instance_of(self):
    """QIDs linked from this item through property P31."""
    prop = 'P31'
    return self.linked_qids(prop)
def linked_qids(self, prop):
    """Return the item ids referenced by the entity's claims for *prop*.

    Claims whose mainsnak has no 'datavalue' are skipped. An entity with no
    claims for *prop* gives an empty list.
    """
    claims = self.entity['claims'].get(prop) or []
    qids = []
    for claim in claims:
        snak = claim['mainsnak']
        if 'datavalue' in snak:
            qids.append(snak['datavalue']['value']['id'])
    return qids
@property
def date(self):
    """First P571 value rendered by utils.format_time, or None if absent."""
    inception = wikibase.first_datavalue(self.entity, 'P571')
    if not inception:
        return None
    return utils.format_time(inception['time'], inception['precision'])
class Triple(Base): class Triple(Base):
__tablename__ = 'triple' __tablename__ = 'triple'
subject_id = Column(Integer, primary_key=True) subject_id = Column(Integer, primary_key=True)

View file

@ -1,5 +1,6 @@
from flask import request from flask import request
from itertools import islice from itertools import islice
from datetime import datetime
import urllib.parse import urllib.parse
import inflect import inflect
@ -74,3 +75,25 @@ def wiki_url(title, site, ns=None):
def get_int_arg(name): def get_int_arg(name):
if name in request.args and request.args[name].isdigit(): if name in request.args and request.args[name].isdigit():
return int(request.args[name]) return int(request.args[name])
def format_time(time_value, precision):
    """Format a signed timestamp string for display at the given precision.

    :param time_value: string such as ``'+1965-04-12T00:00:00Z'``, a sign
        followed by the year and then ``-MM-DD...``.
    :param precision: integer code; 9 gives the year, 8 ``'<year>s'``,
        7 a century and 6 a millennium. Any other code returns *time_value*
        unchanged.
    :raises ValueError: if the leading year part is not an integer.
    """
    # FIXME handle dates like '1965-04-00T00:00:00Z'
    # FIXME handle BC dates properly, "120 B.C." instead of "-120"

    # Read the signed year straight from the text, up to the first '-' after
    # the sign. Parsing with strptime after stripping the sign turned BC years
    # positive and rejected zero months/days and years longer than four digits.
    year = int(time_value[:time_value.find('-', 1)])

    if precision == 9:
        return str(year)
    if precision == 8:
        return f'{year}s'
    if precision == 7:
        return f'{ordinal((year // 100) + 1)} century'
    if precision == 6:
        return f'{ordinal((year // 1000) + 1)} millennium'

    return time_value

View file

@ -19,7 +19,7 @@
<ul> <ul>
{% for pid, label in props.items() %} {% for pid, label in props.items() %}
<li><a href="{{ url_for('property_query_page', property_id=pid[1:]) }}">{{ label }}</a> <li><a href="{{ url_for('property_query_page', property_id=pid[1:]) }}">{{ label }}</a>
({{ pid }}) ({{ pid }}) &ndash; {{ '{:,d}'.format(counts[pid]) }}
</li> </li>
{% endfor %} {% endfor %}
</ul> </ul>

View file

@ -20,6 +20,14 @@
{% endfor %} {% endfor %}
#} #}
<p>
Current filter &ndash;
{% for pid, qid in params %}
<span>{{ prop_labels[pid] }}: {{ linked_labels[qid] }}
<a href="{{ set_url_args(page=None, **{pid: None}) }}">[remove filter]</a></span>
{% endfor %}
</p>
<p> <p>
<a href="{{ url_for('browse_page') }}">browse index</a> <a href="{{ url_for('browse_page') }}">browse index</a>
{% for pid, qid in params %} {% for pid, qid in params %}
@ -31,34 +39,48 @@
<p class="mb-3"> <p class="mb-3">
<a href="#" id="toggle-filters" class="btn btn-primary">toggle filters</a> <a href="#" id="toggle-filters" class="btn btn-primary">toggle filters</a>
{#
<a href="{{ catalog_url }}" class="btn btn-primary">catalog artwork</a> <a href="{{ catalog_url }}" class="btn btn-primary">catalog artwork</a>
#}
</p> </p>
<div id="filters"> <div id="filters">
{% for key, values in facets.items() %} {% for key, values in facets.items() %}
<p>{{ prop_labels[key] }}: <p>{{ prop_labels[key] }}:
{% for v in values %} {% for v in values %}
<a href="{{ set_url_args(**{key: v.qid}) }}">{{ v.label }}</a> ({{ v.count }}) <a href="{{ set_url_args(page=None, **{key: v.qid}) }}">{{ v.label }}</a>
({{ '{:,d}'.format(v.count) }})
{% if not loop.last %}|{% endif %} {% if not loop.last %}|{% endif %}
{% endfor %} {% endfor %}
</p> </p>
{% endfor %} {% endfor %}
</div> </div>
{{ render_pagination(pager) }}
<div class="card-columns"> <div class="card-columns">
{% for item in items %} {% for item in items %}
{% set image = item.image %} {% set image = item.image %}
<div class="card"> <div class="card">
<a href="{{ item.url }}"> <a href="{{ url_for('item_page', item_id=item.item_id) }}">
{# <img src="{{ image.thumburl }}" height="{{ image.thumbheight }}" width="{{ image.thumbwidth }}" class="card-img-top"></a> #} {# <img src="{{ image.thumburl }}" height="{{ image.thumbheight }}" width="{{ image.thumbwidth }}" class="card-img-top"></a> #}
<img src="{{ image.thumburl }}" class="card-img-top"></a> <img src="{{ image.thumburl }}" class="card-img-top"></a>
<div class="card-body"> <div class="card-body">
<h5 class="card-title">{{ item.label }}</h5> <h5 class="card-title">{{ item.label }}</h5>
<p class="card-text">by {{ item.artist_name }} <p class="card-text">
<div>
{% for qid in item.instance_of %}
{% if not loop.first %} / {% endif %}
<span>{{ linked_labels[qid] }}</span>
{% endfor %}
</div>
{% if item.artist %}
by {{ linked_labels[item.artist] }}
{% endif %}
{% if item.date %}({{ item.date }}){% endif %} {% if item.date %}({{ item.date }}){% endif %}
<div> <div>
{% for depicts_label in item.depicts %} {% for depicts_qid in item.depicts %}
<span class="badge badge-primary">{{ depicts_label }}</span> <span class="badge badge-primary">{{ linked_labels[depicts_qid] }}</span>
{% endfor %} {% endfor %}
</div> </div>
</p> </p>

View file

@ -16,19 +16,17 @@
{% endif %} {% endif %}
</p> </p>
<ul> <p>Total: {{ hits | length }}</p>
{% for row in rows if '/' in row.object.value %}
{% set qid = row.object.value.rpartition('/')[2] %}
{% set row_label = row.objectLabel.value if 'objectLabel' in row else '[ label missing ]' %}
<li>
<a href="{{ url_for('browse_page', **{pid: qid}) }}">{{ row_label }}</a>
{% if 'objectDescription' in row %}
&mdash; {{ row.objectDescription.value }}
{% endif %}
({{ '{:,d}'.format(row.count.value | int) }} artworks) <ul>
{% if 'objectLabel' not in row %} {% for hit in hits %}
<a href="https://wikidata.org/wiki/{{ qid }}">view in Wikidata</a> <li>
<a href="{{ url_for('browse_page', **{pid: hit.qid}) }}">{{ hit.label }}</a>
({{ '{:,d}'.format(hit.count) }} artworks)
{% if not hit.label %}
<a href="https://wikidata.org/wiki/{{ hit.qid }}">view in Wikidata</a>
{% endif %} {% endif %}
</li> </li>
{% endfor %} {% endfor %}