Refactor

Split up code Add depicts guessing Redirect for random image
2019-09-16 08:59:53 +01:00 · 2019-09-16 08:59:53 +01:00 · a909b50329
parent 0719f441c7
commit a909b50329
10 changed files with 706 additions and 177 deletions
--- a/app.py
+++ b/app.py
@ -1,23 +1,15 @@
 #!/usr/bin/python3

-from flask import Flask, render_template, url_for, redirect, request
-from depicts import utils
-import dateutil.parser
-import urllib.parse
-import requests
+from flask import Flask, render_template, url_for, redirect, request, g
+from depicts import utils, wdqs, commons, mediawiki, painting
 import json
 import os
 import locale
+import random

 locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

-url_start = 'http://www.wikidata.org/entity/Q'
-wikidata_url = 'https://www.wikidata.org/w/api.php'
-commons_url = 'https://www.wikidata.org/w/api.php'
-wikidata_query_api_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
-commons_start = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
 thumbwidth = 300
-thumbheight = 400

 app = Flask(__name__)

@ -98,68 +90,30 @@ select ?object ?objectLabel ?objectDescription (count(*) as ?count) {
 order by desc(?count)
 '''

-def run_wikidata_query(query):
-    params = {'query': query, 'format': 'json'}
-    r = requests.post(wikidata_query_api_url, data=params, stream=True)
-    assert r.status_code == 200
-    return r
+# SPARQL: distinct items that are an instance of (P31) Q3305213, have an
+# image (P18) and have no depicts (P180) statement.  Used by /random.
+painting_no_depicts_query = '''
+select distinct ?item where {
+  ?item wdt:P31 wd:Q3305213 .
+  ?item wdt:P18 ?image .
+  filter not exists { ?item wdt:P180 ?depicts }
+}
+'''

-def row_id(row):
-    return int(utils.drop_start(row['item']['value'], url_start))
+@app.template_global()
+def set_url_args(**new_args):
+    '''Build a URL for the current endpoint with some arguments overridden.
+
+    Starts from the view arguments of the current request, layers the query
+    string arguments on top, then applies new_args.  Any argument whose
+    final value is None is dropped before calling url_for.
+    '''
+    merged = request.view_args.copy()
+    for source in (request.args, new_args):
+        merged.update(source)
+    kept = {name: value for name, value in merged.items() if value is not None}
+    return url_for(request.endpoint, **kept)

-def api_call(params, api_url=wikidata_url):
-    call_params = {
-        'format': 'json',
-        'formatversion': 2,
-        **params,
-    }
-
-    r = requests.get(wikidata_url, params=call_params)
-    return r
-
-def get_entity(qid):
-    json_data = api_call({'action': 'wbgetentities', 'ids': qid}).json()
-
-    try:
-        entity = list(json_data['entities'].values())[0]
-    except KeyError:
-        return
-    if 'missing' not in entity:
-        return entity
-
-def get_entities(ids, **params):
-    if not ids:
-        return []
-    params = {
-        'action': 'wbgetentities',
-        'ids': '|'.join(ids),
-        **params,
-    }
-    r = api_call(params)
-    json_data = r.json()
-    return list(json_data['entities'].values())
+@app.before_request
+def init_profile():
+    '''Start every request with a fresh, empty profiling list on flask.g.'''
+    g.profiling = list()

@app.route("/")
 def index():
    '''Render index.html with find_more_props passed in as props.'''
    return render_template('index.html', props=find_more_props)

-def run_query_with_cache(q, name):
-    filename = f'cache/{name}.json'
-    if os.path.exists(filename):
-        from_cache = json.load(open(filename))
-        if isinstance(from_cache, dict) and from_cache.get('query') == q:
-            return from_cache['bindings']
-
-    r = run_wikidata_query(q)
-    bindings = r.json()['results']['bindings']
-    json.dump({'query': q, 'bindings': bindings},
-              open(filename, 'w'), indent=2)
-
-    return bindings
-
-def get_row_value(row, field):
-    return row[field]['value'] if field in row else None
-
@app.route("/property/P<int:property_id>")
 def property_query_page(property_id):
    pid = f'P{property_id}'
@ -167,7 +121,7 @@ def property_query_page(property_id):
    sort_by_name = sort and sort.lower().strip() == 'name'

    q = property_query.replace('PID', pid)
-    rows = run_query_with_cache(q, name=pid)
+    rows = wdqs.run_query_with_cache(q, name=pid)

    no_label_qid = [row['object']['value'].rpartition('/')[2]
                    for row in rows
@ -198,10 +152,35 @@ def property_query_page(property_id):
                           pid=pid,
                           rows=rows)

+@app.route('/random')
+def random_painting():
+    '''Redirect to the item page of a randomly chosen painting.
+
+    Candidates are the rows returned for painting_no_depicts_query.  If the
+    query returns no rows there is nothing to pick from, so redirect to the
+    front page instead of letting random.choice raise IndexError.
+    '''
+    rows = wdqs.run_query_with_cache(painting_no_depicts_query)
+    if not rows:
+        return redirect(url_for('index'))
+    row = random.choice(rows)
+    item_id = wdqs.row_id(row)
+    return redirect(url_for('item_page', item_id=item_id))
+
@app.route("/item/Q<int:item_id>")
 def item_page(item_id):
+    '''Show one painting with its image and the candidate items found for it.'''
    qid = f'Q{item_id}'
-    return render_template('item.html', qid=qid)
+    item = painting.Painting(qid)
+
+    width = 800
+    image_filename = item.image_filename
+    # Image details are cached on disk, one file per item and thumbnail width.
+    filename = f'cache/{qid}_{width}_image.json'
+    if os.path.exists(filename):
+        # context managers so the cache file handles are always closed
+        with open(filename) as f:
+            detail = json.load(f)
+    else:
+        detail = commons.image_detail([image_filename], thumbwidth=width)
+        with open(filename, 'w') as f:
+            json.dump(detail, f, indent=2)
+
+    hits = item.run_query()
+
+    return render_template('item.html',
+                           qid=qid,
+                           item=item,
+                           image=detail[image_filename],
+                           hits=hits,
+                           title=item.display_title)

 def get_entity_label(entity):
    if 'en' in entity['labels']:
@ -223,57 +202,18 @@ def get_labels(keys, name=None):
            labels = from_cache['labels']
    if not labels:
        for cur in utils.chunk(keys, 50):
-            labels += get_entities(cur, props='labels')
+            labels += mediawiki.get_entities(cur, props='labels')

        json.dump({'keys': keys, 'labels': labels},
                  open(filename, 'w'), indent=2)

    return {entity['id']: get_entity_label(entity) for entity in labels}

-def get_entity_with_cache(qid):
-    filename = f'cache/{qid}.json'
-    if os.path.exists(filename):
-        entity = json.load(open(filename))
-    else:
-        entity = get_entity(qid)
-        json.dump(entity, open(filename, 'w'), indent=2)
-
-    return entity
-
-def commons_uri_to_filename(uri):
-    return urllib.parse.unquote(utils.drop_start(uri, commons_start))
-
-def image_detail(filenames, thumbheight=None, thumbwidth=None):
-    if not isinstance(filenames, list):
-        filenames = [filenames]
-    if not filenames:
-        return {}
-
-    params = {
-        'action': 'query',
-        'titles': '|'.join(f'File:{f}' for f in filenames),
-        'prop': 'imageinfo',
-        'iiprop': 'url',
-    }
-    if thumbheight is not None:
-        params['iiurlheight'] = thumbheight
-    if thumbwidth is not None:
-        params['iiurlwidth'] = thumbwidth
-    r = api_call(params, api_url=commons_url)
-
-    images = {}
-
-    for image in r.json()['query']['pages']:
-        filename = utils.drop_start(image['title'], 'File:')
-        images[filename] = image['imageinfo'][0]
-
-    return images
-
@app.route("/next/Q<int:item_id>")
 def next_page(item_id):
    qid = f'Q{item_id}'

-    entity = get_entity_with_cache(qid)
+    entity = mediawiki.get_entity_with_cache(qid)

    width = 800
    image_filename = entity['claims']['P18'][0]['mainsnak']['datavalue']['value']
@ -281,7 +221,7 @@ def next_page(item_id):
    if os.path.exists(filename):
        detail = json.load(open(filename))
    else:
-        detail = image_detail([image_filename], thumbwidth=width)
+        detail = commons.image_detail([image_filename], thumbwidth=width)
        json.dump(detail, open(filename, 'w'), indent=2)

    other_items = set()
@ -311,8 +251,7 @@ def next_page(item_id):
@app.route('/P<int:property_id>/Q<int:item_id>')
 def find_more_page(property_id, item_id):
+    '''Redirect /P<n>/Q<n> to the browse page with P<n>=Q<n> as a query argument.'''
    pid, qid = f'P{property_id}', f'Q{item_id}'
-
-    return redirect(url_for('browse_page') + f'?{pid}={qid}')
+    return redirect(url_for('browse_page', **{pid: qid}))

 def get_facets(sparql_params, params):
    flat = '_'.join(f'{pid}={qid}' for pid, qid in params)
@ -323,9 +262,7 @@ def get_facets(sparql_params, params):
    q = (facet_query.replace('PARAMS', sparql_params)
                    .replace('PROPERTY_LIST', property_list))

-    # open(f'cache/{flat}_facets_query.sparql', 'w').write(q)
-
-    bindings = run_query_with_cache(q, flat + '_facets')
+    bindings = wdqs.run_query_with_cache(q, flat + '_facets')

    facets = {key: [] for key in find_more_props.keys()}
    for row in bindings:
@ -342,21 +279,6 @@ def get_facets(sparql_params, params):
        if values
    }

-def format_time(row_time, row_timeprecision):
-    t = dateutil.parser.parse(row_time['value'])
-    precision = int(row_timeprecision['value'])
-
-    if precision == 9:
-        return t.year
-    if precision == 8:
-        return f'{t.year}s'
-    if precision == 7:
-        return f'{utils.ordinal((t.year // 100) + 1)} century'
-    if precision == 6:
-        return f'{utils.ordinal((t.year // 1000) + 1)} millennium'
-
-    return row_time['value']
-
@app.route('/browse')
 def browse_page():
    params = [(pid, qid) for pid, qid in request.args.items()
@ -374,49 +296,14 @@ def browse_page():
    sparql_params = ''.join(
        f'?item wdt:{pid} wd:{qid} .\n' for pid, qid in params)

-    query = find_more_query.replace('PARAMS', sparql_params)
-
-    filename = f'cache/{flat}.json'
-    if os.path.exists(filename):
-        bindings = json.load(open(filename))
-    else:
-        r = run_wikidata_query(query)
-        bindings = r.json()['results']['bindings']
-        json.dump(bindings, open(filename, 'w'), indent=2)
+    q = find_more_query.replace('PARAMS', sparql_params)

+    bindings = wdqs.run_query_with_cache(q, flat)
    facets = get_facets(sparql_params, params)

    page_size = 45

-    item_map = {}
-    for row in bindings:
-        item_id = row_id(row)
-        row_qid = f'Q{item_id}'
-        label = row['itemLabel']['value']
-        image_filename = commons_uri_to_filename(row['image']['value'])
-        if item_id in item_map:
-            item = item_map[item_id]
-            item['image_filename'].append(image_filename)
-            continue
-
-        if label == row_qid:
-            label = get_row_value('title') or 'name missing'
-
-        artist_name = get_row_value['artistLabel'] or '[artist unknown]'
-
-        d = format_time(row['time'], row['timeprecision']) if 'time' in row else None
-
-        item = {
-            'url': url_for('next_page', item_id=item_id),
-            'image_filename': [image_filename],
-            'item_id': item_id,
-            'qid': row_qid,
-            'label': label,
-            'date': d,
-            'artist_name': artist_name,
-        }
-        item_map[item_id] = item
-
+    item_map = wdqs.build_browse_item_map(bindings)
    items = []
    for item in item_map.values():
        if len(item['image_filename']) != 1:
@ -432,14 +319,13 @@ def browse_page():
    if os.path.exists(filename):
        detail = json.load(open(filename))
    else:
-        detail = image_detail(filenames, thumbwidth=thumbwidth)
+        detail = commons.image_detail(filenames, thumbwidth=thumbwidth)
        json.dump(detail, open(filename, 'w'), indent=2)

    for item in items:
+        item['url'] = url_for('item_page', item_id=item['item_id'])
        item['image'] = detail[item['image_filename']]

-    total = len(bindings)
-
    title = ' / '.join(item_labels[qid] for pid, qid in params)

    return render_template('find_more.html',
@ -448,8 +334,8 @@ def browse_page():
                           label=title,
                           labels=find_more_props,
                           bindings=bindings,
-                           items=items,
-                           total=total)
+                           total=len(bindings),
+                           items=items)


 if __name__ == "__main__":
--- a/depicts/category.py
+++ b/depicts/category.py
@ -0,0 +1,110 @@
+from . import utils
+import re
+import calendar
+
+# Alternation of the month names from calendar.month_name; its first entry is
+# the empty string, hence the filter.
+month_pattern = '|'.join(m for m in calendar.month_name if m)
+# Matches titles that begin with a date: YYYY-MM-DD, "<month> YYYY", a
+# four-digit year with optional "s", or "<n>st/nd/rd/th-century", each
+# followed by a space.
+re_date_based = re.compile(r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) ')
+
+# Namespace prefix removed from titles by Category.__init__.
+ns_cat = 'Category:'
+
+class Category:
+    '''A wiki category, used to guess names to look up on Wikidata.
+
+    title: category title with any leading "Category:" removed
+    site: site key, passed on to utils.wiki_url()
+    item: object set via set_item(); it must provide artist_labels() and
+          parent_categories.  None until set_item() is called.
+    '''
+
+    def __init__(self, title, site):
+        if title.startswith(ns_cat):
+            title = title[len(ns_cat):]
+        self.title = title
+        self.site = site
+        self.item = None
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'
+
+    def set_item(self, item):
+        '''Attach the item this category belongs to.'''
+        self.item = item
+
+    @property
+    def url(self):
+        '''URL of the category page on self.site.'''
+        return utils.wiki_url(self.title, self.site, ns='Category')
+
+    def date_based(self):
+        '''True if the title starts with a date, year, decade or century.'''
+        return bool(re_date_based.match(self.title))
+
+    def contains_artist_name(self):
+        '''True if any artist label of the item occurs in the title.
+
+        Comparison is case-insensitive.  Returns None when no item is set.
+        '''
+        if not self.item:
+            return
+        return any(artist.lower() in self.title.lower()
+                   for artist in self.item.artist_labels())
+
+    def parents(self):
+        '''Parent categories recorded on the item for this title, or [].'''
+        if not self.item:
+            return []
+        return self.item.parent_categories[self.site].get(self.title, [])
+
+    def is_exhibition(self):
+        '''True if a parent category title starts with "Art exhibitions ".'''
+        return any(parent.title.startswith('Art exhibitions ')
+                   for parent in self.parents())
+
+    def names_for_wikidata(self):
+        '''Return a list of names derived from the title to look up on Wikidata.
+
+        When check() finds no significant part, the whole title is used unless
+        the category is date based, names the artist or is an exhibition.
+        Otherwise the first non-empty significant part is used; if it contains
+        " with ", " at " or " wearing " the parts on both sides of the first
+        such separator are returned as well.  Always returns a list.
+        '''
+        highlight = self.check()
+        interesting = len(highlight) > 1
+
+        if not interesting:
+            if self.date_based() or self.contains_artist_name() or self.is_exhibition():
+                return []
+
+            return utils.also_singular(self.title)
+
+        for significant, text in highlight:
+            if not significant:
+                continue
+            title = text.strip()
+            if not title:
+                # e.g. a title that is only "Paintings of": nothing to look
+                # up, and title[0] below would raise IndexError
+                continue
+            title = title[0].upper() + title[1:]
+            for sep in ' with ', ' at ', ' wearing ':
+                if sep in title:
+                    before, _, after = title.partition(sep)
+                    names = []
+                    for x in title, before, after:
+                        names += utils.also_singular(x)
+                    return names
+            return utils.also_singular(title)
+
+        # no usable significant part: return a list, not None, so callers
+        # can always iterate or concatenate the result
+        return []
+
+    def urls_for_wikidata(self):
+        '''Category page URLs on self.site for each name from names_for_wikidata().'''
+        return [utils.wiki_url(name, self.site, ns='Category')
+                for name in self.names_for_wikidata()]
+
+    def check(self):
+        '''Split the title into (significant, text) pairs.
+
+        The pieces concatenate back to the title.  A single (False, title)
+        pair means no significant part was recognised.
+        '''
+        cat = self.title
+        lc_cat = cat.lower()
+        by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
+                      'medium', 'year', 'painter']
+
+        if self.item:
+            # endings are matched against the lower-cased title, so the
+            # artist labels have to be lower-cased too or they never match
+            by_endings += [label.lower() for label in self.item.artist_labels()]
+
+        for after in ('in art', 'in portrait paintings', 'in landscape paintings', 'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2', 'looking at viewer'):
+            pos = lc_cat.find(after)
+            # don't highlight "1512 in art"
+            if pos == -1 or cat[:pos - 1].isdigit():
+                continue
+            return [(True, cat[:pos]), (False, cat[pos:])]
+
+        for before in ('paintings of', 'portraits of', 'landscapes of',
+                       'portraits with', 'paintings with', 'paintings depicting',
+                       'portraits depicting', 'landscapes depicting', 'works about'):
+            pos = lc_cat.find(before)
+            if pos == -1:
+                continue
+            pos += len(before)
+            for by_ending in by_endings:
+                ending = ' by ' + by_ending
+                if lc_cat.endswith(ending):
+                    return [(False, cat[:pos]),
+                            (True, cat[pos:-len(ending)]),
+                            (False, cat[-len(ending):])]
+
+            return [(False, cat[:pos]), (True, cat[pos:])]
+
+        pos = lc_cat.find('of ')
+        if pos != -1:
+            return [(True, cat[:pos]), (False, cat[pos:])]
+
+        return [(False, cat)]
--- a/depicts/commons.py
+++ b/depicts/commons.py
@ -0,0 +1,31 @@
+from . import mediawiki, utils
+
+# API endpoint passed to mediawiki.api_call() by image_detail().
+# NOTE(review): this is the Wikidata API URL, not commons.wikimedia.org;
+# confirm that is intended.
+commons_url = 'https://www.wikidata.org/w/api.php'
+
+def image_detail(filenames, thumbheight=None, thumbwidth=None):
+    '''Look up imageinfo (with URLs) for one or more files.
+
+    filenames is a list of names without the "File:" prefix; any other value
+    is treated as a single name.  thumbheight / thumbwidth, when given, are
+    sent as iiurlheight / iiurlwidth.  Returns a dict mapping each page
+    title in the response (with "File:" removed) to its first imageinfo
+    entry, or {} when the list is empty.
+    '''
+    names = filenames if isinstance(filenames, list) else [filenames]
+    if not names:
+        return {}
+
+    params = {
+        'action': 'query',
+        'titles': '|'.join(f'File:{f}' for f in names),
+        'prop': 'imageinfo',
+        'iiprop': 'url',
+    }
+    for key, size in (('iiurlheight', thumbheight), ('iiurlwidth', thumbwidth)):
+        if size is not None:
+            params[key] = size
+    response = mediawiki.api_call(params, api_url=commons_url)
+
+    images = {}
+    for page in response.json()['query']['pages']:
+        name = utils.drop_start(page['title'], 'File:')
+        images[name] = page['imageinfo'][0]
+    return images
+
+
+
+
--- a/depicts/painting.py
+++ b/depicts/painting.py
@ -0,0 +1,306 @@
+from . import utils, wdqs, mediawiki
+import nltk
+import re
+
+# Case-insensitive; group 1 is the shortest text between one of the trigger
+# phrases (portrays, depicts, depictions of, it shows) and the next full stop.
+re_from_article = re.compile(r'(?:portrays|depicts|depictions of|it shows) (.+?)\.', re.I)
+
+# Numeric item ids that Painting.run_query() drops from its results.
+ignore_for_depicts = {
+    43445,     # female organism - use: female (Q6581072)
+    44148,     # male organism   - use: male (Q6581097)
+    21075684,  # children        - use: child (Q7569)
+    180788,    # National Gallery
+    780294,    # human physical appearance
+    2472587,   # people
+    33659,     # People
+}
+
+# SPARQL template filled in by Painting.build_query(): QID, COMMONS_CAT,
+# CAT_URL and SITELINK are placeholders replaced with the painting's own id
+# and with VALUES lists.
+query = '''
+select distinct ?item ?itemLabel ?commonscat ?cat_url ?sitelink
+where {
+  service wikibase:label { bd:serviceParam wikibase:language "en" }
+  filter (?item != wd:QID)
+
+  {
+    VALUES (?commonscat) { COMMONS_CAT }
+    ?item wdt:P373 ?commonscat .
+    filter not exists { ?item wdt:P31 wd:Q4167836 }  # Wikimedia category
+    filter not exists { ?item wdt:P31 wd:Q4167410 }  # Wikimedia disambiguation page
+    filter not exists { ?item wdt:P31 wd:Q24046192 } # Wikimedia category of stubs
+    filter not exists { ?item wdt:P31 wd:Q13406463 } # Wikimedia list article
+    filter not exists { ?item wdt:P31 wd:Q4663903 }  # Wikimedia portal
+  } union {
+    VALUES (?commonscat) { COMMONS_CAT }
+    ?cat_item wdt:P373 ?commonscat .
+    ?cat_item wdt:P301 ?item .
+  } union {
+    VALUES (?cat_url) { CAT_URL }
+    ?cat_url schema:about ?cat_item .
+    ?cat_item wdt:P301 ?item .
+  } union {
+    VALUES (?sitelink) { SITELINK }
+    ?sitelink schema:about ?item .
+    filter not exists { ?item wdt:P31 wd:Q4167410 }
+  }
+}'''
+
+class QueryResultRow:
+    '''One candidate item from the depicts query, merged across result rows.
+
+    Keys starting with "item" keep the single binding from the first row;
+    every other key holds a list of bindings, extended by update().
+    '''
+
+    def __init__(self, row):
+        self.row = {k: (v if k.startswith('item') else [v]) for k, v in row.items()}
+        self.item_id = wdqs.row_id(row)
+        self.label = wdqs.get_row_value(row, 'itemLabel')
+
+    def update(self, row):
+        '''Append the non-item bindings of another row for the same item.'''
+        for key, value in row.items():
+            if key.startswith('item'):
+                continue
+            self.row.setdefault(key, []).append(value)
+
+    @property
+    def url(self):
+        return self.row['item']['value']
+
+    @property
+    def qid(self):
+        return f'Q{self.item_id}'
+
+    def sources(self):
+        '''Return the non-item keys with their lists of raw bindings.'''
+        return {k: v for k, v in self.row.items() if not k.startswith('item')}
+
+    def sources_list(self):
+        '''Return (key, [display value, ...]) pairs for the non-item keys.
+
+        Literal values are shown as they are; URIs are cut down to the part
+        after the last "/wiki/".
+        '''
+        marker = '/wiki/'
+
+        def get_value(i):
+            if i['type'] != 'uri':
+                return i['value']
+            wiki_start = i['value'].rfind(marker)
+            if wiki_start == -1:
+                # no "/wiki/" in the URI: show it whole instead of chopping
+                # off its first characters (rfind gives -1 here)
+                return i['value']
+            return i['value'][wiki_start + len(marker):]
+
+        return [(k, [get_value(i) for i in v])
+                for k, v in self.row.items()
+                if not k.startswith('item')]
+
+class Painting:
+    '''A Wikidata painting item plus the categories used to guess what it depicts.'''
+
+    def __init__(self, qid):
+        '''Load the entity for qid and the categories of the pages linked to it.'''
+        self.entity = mediawiki.get_entity_with_cache(qid)
+        self.item_id = int(qid[1:])
+
+        if self.enwiki:
+            content, cats = mediawiki.get_content_and_categories(self.enwiki, 'enwiki')
+            self.enwiki_content = content
+            self.enwiki_categories = mediawiki.process_cats(cats, 'enwiki')
+            for cat in self.enwiki_categories:
+                cat.set_item(self)
+        else:
+            self.enwiki_content = None
+            self.enwiki_categories = None
+
+        sites = ['commons', 'enwiki']
+        # site -> {category title: list of parent categories}
+        self.parent_categories = {site: {} for site in sites}
+
+        self.categories = self.get_categories()
+
+    @property
+    def image_filename(self):
+        '''Value of the first image (P18) claim.'''
+        return self.entity['claims']['P18'][0]['mainsnak']['datavalue']['value']
+
+    @property
+    def display_title(self):
+        '''"<English label> (<qid>)", or just the qid when there is no English label.'''
+        if 'en' not in self.entity['labels']:
+            return self.qid
+        return f'{self.en_title} ({self.qid})'
+
+    @property
+    def url(self):
+        return 'https://www.wikidata.org/wiki/' + self.qid
+
+    def get_artist_entities(self):
+        '''Fetch the entity of every artist in artists_claim into self.artist_entities.'''
+        self.artist_entities = []
+
+        for artist in self.artists_claim:
+            artist_qid = artist['id']
+            self.artist_entities.append(mediawiki.get_entity(artist_qid))
+
+    def artist_labels(self):
+        '''English labels of the artists; artists without one are left out.'''
+        if not hasattr(self, 'artist_entities'):
+            self.get_artist_entities()
+        # not every entity has an English label; skip those instead of
+        # raising KeyError
+        return [artist['labels']['en']['value']
+                for artist in self.artist_entities
+                if 'en' in artist['labels']]
+
+    @property
+    def commons_cats(self):
+        '''Values of the P373 claims.'''
+        return [i['mainsnak']['datavalue']['value']
+                for i in self.entity['claims'].get('P373', [])]
+
+    @property
+    def commons_sitelink(self):
+        '''Title of the Commons page linked from the item, or None.'''
+        # Entity sitelinks are keyed by site id ("commonswiki") and hold the
+        # page name under "title", the same shape the enwiki property reads.
+        link = self.sitelinks.get('commonswiki')
+        return link['title'] if link else None
+
+    @property
+    def en_title(self):
+        '''English label, falling back to the qid.'''
+        if 'en' in self.entity['labels']:
+            return self.entity['labels']['en']['value']
+        else:
+            return self.qid
+
+    @property
+    def artists_claim(self):
+        '''Values of the P170 claims that actually carry a value.'''
+        # "unknown value" / "no value" snaks have no datavalue; skip them
+        return [claim['mainsnak']['datavalue']['value']
+                for claim in self.entity['claims'].get('P170', [])
+                if 'datavalue' in claim['mainsnak']]
+
+    @property
+    def artists(self):
+        '''The artists_claim values, each with the artist's labels added under "labels".'''
+        if not hasattr(self, 'artist_entities'):
+            self.get_artist_entities()
+
+        items = self.artists_claim
+
+        lookup = {artist['id']: artist['labels'] for artist in self.artist_entities}
+
+        for item in items:
+            item['labels'] = lookup[item['id']]
+
+        return items
+
+    @property
+    def qid(self):
+        return f'Q{self.item_id}'
+
+    @property
+    def commons_filenames(self):
+        '''Values of all image (P18) claims.'''
+        return [image['mainsnak']['datavalue']['value']
+                for image in self.entity['claims'].get('P18', [])]
+
+    def commons_cat_from_sitelink(self):
+        '''Category name from the Commons sitelink, or None if it is not a category.'''
+        ns = 'Category:'
+        if not self.commons_sitelink or not self.commons_sitelink.startswith(ns):
+            return
+        return self.commons_sitelink[len(ns):]
+
+    @property
+    def enwiki_url(self):
+        enwiki = self.enwiki
+        if not enwiki:
+            return
+        return 'https://en.wikipedia.org/wiki/' + enwiki.replace(' ', '_')
+
+    @property
+    def sitelinks(self):
+        return self.entity['sitelinks']
+
+    @property
+    def claims(self):
+        return self.entity['claims']
+
+    @property
+    def enwiki(self):
+        '''Title of the linked English Wikipedia article, or None.'''
+        return self.sitelinks['enwiki']['title'] if 'enwiki' in self.sitelinks else None
+
+    def get_categories(self):
+        '''Collect the categories of the pages linked to this painting.
+
+        Returns a list of (page title, categories) pairs and fills
+        self.parent_categories with the parents of the categories found.
+        '''
+        titles = {'File:' + filename for filename in self.commons_filenames}
+        for commons_cat in self.commons_cats:
+            titles.add('Category:' + commons_cat)
+        if self.commons_sitelink:
+            titles.add(self.commons_sitelink)
+        if not titles:
+            return []
+
+        cat_list = mediawiki.get_categories(titles, 'commons')
+
+        commons_parents = self.parent_categories['commons']
+        for title, cats in cat_list:
+            for cat in cats:
+                cat.set_item(self)
+            if not title.startswith('Category:'):
+                continue
+            commons_parents[utils.drop_category_ns(title)] = cats
+
+        # Only ask for categories whose parents are not known yet.  The test
+        # has to be against the commons map: self.parent_categories itself
+        # is keyed by site name, so every title looked unknown.
+        get_more_cats = []
+        for cats in commons_parents.values():
+            for cat in cats:
+                if cat.title not in commons_parents:
+                    get_more_cats.append('Category:' + cat.title)
+        get_more_cats = list(dict.fromkeys(get_more_cats))  # remove duplicates
+
+        if get_more_cats:
+            for title, cats in mediawiki.get_categories(get_more_cats, 'commons'):
+                for cat in cats:
+                    cat.set_item(self)
+                commons_parents[utils.drop_category_ns(title)] = cats
+
+        if self.enwiki:
+            cat_list.append((self.enwiki, self.enwiki_categories))
+
+            get_more_cats = ['Category:' + cat.title for cat in self.enwiki_categories]
+            for title, cats in mediawiki.get_categories(get_more_cats, 'enwiki'):
+                self.parent_categories['enwiki'][utils.drop_category_ns(title)] = cats
+
+        return cat_list
+
+    def depicts_from_enwiki_content(self):
+        '''Return the text after "depicts"/"portrays"/... in the article, or None.'''
+        if not self.enwiki_content:
+            return
+        for par in self.enwiki_content.split('\n\n'):
+            m = re_from_article.search(par)
+            if m:
+                return m.group(1)
+
+    def query_variables(self):
+        '''Gather the names and URLs that build_query() puts into the SPARQL query.'''
+        commons_cat = []
+        cat_url = []
+        keywords = []
+        for _, categories in self.categories:
+            for cat in categories:
+                names = cat.names_for_wikidata()
+                keywords += names
+                if cat.site == 'commons':
+                    commons_cat += names
+                cat_url += cat.urls_for_wikidata()
+
+        text = self.depicts_from_enwiki_content()
+        if text:
+            sentences = nltk.sent_tokenize(text)
+
+            for sentence in sentences:
+                for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
+                    if not utils.word_contains_letter(word):
+                        continue
+                    # keep only tokens tagged NN*
+                    if not pos.startswith('NN'):
+                        continue
+                    for k in word.strip('|').split('|'):
+                        if utils.word_contains_letter(k):
+                            keywords += utils.also_singular(k)
+
+        keywords = [k for k in keywords if utils.word_contains_letter(k)]
+
+        return {
+            'commons_cat': commons_cat,
+            'cat_url': cat_url,
+            'keywords': keywords,
+        }
+
+    def build_query(self):
+        '''Fill the placeholders of the module-level query template.'''
+        query_vars = self.query_variables()
+        sitelinks = [utils.wiki_url(title, 'enwiki') for title in query_vars['keywords']]
+        sitelinks = [url for url in sitelinks if url]
+
+        # Replace QID first, while the template still holds only its own
+        # text, so "QID" inside a category name or URL is left alone.
+        q = query.replace('QID', self.qid)
+        q = q.replace('COMMONS_CAT', wdqs.quote_list(query_vars['commons_cat']))
+        q = q.replace('CAT_URL', wdqs.url_list(query_vars['cat_url']))
+        q = q.replace('SITELINK', wdqs.url_list(sitelinks))
+        return q
+
+    def run_query(self):
+        '''Run the query and return one QueryResultRow per item, sorted by item id.
+
+        Rows for the same item are merged; items listed in ignore_for_depicts
+        are dropped.
+        '''
+        query = self.build_query()
+
+        rows = wdqs.run_query_with_cache(query)
+        by_id = {}
+        results = []
+        for row in rows:
+            item_id = wdqs.row_id(row)
+            if item_id in ignore_for_depicts:
+                continue
+            if item_id in by_id:
+                by_id[item_id].update(row)
+                continue
+            hit = QueryResultRow(row)
+            by_id[item_id] = hit
+            results.append(hit)
+
+        return sorted(results, key=lambda hit: hit.item_id)
--- a/depicts/utils.py
+++ b/depicts/utils.py
@ -1,4 +1,18 @@
 from itertools import islice
+import urllib.parse
+import inflect
+
+# site key -> host name, used by wiki_url()
+hosts = {
+    'commons': 'commons.wikimedia.org',
+    'enwiki': 'en.wikipedia.org',
+    'wikidata': 'www.wikidata.org',
+}
+
+# shared inflect engine, used by also_singular_main()
+engine = inflect.engine()
+
+# names that also_singular() removes from its result
+skip_names = {
+    'National Gallery'
+}

 def ordinal(n):
    '''Return n followed by its English ordinal suffix, e.g. 1st, 2nd, 3rd, 11th.'''
    # Integer division is required here: with "/" the teens test compared a
    # float such as 1.1 with 1, so 11, 12 and 13 came out as 11st, 12nd, 13rd.
    return "%d%s" % (n, 'tsnrhtdd'[(n // 10 % 10 != 1) * (n % 10 < 4) * n % 10::4])
@ -10,3 +24,50 @@ def chunk(it, size):
 def drop_start(s, start):
    '''Return s with the prefix start removed.

    Raises AssertionError if s does not begin with start.
    '''
    # Raised explicitly rather than with an assert statement so the check
    # still runs under "python -O"; the exception type is unchanged.
    if not s.startswith(start):
        raise AssertionError(f'{s!r} does not start with {start!r}')
    return s[len(start):]
+
+def drop_category_ns(s):
+    '''Return s without its leading "Category:" (see drop_start).'''
+    prefix = 'Category:'
+    return drop_start(s, prefix)
+
+def word_contains_letter(word):
+    '''Return True if at least one character of word is alphabetic.'''
+    for char in word:
+        if char.isalpha():
+            return True
+    return False
+
+def also_singular(name):
+    '''Return name and its variants from also_singular_main() plus extra keywords.
+
+    For each variant, "girl" / "boy" are added when present as a word,
+    "woman" when it contains female, females or women, and "man" when it
+    contains male, males or men.  Anything in skip_names is removed.
+    '''
+    variants = also_singular_main(name)
+    keywords = []
+    for variant in variants:
+        words = set(variant.lower().split())
+        keywords += [word for word in ('girl', 'boy') if word in words]
+        if words & {'female', 'females', 'women'}:
+            keywords.append('woman')
+        if words & {'male', 'males', 'men'}:
+            keywords.append('man')
+    return [n for n in variants + keywords if n not in skip_names]
+
+def also_singular_main(name):
+    '''
+    given a plural name return a list of both the plural and singular versions
+    just return the name if no usable singular form is found
+    '''
+    singular = engine.singular_noun(name.strip('|'))
+    # falsy result: there is no singular form to add
+    if not singular:
+        return [name]
+    n, s = name.lower(), singular.lower()
+    # discard the singular when it equals the name, differs only by
+    # "paintings" -> "painting", or is the bogus "venu" produced for "Venus"
+    if (n == s or
+            n.replace('paintings', '') == s.replace('painting', '') or
+            n == 'venus' and s == 'venu'):
+        return [name]
+    return [name, singular]
+
+def wiki_url(title, site, ns=None):
+    '''Return the page URL for title on site, or None for an empty title.
+
+    site must be a key of hosts.  Spaces become underscores, a lower-case
+    first letter is upper-cased and the title is percent-quoted.  When ns is
+    given, "<ns>:" is put in front of the title.
+    '''
+    host = hosts[site]
+    prefix = ns + ':' if ns else ''
+    if not title:
+        return
+    first = title[0]
+    if first.islower():
+        title = first.upper() + title[1:]
+
+    page = urllib.parse.quote(title.replace(' ', '_'))
+    return f'https://{host}/wiki/' + prefix + page
+
+
+
+
--- a/depicts/wdqs.py
+++ b/depicts/wdqs.py
@ -0,0 +1,100 @@
+import requests
+import json
+import urllib.parse
+import os
+import dateutil.parser
+import hashlib
+from . import utils
+
+# SPARQL endpoint that run_query() posts to
+query_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
+# prefix removed from item URIs by row_id()
+url_start = 'http://www.wikidata.org/entity/Q'
+# prefix removed from image URIs by commons_uri_to_filename()
+commons_start = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
+
+def row_id(row):
+    '''Return the number after url_start in the row's item URI, as an int.'''
+    item_uri = row['item']['value']
+    return int(utils.drop_start(item_uri, url_start))
+
+def get_row_value(row, field):
+    '''Return the value bound to field in row, or None if field is absent.'''
+    if field not in row:
+        return None
+    return row[field]['value']
+
+def commons_uri_to_filename(uri):
+    '''Return the percent-decoded file name that follows commons_start in uri.'''
+    quoted_name = utils.drop_start(uri, commons_start)
+    return urllib.parse.unquote(quoted_name)
+
+def run_query(query):
+    '''POST the SPARQL query to query_url asking for JSON; return the response.
+
+    Asserts that the HTTP status code is 200.
+    '''
+    response = requests.post(query_url,
+                             data={'query': query, 'format': 'json'},
+                             stream=True)
+    assert response.status_code == 200
+    return response
+
+def md5_query(query):
+    ''' return the md5 hex digest of the UTF-8 encoded SPARQL query text '''
+    digest = hashlib.md5()
+    digest.update(query.encode('utf-8'))
+    return digest.hexdigest()
+
+def run_query_with_cache(q, name=None):
+    '''Run SPARQL query q, reusing a JSON file under cache/ when possible.
+
+    The cache file is named after name, or after the md5 of the query when
+    name is None.  A cached file is only used if it stores exactly the same
+    query text; otherwise the query is run and the file is (re)written.
+    Returns the list of result bindings.
+    '''
+    if name is None:
+        name = md5_query(q)
+    filename = f'cache/{name}.json'
+    if os.path.exists(filename):
+        # context manager so the file handle is closed straight away
+        with open(filename) as f:
+            from_cache = json.load(f)
+        if isinstance(from_cache, dict) and from_cache.get('query') == q:
+            return from_cache['bindings']
+
+    r = run_query(q)
+    bindings = r.json()['results']['bindings']
+    with open(filename, 'w') as f:
+        json.dump({'query': q, 'bindings': bindings}, f, indent=2)
+
+    return bindings
+
+def format_time(row_time, row_timeprecision):
+    '''Format a time binding for display according to its precision binding.
+
+    Precision 9 gives the year as an int, 8 the year followed by "s",
+    7 an ordinal century and 6 an ordinal millennium (all strings).  Any
+    other precision returns the raw time value unchanged.
+    '''
+    raw = row_time['value']
+    parsed = dateutil.parser.parse(raw)
+    precision = int(row_timeprecision['value'])
+    year = parsed.year
+
+    if precision == 9:
+        result = year
+    elif precision == 8:
+        result = f'{year}s'
+    elif precision == 7:
+        result = f'{utils.ordinal((year // 100) + 1)} century'
+    elif precision == 6:
+        result = f'{utils.ordinal((year // 1000) + 1)} millennium'
+    else:
+        result = raw
+    return result
+
+def build_browse_item_map(bindings):
+    '''Group query result rows by item id.
+
+    Returns a dict mapping item id to a dict with image_filename (a list,
+    one entry per row for that item), item_id, qid, label, date and
+    artist_name.  Everything except image_filename comes from the first row
+    seen for the item.
+    '''
+    by_id = {}
+    for row in bindings:
+        item_id = row_id(row)
+        qid = f'Q{item_id}'
+        label = row['itemLabel']['value']
+        image_filename = commons_uri_to_filename(row['image']['value'])
+
+        existing = by_id.get(item_id)
+        if existing is not None:
+            # another image for an item already seen
+            existing['image_filename'].append(image_filename)
+            continue
+
+        # a label equal to the qid is replaced by the row's title, if any
+        if label == qid:
+            label = get_row_value(row, 'title') or 'name missing'
+
+        artist_name = get_row_value(row, 'artistLabel') or '[artist unknown]'
+
+        if 'time' in row:
+            date = format_time(row['time'], row['timeprecision'])
+        else:
+            date = None
+
+        by_id[item_id] = {
+            'image_filename': [image_filename],
+            'item_id': item_id,
+            'qid': qid,
+            'label': label,
+            'date': date,
+            'artist_name': artist_name,
+        }
+
+    return by_id
+
+def quote_list(l):
+    '''Format strings as a SPARQL VALUES list of quoted literals: ("a") ("b").
+
+    Duplicates are dropped, keeping the order of first appearance.
+    '''
+    no_dups = list(dict.fromkeys(l))  # remove duplicates
+    # Escape backslashes before quotes, otherwise a backslash in the input
+    # would escape the closing quote (or our own escape) in the literal.
+    return ' '.join('("' + s.replace('\\', '\\\\').replace('"', '\\"') + '")'
+                    for s in no_dups)
+
+def url_list(l):
+    '''Format URLs as a SPARQL VALUES list: (<url1>) (<url2>), duplicates dropped.'''
+    unique = dict.fromkeys(l)  # keeps first-seen order
+    wrapped = [f'(<{url}>)' for url in unique]
+    return ' '.join(wrapped)
--- a/templates/find_more.html
+++ b/templates/find_more.html
@ -24,7 +24,7 @@
  {% for key, values in facets.items() %}
    <p>{{ prop_labels[key] }}:
      {% for v in values %}
-        <a href="?{{ request.query_string.decode('utf-8') }}&{{key}}={{v.qid}}">{{ v.label }}</a> ({{ v.count }})
+        <a href="{{ set_url_args(**{key: v.qid}) }}">{{ v.label }}</a> ({{ v.count }})
        {% if not loop.last %}|{% endif %}
      {% endfor %}
    </p>
--- a/templates/index.html
+++ b/templates/index.html
@ -4,6 +4,7 @@

 {% block content %}
 <div class="m-3">
+  <p><a href="{{ url_for('random_painting') }}">random painting</a></p>
  <ul>
  {% for pid, label in props.items() %}
    <li><a href="{{ url_for('property_query_page', property_id=pid[1:]) }}">{{ label }}</a>
--- a/templates/item.html
+++ b/templates/item.html
@ -0,0 +1,34 @@
+{% extends "base.html" %}
+
+{# The item_page view passes the display title as "title"; no "label" is passed. #}
+{% block title %}{{ title }}{% endblock %}
+
+{% block content %}
+<div class="m-3">
+  <h1>{{ self.title() }}</h1>
+  <div class="row">
+    <div class="col">
+      <img src="{{ image.thumburl }}">
+    </div>
+
+    <div class="col">
+      <p><a href="https://www.wikidata.org/wiki/{{ qid }}">view on Wikidata</a></p>
+      <p><a href="{{ url_for('random_painting') }}">random painting</a></p>
+      {# one paragraph per candidate item returned by item.run_query() #}
+      {% for hit in hits %}
+        <p>
+        url: {{ hit.url }}<br>
+        label: {{ hit.label }}<br>
+        qid: {{ hit.qid }}<br>
+        sources: {{ hit.sources() }}<br>
+        </p>
+      {% endfor %}
+
+    </div>
+  </div>
+
+  {# debugging output: the query inputs and the generated SPARQL #}
+  <pre>{{ item.query_variables() | pprint }}</pre>
+
+  <pre>{{ item.build_query() }}</pre>
+
+
+</div>
+{% endblock %}
--- a/templates/property.html
+++ b/templates/property.html
@ -10,9 +10,9 @@

  <p>Sort order:
  {% if order == 'name' %}
-    <b>name</b> or <a href="?sort=count">count</a>
+    <b>name</b> or <a href="{{ set_url_args(sort='count') }}">count</a>
  {% else %}
-    <a href="?sort=name">name</a> or <b>count</b>
+    <a href="{{ set_url_args(sort='name') }}">name</a> or <b>count</b>
  {% endif %}
  </p>

@ -21,7 +21,7 @@
    {% set qid = row.object.value.rpartition('/')[2] %}
    {% set row_label = row.objectLabel.value if 'objectLabel' in row else '[ label missing ]' %}
    <li>
-    <a href="{{ url_for('browse_page') }}?{{ pid }}={{ qid }}">{{ row_label }}</a>
+      <a href="{{ url_for('browse_page', **{pid: qid}) }}">{{ row_label }}</a>
    {% if 'objectDescription' in row %}
    &mdash; {{ row.objectDescription.value }}
    {% endif %}