diff --git a/app.py b/app.py
index 2fbee15..80ca19b 100755
--- a/app.py
+++ b/app.py
@@ -1,23 +1,15 @@
 #!/usr/bin/python3
-from flask import Flask, render_template, url_for, redirect, request
-from depicts import utils
-import dateutil.parser
-import urllib.parse
-import requests
+from flask import Flask, render_template, url_for, redirect, request, g
+from depicts import utils, wdqs, commons, mediawiki, painting
 import json
 import os
 import locale
+import random
 
 locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
 
-url_start = 'http://www.wikidata.org/entity/Q'
-wikidata_url = 'https://www.wikidata.org/w/api.php'
-commons_url = 'https://www.wikidata.org/w/api.php'
-wikidata_query_api_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
-commons_start = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
 thumbwidth = 300
-thumbheight = 400
 
 app = Flask(__name__)
 
@@ -98,68 +90,30 @@ select ?object ?objectLabel ?objectDescription (count(*) as ?count) {
 order by desc(?count)
 '''
 
-def run_wikidata_query(query):
-    params = {'query': query, 'format': 'json'}
-    r = requests.post(wikidata_query_api_url, data=params, stream=True)
-    assert r.status_code == 200
-    return r
-
-def row_id(row):
-    return int(utils.drop_start(row['item']['value'], url_start))
+painting_no_depicts_query = '''
+select distinct ?item where {
+  ?item wdt:P31 wd:Q3305213 .
+  ?item wdt:P18 ?image .
+  filter not exists { ?item wdt:P180 ?depicts }
+}
+'''
 
-def api_call(params, api_url=wikidata_url):
-    call_params = {
-        'format': 'json',
-        'formatversion': 2,
-        **params,
-    }
-
-    r = requests.get(wikidata_url, params=call_params)
-    return r
-
-def get_entity(qid):
-    json_data = api_call({'action': 'wbgetentities', 'ids': qid}).json()
-
-    try:
-        entity = list(json_data['entities'].values())[0]
-    except KeyError:
-        return
-    if 'missing' not in entity:
-        return entity
-
-def get_entities(ids, **params):
-    if not ids:
-        return []
-    params = {
-        'action': 'wbgetentities',
-        'ids': '|'.join(ids),
-        **params,
-    }
-    r = api_call(params)
-    json_data = r.json()
-    return list(json_data['entities'].values())
+@app.template_global()
+def set_url_args(**new_args):
+    args = request.view_args.copy()
+    args.update(request.args)
+    args.update(new_args)
+    args = {k: v for k, v in args.items() if v is not None}
+    return url_for(request.endpoint, **args)
+
+@app.before_request
+def init_profile():
+    g.profiling = []
 
 @app.route("/")
 def index():
     return render_template('index.html', props=find_more_props)
 
-def run_query_with_cache(q, name):
-    filename = f'cache/{name}.json'
-    if os.path.exists(filename):
-        from_cache = json.load(open(filename))
-        if isinstance(from_cache, dict) and from_cache.get('query') == q:
-            return from_cache['bindings']
-
-    r = run_wikidata_query(q)
-    bindings = r.json()['results']['bindings']
-    json.dump({'query': q, 'bindings': bindings},
-              open(filename, 'w'), indent=2)
-
-    return bindings
-
-def get_row_value(row, field):
-    return row[field]['value'] if field in row else None
-
 @app.route("/property/P<int:property_id>")
 def property_query_page(property_id):
     pid = f'P{property_id}'
@@ -167,7 +121,7 @@ def property_query_page(property_id):
     sort_by_name = sort and sort.lower().strip() == 'name'
 
     q = property_query.replace('PID', pid)
-    rows = run_query_with_cache(q, name=pid)
+    rows = wdqs.run_query_with_cache(q, name=pid)
 
     no_label_qid = [row['object']['value'].rpartition('/')[2]
                     for row in rows
@@ -198,10 +152,35 @@ def property_query_page(property_id):
                            pid=pid,
                            rows=rows)
 
+@app.route('/random')
+def random_painting():
+    rows = wdqs.run_query_with_cache(painting_no_depicts_query)
+    row = random.choice(rows)
+    item_id = wdqs.row_id(row)
+    return redirect(url_for('item_page', item_id=item_id))
+
 @app.route("/item/Q<int:item_id>")
 def item_page(item_id):
     qid = f'Q{item_id}'
-    return render_template('item.html', qid=qid)
+    item = painting.Painting(qid)
+
+    width = 800
+    image_filename = item.image_filename
+    filename = f'cache/{qid}_{width}_image.json'
+    if os.path.exists(filename):
+        detail = json.load(open(filename))
+    else:
+        detail = commons.image_detail([image_filename], thumbwidth=width)
+        json.dump(detail, open(filename, 'w'), indent=2)
+
+    hits = item.run_query()
+
+    return render_template('item.html',
+                           qid=qid,
+                           item=item,
+                           image=detail[image_filename],
+                           hits=hits,
+                           title=item.display_title)
 
 def get_entity_label(entity):
     if 'en' in entity['labels']:
@@ -223,57 +202,18 @@ def get_labels(keys, name=None):
             labels = from_cache['labels']
     if not labels:
         for cur in utils.chunk(keys, 50):
-            labels += get_entities(cur, props='labels')
+            labels += mediawiki.get_entities(cur, props='labels')
         json.dump({'keys': keys, 'labels': labels},
                   open(filename, 'w'), indent=2)
 
     return {entity['id']: get_entity_label(entity) for entity in labels}
 
-def get_entity_with_cache(qid):
-    filename = f'cache/{qid}.json'
-    if os.path.exists(filename):
-        entity = json.load(open(filename))
-    else:
-        entity = get_entity(qid)
-        json.dump(entity, open(filename, 'w'), indent=2)
-
-    return entity
-
-def commons_uri_to_filename(uri):
-    return urllib.parse.unquote(utils.drop_start(uri, commons_start))
-
-def image_detail(filenames, thumbheight=None, thumbwidth=None):
-    if not isinstance(filenames, list):
-        filenames = [filenames]
-    if not filenames:
-        return {}
-
-    params = {
-        'action': 'query',
-        'titles': '|'.join(f'File:{f}' for f in filenames),
-        'prop': 'imageinfo',
-        'iiprop': 'url',
-    }
-    if thumbheight is not None:
-        params['iiurlheight'] = thumbheight
-    if thumbwidth is not None:
-        params['iiurlwidth'] = thumbwidth
-    r = api_call(params, api_url=commons_url)
-
-    images = {}
-
-    for image in r.json()['query']['pages']:
-        filename = utils.drop_start(image['title'], 'File:')
-        images[filename] = image['imageinfo'][0]
-
-    return images
-
 @app.route("/next/Q<int:item_id>")
 def next_page(item_id):
     qid = f'Q{item_id}'
-    entity = get_entity_with_cache(qid)
+    entity = mediawiki.get_entity_with_cache(qid)
 
     width = 800
     image_filename = entity['claims']['P18'][0]['mainsnak']['datavalue']['value']
@@ -281,7 +221,7 @@
     if os.path.exists(filename):
         detail = json.load(open(filename))
     else:
-        detail = image_detail([image_filename], thumbwidth=width)
+        detail = commons.image_detail([image_filename], thumbwidth=width)
         json.dump(detail, open(filename, 'w'), indent=2)
 
     other_items = set()
@@ -311,8 +251,7 @@
 @app.route('/P<int:property_id>/Q<int:item_id>')
 def find_more_page(property_id, item_id):
     pid, qid = f'P{property_id}', f'Q{item_id}'
-
-    return redirect(url_for('browse_page') + f'?{pid}={qid}')
+    return redirect(url_for('browse_page', **{pid: qid}))
 
 def get_facets(sparql_params, params):
     flat = '_'.join(f'{pid}={qid}' for pid, qid in params)
@@ -323,9 +262,7 @@
     q = (facet_query.replace('PARAMS', sparql_params)
                     .replace('PROPERTY_LIST', property_list))
 
-    # open(f'cache/{flat}_facets_query.sparql', 'w').write(q)
-
-    bindings = run_query_with_cache(q, flat + '_facets')
+    bindings = wdqs.run_query_with_cache(q, flat + '_facets')
 
     facets = {key: [] for key in find_more_props.keys()}
     for row in bindings:
@@ -342,21 +279,6 @@ def get_facets(sparql_params, params):
         if values
     }
 
-def format_time(row_time, row_timeprecision):
-    t = dateutil.parser.parse(row_time['value'])
-    precision = int(row_timeprecision['value'])
-
-    if precision == 9:
-        return t.year
-    if precision == 8:
-        return f'{t.year}s'
-    if precision == 7:
-        return f'{utils.ordinal((t.year // 100) + 1)} century'
-    if precision == 6:
-        return f'{utils.ordinal((t.year // 1000) + 1)} millennium'
-
-    return row_time['value']
-
 @app.route('/browse')
 def browse_page():
     params = [(pid, qid) for pid, qid in request.args.items()
@@ -374,49 +296,14 @@ def browse_page():
     sparql_params = ''.join(
         f'?item wdt:{pid} wd:{qid} .\n' for pid, qid in params)
 
-    query = find_more_query.replace('PARAMS', sparql_params)
-
-    filename = f'cache/{flat}.json'
-    if os.path.exists(filename):
-        bindings = json.load(open(filename))
-    else:
-        r = run_wikidata_query(query)
-        bindings = r.json()['results']['bindings']
-        json.dump(bindings, open(filename, 'w'), indent=2)
+    q = find_more_query.replace('PARAMS', sparql_params)
+    bindings = wdqs.run_query_with_cache(q, flat)
 
     facets = get_facets(sparql_params, params)
 
     page_size = 45
 
-    item_map = {}
-    for row in bindings:
-        item_id = row_id(row)
-        row_qid = f'Q{item_id}'
-        label = row['itemLabel']['value']
-        image_filename = commons_uri_to_filename(row['image']['value'])
-        if item_id in item_map:
-            item = item_map[item_id]
-            item['image_filename'].append(image_filename)
-            continue
-
-        if label == row_qid:
-            label = get_row_value('title') or 'name missing'
-
-        artist_name = get_row_value['artistLabel'] or '[artist unknown]'
-
-        d = format_time(row['time'], row['timeprecision']) if 'time' in row else None
-
-        item = {
-            'url': url_for('next_page', item_id=item_id),
-            'image_filename': [image_filename],
-            'item_id': item_id,
-            'qid': row_qid,
-            'label': label,
-            'date': d,
-            'artist_name': artist_name,
-        }
-        item_map[item_id] = item
-
+    item_map = wdqs.build_browse_item_map(bindings)
     items = []
     for item in item_map.values():
         if len(item['image_filename']) != 1:
@@ -432,14 +319,13 @@
     if os.path.exists(filename):
         detail = json.load(open(filename))
     else:
-        detail = image_detail(filenames, thumbwidth=thumbwidth)
+        detail = commons.image_detail(filenames, thumbwidth=thumbwidth)
         json.dump(detail, open(filename, 'w'), indent=2)
 
     for item in items:
+        item['url'] = url_for('item_page', item_id=item['item_id'])
         item['image'] = detail[item['image_filename']]
 
-    total = len(bindings)
-
     title = ' / '.join(item_labels[qid] for pid, qid in params)
 
     return render_template('find_more.html',
@@ -448,8 +334,8 @@
                            label=title,
                            labels=find_more_props,
                            bindings=bindings,
-                           items=items,
-                           total=total)
+                           total=len(bindings),
+                           items=items)
 
 if __name__ == "__main__":
diff --git a/depicts/category.py b/depicts/category.py
new file mode 100644
index 0000000..2d350e0
--- /dev/null
+++ b/depicts/category.py
@@ -0,0 +1,110 @@
+from . import utils
+import re
+import calendar
+
+month_pattern = '|'.join(m for m in calendar.month_name if m)
+re_date_based = re.compile(r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) ')
+
+ns_cat = 'Category:'
+
+class Category:
+    def __init__(self, title, site):
+        if title.startswith(ns_cat):
+            title = title[len(ns_cat):]
+        self.title = title
+        self.site = site
+        self.item = None
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'
+
+    def set_item(self, item):
+        self.item = item
+
+    @property
+    def url(self):
+        return utils.wiki_url(self.title, self.site, ns='Category')
+
+    def date_based(self):
+        return bool(re_date_based.match(self.title))
+
+    def contains_artist_name(self):
+        if not self.item:
+            return
+        return any(artist.lower() in self.title.lower()
+                   for artist in self.item.artist_labels())
+
+    def parents(self):
+        if not self.item:
+            return []
+        return self.item.parent_categories[self.site].get(self.title, [])
+
+    def is_exhibition(self):
+        return any(parent.title.startswith('Art exhibitions ')
+                   for parent in self.parents())
+
+    def names_for_wikidata(self):
+        highlight = self.check()
+        interesting = len(highlight) > 1
+
+        if not interesting:
+            if self.date_based() or self.contains_artist_name() or self.is_exhibition():
+                return []
+
+            return utils.also_singular(self.title)
+
+        for significant, text in highlight:
+            if not significant:
+                continue
+            title = text.strip()
+            title = title[0].upper() + title[1:]
+            for sep in ' with ', ' at ', ' wearing ':
+                if sep in title:
+                    before, _, after = title.partition(sep)
+                    names = []
+                    for x in title, before, after:
+                        names += utils.also_singular(x)
+                    return names
+            return utils.also_singular(title)
+
+    def urls_for_wikidata(self):
+        return [utils.wiki_url(name, self.site, ns='Category')
+                for name in self.names_for_wikidata()]
+
+    def check(self):
+        cat = self.title
+        lc_cat = cat.lower()
+        by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
+                      'medium', 'year', 'painter']
+
+        if self.item:
+            by_endings += self.item.artist_labels()
+
+        for after in ('in art', 'in portrait paintings', 'in landscape paintings', 'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2', 'looking at viewer'):
+            pos = lc_cat.find(after)
+            # don't highlight "1512 in art"
+            if pos == -1 or cat[:pos - 1].isdigit():
+                continue
+            return [(True, cat[:pos]), (False, cat[pos:])]
+
+        for before in ('paintings of', 'portraits of', 'landscapes of',
+                       'portraits with', 'paintings with', 'paintings depicting',
+                       'portraits depicting', 'landscapes depicting', 'works about'):
+            pos = lc_cat.find(before)
+            if pos == -1:
+                continue
+            pos += len(before)
+            for by_ending in by_endings:
+                ending = ' by ' + by_ending
+                if lc_cat.endswith(ending):
+                    return [(False, cat[:pos]),
+                            (True, cat[pos:-len(ending)]),
+                            (False, cat[-len(ending):])]
+
+            return [(False, cat[:pos]), (True, cat[pos:])]
+
+        pos = lc_cat.find('of ')
+        if pos != -1:
+            return [(True, cat[:pos]), (False, cat[pos:])]
+
+        return [(False, cat)]
diff --git a/depicts/commons.py b/depicts/commons.py
new file mode 100644
index 0000000..dd7df26
--- /dev/null
+++ b/depicts/commons.py
@@ -0,0 +1,31 @@
+from . import mediawiki, utils
+
+commons_url = 'https://www.wikidata.org/w/api.php'
+
+def image_detail(filenames, thumbheight=None, thumbwidth=None):
+    if not isinstance(filenames, list):
+        filenames = [filenames]
+    if not filenames:
+        return {}
+
+    params = {
+        'action': 'query',
+        'titles': '|'.join(f'File:{f}' for f in filenames),
+        'prop': 'imageinfo',
+        'iiprop': 'url',
+    }
+    if thumbheight is not None:
+        params['iiurlheight'] = thumbheight
+    if thumbwidth is not None:
+        params['iiurlwidth'] = thumbwidth
+    r = mediawiki.api_call(params, api_url=commons_url)
+
+    images = {}
+
+    for image in r.json()['query']['pages']:
+        filename = utils.drop_start(image['title'], 'File:')
+        images[filename] = image['imageinfo'][0]
+
+    return images
+
+
diff --git a/depicts/painting.py b/depicts/painting.py
new file mode 100644
index 0000000..90d72b9
--- /dev/null
+++ b/depicts/painting.py
@@ -0,0 +1,306 @@
+from . import utils, wdqs, mediawiki
+import nltk
+import re
+
+re_from_article = re.compile(r'(?:portrays|depicts|depictions of|it shows) (.+?)\.', re.I)
+
+ignore_for_depicts = {
+    43445,     # female organism - use: female (Q6581072)
+    44148,     # male organism - use: male (Q6581097)
+    21075684,  # children - use: child (Q7569)
+    180788,    # National Gallery
+    780294,    # human physical appearance
+    2472587,   # people
+    33659,     # People
+}
+
+query = '''
+select distinct ?item ?itemLabel ?commonscat ?cat_url ?sitelink
+where {
+  service wikibase:label { bd:serviceParam wikibase:language "en" }
+  filter (?item != wd:QID)
+
+  {
+    VALUES (?commonscat) { COMMONS_CAT }
+    ?item wdt:P373 ?commonscat .
+    filter not exists { ?item wdt:P31 wd:Q4167836 }    # Wikimedia category
+    filter not exists { ?item wdt:P31 wd:Q4167410 }    # Wikimedia disambiguation page
+    filter not exists { ?item wdt:P31 wd:Q24046192 }   # Wikimedia category of stubs
+    filter not exists { ?item wdt:P31 wd:Q13406463 }   # Wikimedia list article
+    filter not exists { ?item wdt:P31 wd:Q4663903 }    # Wikimedia portal
+  } union {
+    VALUES (?commonscat) { COMMONS_CAT }
+    ?cat_item wdt:P373 ?commonscat .
+    ?cat_item wdt:P301 ?item .
+  } union {
+    VALUES (?cat_url) { CAT_URL }
+    ?cat_url schema:about ?cat_item .
+    ?cat_item wdt:P301 ?item .
+  } union {
+    VALUES (?sitelink) { SITELINK }
+    ?sitelink schema:about ?item .
+    filter not exists { ?item wdt:P31 wd:Q4167410 }
+  }
+}'''
+
+class QueryResultRow:
+    def __init__(self, row):
+        self.row = {k: (v if k.startswith('item') else [v]) for k, v in row.items()}
+        self.item_id = wdqs.row_id(row)
+        self.label = wdqs.get_row_value(row, 'itemLabel')
+
+    def update(self, row):
+        for key, value in row.items():
+            if key.startswith('item'):
+                continue
+            self.row.setdefault(key, []).append(value)
+
+    @property
+    def url(self):
+        return self.row['item']['value']
+
+    @property
+    def qid(self):
+        return f'Q{self.item_id}'
+
+    def sources(self):
+        return {k: v for k, v in self.row.items() if not k.startswith('item')}
+
+    def sources_list(self):
+
+        def get_value(i):
+            if i['type'] != 'uri':
+                return i['value']
+            wiki_start = i['value'].rfind('/wiki/')
+            return i['value'][wiki_start + 6:]
+
+        return [(k, [get_value(i) for i in v])
+                for k, v in self.row.items()
+                if not k.startswith('item')]
+
+class Painting:
+    def __init__(self, qid):
+        self.entity = mediawiki.get_entity_with_cache(qid)
+        self.item_id = int(qid[1:])
+
+        if self.enwiki:
+            content, cats = mediawiki.get_content_and_categories(self.enwiki, 'enwiki')
+            self.enwiki_content = content
+            self.enwiki_categories = mediawiki.process_cats(cats, 'enwiki')
+            for cat in self.enwiki_categories:
+                cat.set_item(self)
+        else:
+            self.enwiki_content = None
+            self.enwiki_categories = None
+
+        sites = ['commons', 'enwiki']
+        self.parent_categories = {site: {} for site in sites}
+
+        self.categories = self.get_categories()
+
+    @property
+    def image_filename(self):
+        return self.entity['claims']['P18'][0]['mainsnak']['datavalue']['value']
+
+    @property
+    def display_title(self):
+        if 'en' not in self.entity['labels']:
+            return self.qid
+        return f'{self.en_title} ({self.qid})'
+
+    @property
+    def url(self):
+        return 'https://www.wikidata.org/wiki/' + self.qid
+
+    def get_artist_entities(self):
+        self.artist_entities = []
+
+        for artist in self.artists_claim:
+            artist_qid = artist['id']
+            self.artist_entities.append(mediawiki.get_entity(artist_qid))
+
+    def artist_labels(self):
+        if not hasattr(self, 'artist_entities'):
+            self.get_artist_entities()
+        return [artist['labels']['en']['value'] for artist in self.artist_entities]
+
+    @property
+    def commons_cats(self):
+        return [i['mainsnak']['datavalue']['value']
+                for i in self.entity['claims'].get('P373', [])]
+
+    @property
+    def commons_sitelink(self):
+        return self.sitelinks['commons']['value'] if 'commons' in self.sitelinks else None
+
+    @property
+    def en_title(self):
+        if 'en' in self.entity['labels']:
+            return self.entity['labels']['en']['value']
+        else:
+            return self.qid
+
+    @property
+    def artists_claim(self):
+        return [image['mainsnak']['datavalue']['value']
+                for image in self.entity['claims'].get('P170', [])]
+
+    @property
+    def artists(self):
+        if not hasattr(self, 'artist_entities'):
+            self.get_artist_entities()
+
+        items = [image['mainsnak']['datavalue']['value']
+                 for image in self.entity['claims'].get('P170', [])]
+
+        lookup = {artist['id']: artist['labels'] for artist in self.artist_entities}
+
+        for item in items:
+            item['labels'] = lookup[item['id']]
+
+        return items
+
+    @property
+    def qid(self):
+        return f'Q{self.item_id}'
+
+    @property
+    def commons_filenames(self):
+        return [image['mainsnak']['datavalue']['value']
+                for image in self.entity['claims'].get('P18', [])]
+
+    def commons_cat_from_sitelink(self):
+        ns = 'Category:'
+        if not self.commons_sitelink or not self.commons_sitelink.startswith(ns):
+            return
+        return self.commons_sitelink[len(ns):]
+
+    @property
+    def enwiki_url(self):
+        enwiki = self.enwiki
+        if not enwiki:
+            return
+        return 'https://en.wikipedia.org/wiki/' + enwiki.replace(' ', '_')
+
+    @property
+    def sitelinks(self):
+        return self.entity['sitelinks']
+
+    @property
+    def claims(self):
+        return self.entity['claims']
+
+    @property
+    def enwiki(self):
+        return self.sitelinks['enwiki']['title'] if 'enwiki' in self.sitelinks else None
+
+    def get_categories(self):
+        titles = {'File:' + filename for filename in self.commons_filenames}
+        for commons_cat in self.commons_cats:
+            titles.add('Category:' + commons_cat)
+        if self.commons_sitelink:
+            titles.add(self.commons_sitelink)
+        if not titles:
+            return []
+
+        cat_list = mediawiki.get_categories(titles, 'commons')
+
+        for title, cats in cat_list:
+            for cat in cats:
+                cat.set_item(self)
+            if not title.startswith('Category:'):
+                continue
+            self.parent_categories['commons'][utils.drop_category_ns(title)] = cats
+
+        get_more_cats = []
+        for _, cats in self.parent_categories['commons'].items():
+            for cat in cats:
+                if cat.title not in self.parent_categories:
+                    get_more_cats.append('Category:' + cat.title)
+
+        for title, cats in mediawiki.get_categories(get_more_cats, 'commons'):
+            for cat in cats:
+                cat.set_item(self)
+            self.parent_categories['commons'][utils.drop_category_ns(title)] = cats
+
+        if self.enwiki:
+            cat_list.append((self.enwiki, self.enwiki_categories))
+
+            get_more_cats = ['Category:' + cat.title for cat in self.enwiki_categories]
+            for title, cats in mediawiki.get_categories(get_more_cats, 'enwiki'):
+                self.parent_categories['enwiki'][utils.drop_category_ns(title)] = cats
+
+        return cat_list
+
+    def depicts_from_enwiki_content(self):
+        if not self.enwiki_url:
+            return
+        for par in self.enwiki_content.split('\n\n'):
+            m = re_from_article.search(par)
+            if m:
+                return m.group(1)
+
+    def query_variables(self):
+        commons_cat = []
+        cat_url = []
+        keywords = []
+        for _, categories in self.categories:
+            for cat in categories:
+                names = cat.names_for_wikidata()
+                keywords += names
+                if cat.site == 'commons':
+                    commons_cat += names
+                    cat_url += cat.urls_for_wikidata()
+
+        text = self.depicts_from_enwiki_content()
+        if text:
+            sentences = nltk.sent_tokenize(text)
+
+            for sentence in sentences:
+                for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
+                    if not utils.word_contains_letter(word):
+                        continue
+                    if not pos.startswith('NN'):
+                        continue
+                    word = word.strip('|')
+                    for k in word.strip('|').split('|'):
+                        if utils.word_contains_letter(k):
+                            keywords += utils.also_singular(k)
+
+        keywords = [k for k in keywords if utils.word_contains_letter(k)]
+
+        return {
+            'commons_cat': commons_cat,
+            'cat_url': cat_url,
+            'keywords': keywords,
+        }
+
+    def build_query(self):
+        query_vars = self.query_variables()
+        sitelinks = [utils.wiki_url(title, 'enwiki') for title in query_vars['keywords']]
+        sitelinks = [url for url in sitelinks if url]
+
+        q = query.replace('COMMONS_CAT', wdqs.quote_list(query_vars['commons_cat']))
+        q = q.replace('CAT_URL', wdqs.url_list(query_vars['cat_url']))
+        q = q.replace('QID', self.qid)
+        q = q.replace('SITELINK', wdqs.url_list(sitelinks))
+        return q
+
+    def run_query(self):
+        query = self.build_query()
+
+        rows = wdqs.run_query_with_cache(query)
+        by_id = {}
+        results = []
+        for row in rows:
+            item_id = wdqs.row_id(row)
+            if item_id in ignore_for_depicts:
+                continue
+            if item_id in by_id:
+                by_id[item_id].update(row)
+                continue
+            hit = QueryResultRow(row)
+            by_id[item_id] = hit
+            results.append(hit)
+
+        return sorted(results, key=lambda hit: hit.item_id)
diff --git a/depicts/utils.py b/depicts/utils.py
index f695aa4..5b8d4b9 100644
--- a/depicts/utils.py
+++ b/depicts/utils.py
@@ -1,4 +1,18 @@
 from itertools import islice
+import urllib.parse
+import inflect
+
+hosts = {
+    'commons': 'commons.wikimedia.org',
+    'enwiki': 'en.wikipedia.org',
+    'wikidata': 'www.wikidata.org',
+}
+
+engine = inflect.engine()
+
+skip_names = {
+    'National Gallery'
+}
 
 def ordinal(n):
     return "%d%s" % (n, 'tsnrhtdd'[(n / 10 % 10 != 1) * (n % 10 < 4) * n % 10::4])
@@ -10,3 +24,50 @@ def chunk(it, size):
 def drop_start(s, start):
     assert s.startswith(start)
     return s[len(start):]
+
+def drop_category_ns(s):
+    return drop_start(s, 'Category:')
+
+def word_contains_letter(word):
+    return any(c.isalpha() for c in word)
+
+def also_singular(name):
+    names = also_singular_main(name)
+    extra = []
+    for n in names:
+        words = set(n.lower().split())
+        for word in 'girl', 'boy':
+            if word in words:
+                extra.append(word)
+        if {'female', 'females', 'women'} & words:
+            extra.append('woman')
+        if {'male', 'males', 'men'} & words:
+            extra.append('man')
+    return [n for n in names + extra if n not in skip_names]
+
+def also_singular_main(name):
+    '''
+    given a plural name return a list of both the plural and singular versions
+    just return the name if it isn't plural
+    '''
+    singular = engine.singular_noun(name.strip('|'))
+    if not singular:
+        return [name]
+    n, s = name.lower(), singular.lower()
+    if (n == s or
+            n.replace('paintings', '') == s.replace('painting', '') or
+            n == 'venus' and s == 'venu'):
+        return [name]
+    return [name, singular]
+
+def wiki_url(title, site, ns=None):
+    host = hosts[site]
+    url_ns = ns + ':' if ns else ''
+    if not title:
+        return
+    if title[0].islower():
+        title = title[0].upper() + title[1:]
+
+    return f'https://{host}/wiki/' + url_ns + urllib.parse.quote(title.replace(' ', '_'))
+
+
diff --git a/depicts/wdqs.py b/depicts/wdqs.py
new file mode 100644
index 0000000..0be6753
--- /dev/null
+++ b/depicts/wdqs.py
@@ -0,0 +1,100 @@
+import requests
+import json
+import urllib.parse
+import os
+import dateutil.parser
+import hashlib
+from . import utils
+
+query_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
+url_start = 'http://www.wikidata.org/entity/Q'
+commons_start = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
+
+def row_id(row):
+    return int(utils.drop_start(row['item']['value'], url_start))
+
+def get_row_value(row, field):
+    return row[field]['value'] if field in row else None
+
+def commons_uri_to_filename(uri):
+    return urllib.parse.unquote(utils.drop_start(uri, commons_start))
+
+def run_query(query):
+    params = {'query': query, 'format': 'json'}
+    r = requests.post(query_url, data=params, stream=True)
+    assert r.status_code == 200
+    return r
+
+def md5_query(query):
+    ''' generate the md5 hexdigest of a SPARQL query '''
+    return hashlib.md5(query.encode('utf-8')).hexdigest()
+
+def run_query_with_cache(q, name=None):
+    if name is None:
+        name = md5_query(q)
+    filename = f'cache/{name}.json'
+    if os.path.exists(filename):
+        from_cache = json.load(open(filename))
+        if isinstance(from_cache, dict) and from_cache.get('query') == q:
+            return from_cache['bindings']
+
+    r = run_query(q)
+    bindings = r.json()['results']['bindings']
+    json.dump({'query': q, 'bindings': bindings},
+              open(filename, 'w'), indent=2)
+
+    return bindings
+
+def format_time(row_time, row_timeprecision):
+    t = dateutil.parser.parse(row_time['value'])
+    precision = int(row_timeprecision['value'])
+
+    if precision == 9:
+        return t.year
+    if precision == 8:
+        return f'{t.year}s'
+    if precision == 7:
+        return f'{utils.ordinal((t.year // 100) + 1)} century'
+    if precision == 6:
+        return f'{utils.ordinal((t.year // 1000) + 1)} millennium'
+
+    return row_time['value']
+
+def build_browse_item_map(bindings):
+    item_map = {}
+    for row in bindings:
+        item_id = row_id(row)
+        row_qid = f'Q{item_id}'
+        label = row['itemLabel']['value']
+        image_filename = commons_uri_to_filename(row['image']['value'])
+        if item_id in item_map:
+            item = item_map[item_id]
+            item['image_filename'].append(image_filename)
+            continue
+
+        if label == row_qid:
+            label = get_row_value(row, 'title') or 'name missing'
+
+        artist_name = get_row_value(row, 'artistLabel') or '[artist unknown]'
+
+        d = format_time(row['time'], row['timeprecision']) if 'time' in row else None
+
+        item = {
+            'image_filename': [image_filename],
+            'item_id': item_id,
+            'qid': row_qid,
+            'label': label,
+            'date': d,
+            'artist_name': artist_name,
+        }
+        item_map[item_id] = item
+
+    return item_map
+
+def quote_list(l):
+    no_dups = list(dict.fromkeys(l))  # remove duplicates
+    return ' '.join('("' + s.replace('"', '\\"') + '")' for s in no_dups)
+
+def url_list(l):
+    no_dups = list(dict.fromkeys(l))  # remove duplicates
+    return ' '.join(f'(<{s}>)' for s in no_dups)
diff --git a/templates/find_more.html b/templates/find_more.html
index 163a7af..0429887 100644
--- a/templates/find_more.html
+++ b/templates/find_more.html
@@ -24,7 +24,7 @@
   {% for key, values in facets.items() %}

     {{ prop_labels[key] }}:
     {% for v in values %}
-      {{ v.label }} ({{ v.count }})
+      {{ v.label }} ({{ v.count }})
       {% if not loop.last %}|{% endif %}
     {% endfor %}
 
diff --git a/templates/index.html b/templates/index.html
index 010671c..3f796a8 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -4,6 +4,7 @@
 {% block content %}
 
+  random painting
 
   {% for pid, label in props.items() %}
     {{ label }}
diff --git a/templates/item.html b/templates/item.html
new file mode 100644
index 0000000..016bc8e
--- /dev/null
+++ b/templates/item.html
@@ -0,0 +1,34 @@
+{% extends "base.html" %}
+
+{% block title %}{{ label }} ({{ qid }}){% endblock %}
+
+{% block content %}
+
+  {{ self.title() }}
+
+
+
+  view on Wikidata
+
+  random painting
+
+  {% for hit in hits %}
+
+    url: {{ hit.url }}
+    label: {{ hit.label }}
+    qid: {{ hit.qid }}
+    sources: {{ hit.sources() }}
+
+  {% endfor %}
+
+
+  {{ item.query_variables() | pprint }}
+
+  {{ item.build_query() }}
+
+
+{% endblock %}
diff --git a/templates/property.html b/templates/property.html
index 2ed1e49..b3df277 100644
--- a/templates/property.html
+++ b/templates/property.html
@@ -10,9 +10,9 @@
 
   Sort order:
   {% if order == 'name' %}
-    name or count
+    name or count
   {% else %}
-    name or count
+    name or count
   {% endif %}
 
@@ -21,7 +21,7 @@
   {% set qid = row.object.value.rpartition('/')[2] %}
   {% set row_label = row.objectLabel.value if 'objectLabel' in row else '[ label missing ]' %}
 
-    {{ row_label }}
+    {{ row_label }}
   {% if 'objectDescription' in row %} — {{ row.objectDescription.value }} {% endif %}