Split up code
Add depicts guessing
Redirect for random image
Edward Betts 2019-09-16 08:59:53 +01:00
parent 0719f441c7
commit a909b50329
10 changed files with 706 additions and 177 deletions

app.py (232 lines changed)

@@ -1,23 +1,15 @@
 #!/usr/bin/python3
-from flask import Flask, render_template, url_for, redirect, request
-from depicts import utils
-import dateutil.parser
-import urllib.parse
-import requests
+from flask import Flask, render_template, url_for, redirect, request, g
+from depicts import utils, wdqs, commons, mediawiki, painting
 import json
 import os
 import locale
+import random

 locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

-url_start = 'http://www.wikidata.org/entity/Q'
-wikidata_url = 'https://www.wikidata.org/w/api.php'
-commons_url = 'https://www.wikidata.org/w/api.php'
-wikidata_query_api_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
-commons_start = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
 thumbwidth = 300
-thumbheight = 400

 app = Flask(__name__)

@@ -98,68 +90,30 @@ select ?object ?objectLabel ?objectDescription (count(*) as ?count) {
 order by desc(?count)
 '''

-def run_wikidata_query(query):
-    params = {'query': query, 'format': 'json'}
-    r = requests.post(wikidata_query_api_url, data=params, stream=True)
-    assert r.status_code == 200
-    return r
-
-def row_id(row):
-    return int(utils.drop_start(row['item']['value'], url_start))
-
-def api_call(params, api_url=wikidata_url):
-    call_params = {
-        'format': 'json',
-        'formatversion': 2,
-        **params,
-    }
-    r = requests.get(wikidata_url, params=call_params)
-    return r
-
-def get_entity(qid):
-    json_data = api_call({'action': 'wbgetentities', 'ids': qid}).json()
-    try:
-        entity = list(json_data['entities'].values())[0]
-    except KeyError:
-        return
-    if 'missing' not in entity:
-        return entity
-
-def get_entities(ids, **params):
-    if not ids:
-        return []
-    params = {
-        'action': 'wbgetentities',
-        'ids': '|'.join(ids),
-        **params,
-    }
-    r = api_call(params)
-    json_data = r.json()
-    return list(json_data['entities'].values())
+painting_no_depicts_query = '''
+select distinct ?item where {
+  ?item wdt:P31 wd:Q3305213 .
+  ?item wdt:P18 ?image .
+  filter not exists { ?item wdt:P180 ?depicts }
+}
+'''
+
+@app.template_global()
+def set_url_args(**new_args):
+    args = request.view_args.copy()
+    args.update(request.args)
+    args.update(new_args)
+    args = {k: v for k, v in args.items() if v is not None}
+    return url_for(request.endpoint, **args)
+
+@app.before_request
+def init_profile():
+    g.profiling = []

 @app.route("/")
 def index():
     return render_template('index.html', props=find_more_props)

-def run_query_with_cache(q, name):
-    filename = f'cache/{name}.json'
-    if os.path.exists(filename):
-        from_cache = json.load(open(filename))
-        if isinstance(from_cache, dict) and from_cache.get('query') == q:
-            return from_cache['bindings']
-    r = run_wikidata_query(q)
-    bindings = r.json()['results']['bindings']
-    json.dump({'query': q, 'bindings': bindings},
-              open(filename, 'w'), indent=2)
-    return bindings
-
-def get_row_value(row, field):
-    return row[field]['value'] if field in row else None
-
 @app.route("/property/P<int:property_id>")
 def property_query_page(property_id):
     pid = f'P{property_id}'

@@ -167,7 +121,7 @@ def property_query_page(property_id):
     sort_by_name = sort and sort.lower().strip() == 'name'
     q = property_query.replace('PID', pid)

-    rows = run_query_with_cache(q, name=pid)
+    rows = wdqs.run_query_with_cache(q, name=pid)

     no_label_qid = [row['object']['value'].rpartition('/')[2]
                     for row in rows

@@ -198,10 +152,35 @@ def property_query_page(property_id):
                            pid=pid,
                            rows=rows)

+@app.route('/random')
+def random_painting():
+    rows = wdqs.run_query_with_cache(painting_no_depicts_query)
+    row = random.choice(rows)
+    item_id = wdqs.row_id(row)
+    return redirect(url_for('item_page', item_id=item_id))
+
 @app.route("/item/Q<int:item_id>")
 def item_page(item_id):
     qid = f'Q{item_id}'
-    return render_template('item.html', qid=qid)
+    item = painting.Painting(qid)
+
+    width = 800
+    image_filename = item.image_filename
+    filename = f'cache/{qid}_{width}_image.json'
+    if os.path.exists(filename):
+        detail = json.load(open(filename))
+    else:
+        detail = commons.image_detail([image_filename], thumbwidth=width)
+        json.dump(detail, open(filename, 'w'), indent=2)
+
+    hits = item.run_query()
+
+    return render_template('item.html',
+                           qid=qid,
+                           item=item,
+                           image=detail[image_filename],
+                           hits=hits,
+                           title=item.display_title)

 def get_entity_label(entity):
     if 'en' in entity['labels']:

@@ -223,57 +202,18 @@ def get_labels(keys, name=None):
         labels = from_cache['labels']
     if not labels:
         for cur in utils.chunk(keys, 50):
-            labels += get_entities(cur, props='labels')
+            labels += mediawiki.get_entities(cur, props='labels')
         json.dump({'keys': keys, 'labels': labels},
                   open(filename, 'w'), indent=2)

     return {entity['id']: get_entity_label(entity) for entity in labels}

-def get_entity_with_cache(qid):
-    filename = f'cache/{qid}.json'
-    if os.path.exists(filename):
-        entity = json.load(open(filename))
-    else:
-        entity = get_entity(qid)
-        json.dump(entity, open(filename, 'w'), indent=2)
-    return entity
-
-def commons_uri_to_filename(uri):
-    return urllib.parse.unquote(utils.drop_start(uri, commons_start))
-
-def image_detail(filenames, thumbheight=None, thumbwidth=None):
-    if not isinstance(filenames, list):
-        filenames = [filenames]
-    if not filenames:
-        return {}
-    params = {
-        'action': 'query',
-        'titles': '|'.join(f'File:{f}' for f in filenames),
-        'prop': 'imageinfo',
-        'iiprop': 'url',
-    }
-    if thumbheight is not None:
-        params['iiurlheight'] = thumbheight
-    if thumbwidth is not None:
-        params['iiurlwidth'] = thumbwidth
-    r = api_call(params, api_url=commons_url)
-    images = {}
-    for image in r.json()['query']['pages']:
-        filename = utils.drop_start(image['title'], 'File:')
-        images[filename] = image['imageinfo'][0]
-    return images
-
 @app.route("/next/Q<int:item_id>")
 def next_page(item_id):
     qid = f'Q{item_id}'
-    entity = get_entity_with_cache(qid)
+    entity = mediawiki.get_entity_with_cache(qid)

     width = 800
     image_filename = entity['claims']['P18'][0]['mainsnak']['datavalue']['value']

@@ -281,7 +221,7 @@ def next_page(item_id):
     if os.path.exists(filename):
         detail = json.load(open(filename))
     else:
-        detail = image_detail([image_filename], thumbwidth=width)
+        detail = commons.image_detail([image_filename], thumbwidth=width)
         json.dump(detail, open(filename, 'w'), indent=2)

     other_items = set()

@@ -311,8 +251,7 @@ def next_page(item_id):
 @app.route('/P<int:property_id>/Q<int:item_id>')
 def find_more_page(property_id, item_id):
     pid, qid = f'P{property_id}', f'Q{item_id}'
-    return redirect(url_for('browse_page') + f'?{pid}={qid}')
+    return redirect(url_for('browse_page', **{pid: qid}))

 def get_facets(sparql_params, params):
     flat = '_'.join(f'{pid}={qid}' for pid, qid in params)

@@ -323,9 +262,7 @@ def get_facets(sparql_params, params):
     q = (facet_query.replace('PARAMS', sparql_params)
                     .replace('PROPERTY_LIST', property_list))

-    # open(f'cache/{flat}_facets_query.sparql', 'w').write(q)
-
-    bindings = run_query_with_cache(q, flat + '_facets')
+    bindings = wdqs.run_query_with_cache(q, flat + '_facets')

     facets = {key: [] for key in find_more_props.keys()}
     for row in bindings:

@@ -342,21 +279,6 @@ def get_facets(sparql_params, params):
         if values
     }

-def format_time(row_time, row_timeprecision):
-    t = dateutil.parser.parse(row_time['value'])
-    precision = int(row_timeprecision['value'])
-    if precision == 9:
-        return t.year
-    if precision == 8:
-        return f'{t.year}s'
-    if precision == 7:
-        return f'{utils.ordinal((t.year // 100) + 1)} century'
-    if precision == 6:
-        return f'{utils.ordinal((t.year // 1000) + 1)} millennium'
-    return row_time['value']
-
 @app.route('/browse')
 def browse_page():
     params = [(pid, qid) for pid, qid in request.args.items()

@@ -374,49 +296,14 @@ def browse_page():
     sparql_params = ''.join(
         f'?item wdt:{pid} wd:{qid} .\n' for pid, qid in params)

-    query = find_more_query.replace('PARAMS', sparql_params)
-
-    filename = f'cache/{flat}.json'
-    if os.path.exists(filename):
-        bindings = json.load(open(filename))
-    else:
-        r = run_wikidata_query(query)
-        bindings = r.json()['results']['bindings']
-        json.dump(bindings, open(filename, 'w'), indent=2)
+    q = find_more_query.replace('PARAMS', sparql_params)
+    bindings = wdqs.run_query_with_cache(q, flat)

     facets = get_facets(sparql_params, params)

     page_size = 45

-    item_map = {}
-    for row in bindings:
-        item_id = row_id(row)
-        row_qid = f'Q{item_id}'
-        label = row['itemLabel']['value']
-        image_filename = commons_uri_to_filename(row['image']['value'])
-        if item_id in item_map:
-            item = item_map[item_id]
-            item['image_filename'].append(image_filename)
-            continue
-        if label == row_qid:
-            label = get_row_value('title') or 'name missing'
-        artist_name = get_row_value['artistLabel'] or '[artist unknown]'
-        d = format_time(row['time'], row['timeprecision']) if 'time' in row else None
-        item = {
-            'url': url_for('next_page', item_id=item_id),
-            'image_filename': [image_filename],
-            'item_id': item_id,
-            'qid': row_qid,
-            'label': label,
-            'date': d,
-            'artist_name': artist_name,
-        }
-        item_map[item_id] = item
+    item_map = wdqs.build_browse_item_map(bindings)

     items = []
     for item in item_map.values():
         if len(item['image_filename']) != 1:

@@ -432,14 +319,13 @@ def browse_page():
     if os.path.exists(filename):
         detail = json.load(open(filename))
     else:
-        detail = image_detail(filenames, thumbwidth=thumbwidth)
+        detail = commons.image_detail(filenames, thumbwidth=thumbwidth)
         json.dump(detail, open(filename, 'w'), indent=2)

     for item in items:
+        item['url'] = url_for('item_page', item_id=item['item_id'])
         item['image'] = detail[item['image_filename']]

-    total = len(bindings)
     title = ' / '.join(item_labels[qid] for pid, qid in params)

     return render_template('find_more.html',

@@ -448,8 +334,8 @@ def browse_page():
                            label=title,
                            labels=find_more_props,
                            bindings=bindings,
-                           items=items,
-                           total=total)
+                           total=len(bindings),
+                           items=items)

 if __name__ == "__main__":
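
A rough usage sketch of the new set_url_args template global, not taken from the diff: the property/item pair below is arbitrary, and it assumes app.py has been imported so that app and set_url_args are in scope. The helper rebuilds the current URL with the extra arguments applied and drops any argument set to None.

    with app.test_request_context('/browse?P170=Q5582'):   # Q5582 is only an illustrative value
        set_url_args(sort='name')   # -> '/browse?P170=Q5582&sort=name'
        set_url_args(P170=None)     # -> '/browse' (None values are dropped)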

depicts/category.py (new file, 110 lines)

@@ -0,0 +1,110 @@
from . import utils
import re
import calendar
month_pattern = '|'.join(m for m in calendar.month_name if m)
re_date_based = re.compile(r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) ')
ns_cat = 'Category:'
class Category:
def __init__(self, title, site):
if title.startswith(ns_cat):
title = title[len(ns_cat):]
self.title = title
self.site = site
self.item = None
def __repr__(self):
return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'
def set_item(self, item):
self.item = item
@property
def url(self):
return utils.wiki_url(self.title, self.site, ns='Category')
def date_based(self):
return bool(re_date_based.match(self.title))
def contains_artist_name(self):
if not self.item:
return
return any(artist.lower() in self.title.lower()
for artist in self.item.artist_labels())
def parents(self):
if not self.item:
return []
return self.item.parent_categories[self.site].get(self.title, [])
def is_exhibition(self):
return any(parent.title.startswith('Art exhibitions ')
for parent in self.parents())
def names_for_wikidata(self):
highlight = self.check()
interesting = len(highlight) > 1
if not interesting:
if self.date_based() or self.contains_artist_name() or self.is_exhibition():
return []
return utils.also_singular(self.title)
for significant, text in highlight:
if not significant:
continue
title = text.strip()
title = title[0].upper() + title[1:]
for sep in ' with ', ' at ', ' wearing ':
if sep in title:
before, _, after = title.partition(sep)
names = []
for x in title, before, after:
names += utils.also_singular(x)
return names
return utils.also_singular(title)
def urls_for_wikidata(self):
return [utils.wiki_url(name, self.site, ns='Category')
for name in self.names_for_wikidata()]
def check(self):
cat = self.title
lc_cat = cat.lower()
by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
'medium', 'year', 'painter']
if self.item:
by_endings += self.item.artist_labels()
for after in ('in art', 'in portrait paintings', 'in landscape paintings', 'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2', 'looking at viewer'):
pos = lc_cat.find(after)
# don't highlight "1512 in art"
if pos == -1 or cat[:pos - 1].isdigit():
continue
return [(True, cat[:pos]), (False, cat[pos:])]
for before in ('paintings of', 'portraits of', 'landscapes of',
'portraits with', 'paintings with', 'paintings depicting',
'portraits depicting', 'landscapes depicting', 'works about'):
pos = lc_cat.find(before)
if pos == -1:
continue
pos += len(before)
for by_ending in by_endings:
ending = ' by ' + by_ending
if lc_cat.endswith(ending):
return [(False, cat[:pos]),
(True, cat[pos:-len(ending)]),
(False, cat[-len(ending):])]
return [(False, cat[:pos]), (True, cat[pos:])]
pos = lc_cat.find('of ')
if pos != -1:
return [(True, cat[:pos]), (False, cat[pos:])]
return [(False, cat)]
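
A quick sketch of how the category highlighting above is meant to be used; the category name is only an illustration, and the exact names_for_wikidata() output depends on the inflect library's singularisation.

    from depicts.category import Category

    cat = Category('Category:Paintings of dogs', 'commons')
    cat.check()               # -> [(False, 'Paintings of'), (True, ' dogs')]
    cat.names_for_wikidata()  # singular and plural keywords, e.g. ['Dogs', 'Dog']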

depicts/commons.py (new file, 31 lines)

@@ -0,0 +1,31 @@
from . import mediawiki, utils
commons_url = 'https://www.wikidata.org/w/api.php'
def image_detail(filenames, thumbheight=None, thumbwidth=None):
if not isinstance(filenames, list):
filenames = [filenames]
if not filenames:
return {}
params = {
'action': 'query',
'titles': '|'.join(f'File:{f}' for f in filenames),
'prop': 'imageinfo',
'iiprop': 'url',
}
if thumbheight is not None:
params['iiurlheight'] = thumbheight
if thumbwidth is not None:
params['iiurlwidth'] = thumbwidth
r = mediawiki.api_call(params, api_url=commons_url)
images = {}
for image in r.json()['query']['pages']:
filename = utils.drop_start(image['title'], 'File:')
images[filename] = image['imageinfo'][0]
return images
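
A minimal usage sketch for image_detail; the filename is hypothetical, and thumburl is read from the imageinfo block of the API response (this is the field item.html renders).

    from depicts import commons

    detail = commons.image_detail(['Example painting.jpg'], thumbwidth=300)
    thumb = detail['Example painting.jpg']['thumburl']   # URL of the scaled thumbnail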

depicts/painting.py (new file, 306 lines)

@@ -0,0 +1,306 @@
from . import utils, wdqs, mediawiki
import nltk
import re
re_from_article = re.compile(r'(?:portrays|depicts|depictions of|it shows) (.+?)\.', re.I)
ignore_for_depicts = {
43445, # female organism - use: female (Q6581072)
44148, # male organism - use: male (Q6581097)
21075684, # children - use: child (Q7569)
180788, # National Gallery
780294, # human physical appearance
2472587, # people
33659, # People
}
query = '''
select distinct ?item ?itemLabel ?commonscat ?cat_url ?sitelink
where {
service wikibase:label { bd:serviceParam wikibase:language "en" }
filter (?item != wd:QID)
{
VALUES (?commonscat) { COMMONS_CAT }
?item wdt:P373 ?commonscat .
filter not exists { ?item wdt:P31 wd:Q4167836 } # Wikimedia category
filter not exists { ?item wdt:P31 wd:Q4167410 } # Wikimedia disambiguation page
filter not exists { ?item wdt:P31 wd:Q24046192 } # Wikimedia category of stubs
filter not exists { ?item wdt:P31 wd:Q4167836 } # Wikimedia list article
filter not exists { ?item wdt:P31 wd:Q4663903 } # Wikimedia portal
} union {
VALUES (?commonscat) { COMMONS_CAT }
?cat_item wdt:P373 ?commonscat .
?cat_item wdt:P301 ?item .
} union {
VALUES (?cat_url) { CAT_URL }
?cat_url schema:about ?cat_item .
?cat_item wdt:P301 ?item .
} union {
VALUES (?sitelink) { SITELINK }
?sitelink schema:about ?item .
filter not exists { ?item wdt:P31 wd:Q4167410 }
}
}'''
class QueryResultRow:
def __init__(self, row):
self.row = {k: (v if k.startswith('item') else [v]) for k, v in row.items()}
self.item_id = wdqs.row_id(row)
self.label = wdqs.get_row_value(row, 'itemLabel')
def update(self, row):
for key, value in row.items():
if key.startswith('item'):
continue
self.row.setdefault(key, []).append(value)
@property
def url(self):
return self.row['item']['value']
@property
def qid(self):
return f'Q{self.item_id}'
def sources(self):
return {k: v for k, v in self.row.items() if not k.startswith('item')}
def sources_list(self):
def get_value(i):
if i['type'] != 'uri':
return i['value']
wiki_start = i['value'].rfind('/wiki/')
return i['value'][wiki_start + 6:]
return [(k, [get_value(i) for i in v])
for k, v in self.row.items()
if not k.startswith('item')]
class Painting:
def __init__(self, qid):
self.entity = mediawiki.get_entity_with_cache(qid)
self.item_id = int(qid[1:])
if self.enwiki:
content, cats = mediawiki.get_content_and_categories(self.enwiki, 'enwiki')
self.enwiki_content = content
self.enwiki_categories = mediawiki.process_cats(cats, 'enwiki')
for cat in self.enwiki_categories:
cat.set_item(self)
else:
self.enwiki_content = None
self.enwiki_categories = None
sites = ['commons', 'enwiki']
self.parent_categories = {site: {} for site in sites}
self.categories = self.get_categories()
@property
def image_filename(self):
return self.entity['claims']['P18'][0]['mainsnak']['datavalue']['value']
@property
def display_title(self):
if 'en' not in self.entity['labels']:
return self.qid
return f'{self.en_title} ({self.qid})'
@property
def url(self):
return 'https://www.wikidata.org/wiki/' + self.qid
def get_artist_entities(self):
self.artist_entities = []
for artist in self.artists_claim:
artist_qid = artist['id']
self.artist_entities.append(mediawiki.get_entity(artist_qid))
def artist_labels(self):
if not hasattr(self, 'artist_entities'):
self.get_artist_entities()
return [artist['labels']['en']['value'] for artist in self.artist_entities]
@property
def commons_cats(self):
return [i['mainsnak']['datavalue']['value']
for i in self.entity['claims'].get('P373', [])]
@property
def commons_sitelink(self):
return self.sitelinks['commons']['value'] if 'commons' in self.sitelinks else None
@property
def en_title(self):
if 'en' in self.entity['labels']:
return self.entity['labels']['en']['value']
else:
return self.qid
@property
def artists_claim(self):
return [image['mainsnak']['datavalue']['value']
for image in self.entity['claims'].get('P170', [])]
@property
def artists(self):
if not hasattr(self, 'artist_entities'):
self.get_artist_entities()
items = [image['mainsnak']['datavalue']['value']
for image in self.entity['claims'].get('P170', [])]
lookup = {artist['id']: artist['labels'] for artist in self.artist_entities}
for item in items:
item['labels'] = lookup[item['id']]
return items
@property
def qid(self):
return f'Q{self.item_id}'
@property
def commons_filenames(self):
return [image['mainsnak']['datavalue']['value']
for image in self.entity['claims'].get('P18', [])]
def commons_cat_from_sitelink(self):
ns = 'Category:'
if not self.commons_sitelink or not self.commons_sitelink.startswith(ns):
return
return self.commons_sitelink[len(ns):]
@property
def enwiki_url(self):
enwiki = self.enwiki
if not enwiki:
return
return 'https://en.wikipedia.org/wiki/' + enwiki.replace(' ', '_')
@property
def sitelinks(self):
return self.entity['sitelinks']
@property
def claims(self):
return self.entity['claims']
@property
def enwiki(self):
return self.sitelinks['enwiki']['title'] if 'enwiki' in self.sitelinks else None
def get_categories(self):
titles = {'File:' + filename for filename in self.commons_filenames}
for commons_cat in self.commons_cats:
titles.add('Category:' + commons_cat)
if self.commons_sitelink:
titles.add(self.commons_sitelink)
if not titles:
return []
cat_list = mediawiki.get_categories(titles, 'commons')
for title, cats in cat_list:
for cat in cats:
cat.set_item(self)
if not title.startswith('Category:'):
continue
self.parent_categories['commons'][utils.drop_category_ns(title)] = cats
get_more_cats = []
for _, cats in self.parent_categories['commons'].items():
for cat in cats:
if cat.title not in self.parent_categories:
get_more_cats.append('Category:' + cat.title)
for title, cats in mediawiki.get_categories(get_more_cats, 'commons'):
for cat in cats:
cat.set_item(self)
self.parent_categories['commons'][utils.drop_category_ns(title)] = cats
if self.enwiki:
cat_list.append((self.enwiki, self.enwiki_categories))
get_more_cats = ['Category:' + cat.title for cat in self.enwiki_categories]
for title, cats in mediawiki.get_categories(get_more_cats, 'enwiki'):
self.parent_categories['enwiki'][utils.drop_category_ns(title)] = cats
return cat_list
def depicts_from_enwiki_content(self):
if not self.enwiki_url:
return
for par in self.enwiki_content.split('\n\n'):
m = re_from_article.search(par)
if m:
return m.group(1)
def query_variables(self):
commons_cat = []
cat_url = []
keywords = []
for _, categories in self.categories:
for cat in categories:
names = cat.names_for_wikidata()
keywords += names
if cat.site == 'commons':
commons_cat += names
cat_url += cat.urls_for_wikidata()
text = self.depicts_from_enwiki_content()
if text:
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
if not utils.word_contains_letter(word):
continue
if not pos.startswith('NN'):
continue
word = word.strip('|')
for k in word.strip('|').split('|'):
if utils.word_contains_letter(k):
keywords += utils.also_singular(k)
keywords = [k for k in keywords if utils.word_contains_letter(k)]
return {
'commons_cat': commons_cat,
'cat_url': cat_url,
'keywords': keywords,
}
def build_query(self):
query_vars = self.query_variables()
sitelinks = [utils.wiki_url(title, 'enwiki') for title in query_vars['keywords']]
sitelinks = [url for url in sitelinks if url]
q = query.replace('COMMONS_CAT', wdqs.quote_list(query_vars['commons_cat']))
q = q.replace('CAT_URL', wdqs.url_list(query_vars['cat_url']))
q = q.replace('QID', self.qid)
q = q.replace('SITELINK', wdqs.url_list(sitelinks))
return q
def run_query(self):
query = self.build_query()
rows = wdqs.run_query_with_cache(query)
by_id = {}
results = []
for row in rows:
item_id = wdqs.row_id(row)
if item_id in ignore_for_depicts:
continue
if item_id in by_id:
by_id[item_id].update(row)
continue
hit = QueryResultRow(row)
by_id[item_id] = hit
results.append(hit)
return sorted(results, key=lambda hit: hit.item_id)
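
A rough sketch of how Painting is driven (item_page in app.py does the same); the QID is only an example, and running it needs network access, a cache/ directory and the nltk tokenizer and tagger data.

    from depicts.painting import Painting

    item = Painting('Q12418')           # illustrative QID
    print(item.display_title, item.image_filename)
    for hit in item.run_query():        # candidate depicts items, sorted by item id
        print(hit.qid, hit.label, hit.sources_list())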

depicts/utils.py

@@ -1,4 +1,18 @@
 from itertools import islice
+import urllib.parse
+import inflect
+
+hosts = {
+    'commons': 'commons.wikimedia.org',
+    'enwiki': 'en.wikipedia.org',
+    'wikidata': 'www.wikidata.org',
+}
+
+engine = inflect.engine()
+
+skip_names = {
+    'National Gallery'
+}

 def ordinal(n):
     return "%d%s" % (n, 'tsnrhtdd'[(n / 10 % 10 != 1) * (n % 10 < 4) * n % 10::4])

@@ -10,3 +24,50 @@ def chunk(it, size):

 def drop_start(s, start):
     assert s.startswith(start)
     return s[len(start):]
+
+def drop_category_ns(s):
+    return drop_start(s, 'Category:')
+
+def word_contains_letter(word):
+    return any(c.isalpha() for c in word)
+
+def also_singular(name):
+    names = also_singular_main(name)
+    extra = []
+    for n in names:
+        words = set(n.lower().split())
+        for word in 'girl', 'boy':
+            if word in words:
+                extra.append(word)
+        if {'female', 'females', 'women'} & words:
+            extra.append('woman')
+        if {'male', 'males', 'men'} & words:
+            extra.append('man')
+    return [n for n in names + extra if n not in skip_names]
+
+def also_singular_main(name):
+    '''
+    given a singular name return a list of both the plural and singular versions
+    just return the name if it isn't singular
+    '''
+    singular = engine.singular_noun(name.strip('|'))
+    if not singular:
+        return [name]
+    n, s = name.lower(), singular.lower()
+    if (n == s or
+            n.replace('paintings', '') == s.replace('painting', '') or
+            n == 'venus' and s == 'venu'):
+        return [name]
+    return [name, singular]
+
+def wiki_url(title, site, ns=None):
+    host = hosts[site]
+    url_ns = ns + ':' if ns else ''
+    if not title:
+        return
+    if title[0].islower():
+        title = title[0].upper() + title[1:]
+    return f'https://{host}/wiki/' + url_ns + urllib.parse.quote(title.replace(' ', '_'))
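
A few illustrative calls into the new helpers, based only on the definitions above.

    from depicts import utils

    utils.drop_category_ns('Category:Dogs in art')    # -> 'Dogs in art'
    utils.word_contains_letter('1887')                # -> False
    utils.wiki_url('dogs in art', 'commons', ns='Category')
    # -> 'https://commons.wikimedia.org/wiki/Category:Dogs_in_art'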

depicts/wdqs.py (new file, 100 lines)

@@ -0,0 +1,100 @@
import requests
import json
import urllib.parse
import os
import dateutil.parser
import hashlib
from . import utils
query_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
url_start = 'http://www.wikidata.org/entity/Q'
commons_start = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
def row_id(row):
return int(utils.drop_start(row['item']['value'], url_start))
def get_row_value(row, field):
return row[field]['value'] if field in row else None
def commons_uri_to_filename(uri):
return urllib.parse.unquote(utils.drop_start(uri, commons_start))
def run_query(query):
params = {'query': query, 'format': 'json'}
r = requests.post(query_url, data=params, stream=True)
assert r.status_code == 200
return r
def md5_query(query):
''' generate the md5 hexdigest of a SPARQL query '''
return hashlib.md5(query.encode('utf-8')).hexdigest()
def run_query_with_cache(q, name=None):
if name is None:
name = md5_query(q)
filename = f'cache/{name}.json'
if os.path.exists(filename):
from_cache = json.load(open(filename))
if isinstance(from_cache, dict) and from_cache.get('query') == q:
return from_cache['bindings']
r = run_query(q)
bindings = r.json()['results']['bindings']
json.dump({'query': q, 'bindings': bindings},
open(filename, 'w'), indent=2)
return bindings
def format_time(row_time, row_timeprecision):
t = dateutil.parser.parse(row_time['value'])
precision = int(row_timeprecision['value'])
if precision == 9:
return t.year
if precision == 8:
return f'{t.year}s'
if precision == 7:
return f'{utils.ordinal((t.year // 100) + 1)} century'
if precision == 6:
return f'{utils.ordinal((t.year // 1000) + 1)} millennium'
return row_time['value']
def build_browse_item_map(bindings):
item_map = {}
for row in bindings:
item_id = row_id(row)
row_qid = f'Q{item_id}'
label = row['itemLabel']['value']
image_filename = commons_uri_to_filename(row['image']['value'])
if item_id in item_map:
item = item_map[item_id]
item['image_filename'].append(image_filename)
continue
if label == row_qid:
label = get_row_value(row, 'title') or 'name missing'
artist_name = get_row_value(row, 'artistLabel') or '[artist unknown]'
d = format_time(row['time'], row['timeprecision']) if 'time' in row else None
item = {
'image_filename': [image_filename],
'item_id': item_id,
'qid': row_qid,
'label': label,
'date': d,
'artist_name': artist_name,
}
item_map[item_id] = item
return item_map
def quote_list(l):
no_dups = list(dict.fromkeys(l)) # remove duplicates
return ' '.join('("' + s.replace('"', '\\"') + '")' for s in no_dups)
def url_list(l):
no_dups = list(dict.fromkeys(l)) # remove duplicates
return ' '.join(f'(<{s}>)' for s in no_dups)
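
A short usage sketch for the new module; it assumes a cache/ directory exists, since run_query_with_cache stores results there keyed by the md5 of the query, and the example query is arbitrary.

    from depicts import wdqs

    q = 'select distinct ?item where { ?item wdt:P31 wd:Q3305213 } limit 5'
    for row in wdqs.run_query_with_cache(q):
        print(wdqs.row_id(row))                        # numeric Wikidata item id

    wdqs.quote_list(['Sunflowers', 'Sunflowers'])      # -> '("Sunflowers")' (duplicates removed)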


@@ -24,7 +24,7 @@
 {% for key, values in facets.items() %}
   <p>{{ prop_labels[key] }}:
   {% for v in values %}
-    <a href="?{{ request.query_string.decode('utf-8') }}&{{key}}={{v.qid}}">{{ v.label }}</a> ({{ v.count }})
+    <a href="{{ set_url_args(**{key: v.qid}) }}">{{ v.label }}</a> ({{ v.count }})
     {% if not loop.last %}|{% endif %}
   {% endfor %}
   </p>

templates/index.html

@@ -4,6 +4,7 @@
 {% block content %}
   <div class="m-3">
+    <p><a href="{{ url_for('random_painting') }}">random painting</a></p>
     <ul>
     {% for pid, label in props.items() %}
       <li><a href="{{ url_for('property_query_page', property_id=pid[1:]) }}">{{ label }}</a>

templates/item.html (new file, 34 lines)

@@ -0,0 +1,34 @@
{% extends "base.html" %}
{% block title %}{{ label }} ({{qid }}){% endblock %}
{% block content %}
<div class="m-3">
<h1>{{ self.title() }}</h1>
<div class="row">
<div class="col">
<img src="{{ image.thumburl }}">
</div>
<div class="col">
<p><a href="https://www.wikidata.org/wiki/{{ qid }}">view on Wikidata</a></p>
<p><a href="{{ url_for('random_painting') }}">random painting</a></p>
{% for hit in hits %}
<p>
url: {{ hit.url }}<br>
label: {{ hit.label }}<br>
qid: {{ hit.qid }}<br>
sources: {{ hit.sources() }}<br>
</p>
{% endfor %}
</div>
</div>
<pre>{{ item.query_variables() | pprint }}</pre>
<pre>{{ item.build_query() }}</pre>
</div>
{% endblock %}


@@ -10,9 +10,9 @@
 <p>Sort order:
 {% if order == 'name' %}
-  <b>name</b> or <a href="?sort=count">count</a>
+  <b>name</b> or <a href="{{ set_url_args(sort='count') }}">count</a>
 {% else %}
-  <a href="?sort=name">name</a> or <b>count</b>
+  <a href="{{ set_url_args(sort='name') }}">name</a> or <b>count</b>
 {% endif %}
 </p>

@@ -21,7 +21,7 @@
 {% set qid = row.object.value.rpartition('/')[2] %}
 {% set row_label = row.objectLabel.value if 'objectLabel' in row else '[ label missing ]' %}
 <li>
-  <a href="{{ url_for('browse_page') }}?{{ pid }}={{ qid }}">{{ row_label }}</a>
+  <a href="{{ url_for('browse_page', **{pid: qid}) }}">{{ row_label }}</a>
   {% if 'objectDescription' in row %}
     &mdash; {{ row.objectDescription.value }}
   {% endif %}