depicts/depicts/artwork.py
2019-12-18 18:52:26 +00:00

309 lines
9.9 KiB
Python

from . import utils, wdqs, mediawiki
# import nltk
import re
# Captures what an article says an artwork shows: the text between a
# "portrays" / "depicts" / "depictions of" / "it shows" lead-in and the next
# full stop. Matching is case-insensitive and the capture is non-greedy, so
# it stops at the first full stop after the lead-in.
re_from_article = re.compile(
    r'(?:portrays|depicts|depictions of|it shows) (.+?)\.',
    re.IGNORECASE,
)
# Numeric Wikidata item ids (the QID without its leading "Q") that are never
# offered as depicts suggestions. The trailing note on each entry names the
# item and, where one exists, the item that should be used instead.
ignore_for_depicts = {
    43445,     # female organism - use: female (Q6581072)
    44148,     # male organism - use: male (Q6581097)
    21075684,  # children - use: child (Q7569)
    180788,    # National Gallery
    780294,    # human physical appearance
    2472587,   # people
    33659,     # People
}
# SPARQL template used to find candidate "depicts" items for an artwork.
#
# The upper-case placeholders are substituted textually before the query is
# run:
#   QID         - the artwork's own item, excluded from the results
#   COMMONS_CAT - values for ?commonscat (Commons category names, P373)
#   CAT_URL     - values for ?cat_url (category page URLs)
#   SITELINK    - values for ?sitelink (article URLs)
#
# Each union branch binds ?item through exactly one of those variables, so a
# result row records which source produced the match.
query = '''
select distinct ?item ?itemLabel ?commonscat ?cat_url ?sitelink
where {
service wikibase:label { bd:serviceParam wikibase:language "en" }
filter (?item != wd:QID)
{
VALUES (?commonscat) { COMMONS_CAT }
?item wdt:P373 ?commonscat .
filter not exists { ?item wdt:P31 wd:Q4167836 } # Wikimedia category
filter not exists { ?item wdt:P31 wd:Q4167410 } # Wikimedia disambiguation page
filter not exists { ?item wdt:P31 wd:Q24046192 } # Wikimedia category of stubs
filter not exists { ?item wdt:P31 wd:Q13406463 } # Wikimedia list article
filter not exists { ?item wdt:P31 wd:Q4663903 } # Wikimedia portal
} union {
VALUES (?commonscat) { COMMONS_CAT }
?cat_item wdt:P373 ?commonscat .
?cat_item wdt:P301 ?item .
} union {
VALUES (?cat_url) { CAT_URL }
?cat_url schema:about ?cat_item .
?cat_item wdt:P301 ?item .
} union {
VALUES (?sitelink) { SITELINK }
?sitelink schema:about ?item .
filter not exists { ?item wdt:P31 wd:Q4167410 }
}
}'''
class QueryResultRow:
    """One item returned by the depicts query, merged across result rows.

    The same item can come back in several rows, each naming a different
    source (any non-``item*`` column). ``self.row`` keeps the ``item*``
    bindings from the first row as-is and collects every other column into
    a list of bindings, one per row that supplied it.
    """

    def __init__(self, row):
        """Create the result from the first row seen for an item.

        ``row`` maps column names to binding dicts; this class reads their
        ``'type'`` and ``'value'`` keys.
        """
        self.row = {
            key: (binding if key.startswith('item') else [binding])
            for key, binding in row.items()
        }
        self.item_id = wdqs.row_id(row)
        self.label = wdqs.get_row_value(row, 'itemLabel')

    def update(self, row):
        """Fold the source columns of another row for the same item in."""
        for key, binding in row.items():
            if not key.startswith('item'):
                self.row.setdefault(key, []).append(binding)

    @property
    def url(self):
        """The value of the ``item`` binding (the item's URI)."""
        return self.row['item']['value']

    @property
    def qid(self):
        """The item id prefixed with ``Q``."""
        return f'Q{self.item_id}'

    def sources(self):
        """Return the non-``item*`` columns as ``{column: [binding, ...]}``."""
        return {
            key: bindings
            for key, bindings in self.row.items()
            if not key.startswith('item')
        }

    def sources_list(self):
        """Return ``[(column, [display value, ...]), ...]`` for the sources.

        Non-URI bindings are shown verbatim. URI bindings are reduced to the
        text after their last ``/wiki/``.
        """
        def get_value(binding):
            value = binding['value']
            if binding['type'] != 'uri':
                return value
            # rpartition returns the whole string as the tail when the
            # separator is missing, so a URI without '/wiki/' is kept intact
            # instead of losing its first characters.
            return value.rpartition('/wiki/')[2]

        return [
            (key, [get_value(binding) for binding in bindings])
            for key, bindings in self.sources().items()
        ]
class Artwork:
    """A Wikidata artwork item and the category data used to suggest depicts.

    Construction fetches the entity and, when the item has an English
    Wikipedia article, that article's content and categories. ``run_query``
    then looks up candidate items from those categories and from keywords
    found in the article text.
    """

    def __init__(self, qid):
        """Load the entity for ``qid`` (e.g. ``'Q42'``) and its categories."""
        self.entity = mediawiki.get_entity_with_cache(qid)
        self.item_id = int(qid[1:])

        if self.enwiki:
            content, cats = mediawiki.get_content_and_categories(self.enwiki, 'enwiki')
            self.enwiki_content = content
            self.enwiki_categories = mediawiki.process_cats(cats, 'enwiki')
            for cat in self.enwiki_categories:
                cat.set_item(self)
        else:
            self.enwiki_content = None
            self.enwiki_categories = None

        # parent_categories must exist before get_categories() fills it in.
        sites = ['commons', 'enwiki']
        self.parent_categories = {site: {} for site in sites}
        self.categories = self.get_categories()

    def _claim_values(self, property_id):
        """Return the main-snak values of every ``property_id`` claim.

        Claims whose main snak has no ``datavalue`` ("unknown value" or
        "no value" statements) are skipped rather than raising KeyError.
        """
        values = []
        for claim in self.entity['claims'].get(property_id, []):
            datavalue = claim['mainsnak'].get('datavalue')
            if datavalue is not None:
                values.append(datavalue['value'])
        return values

    @property
    def image_filename(self):
        """First image (P18) filename, or None when the item has none."""
        filenames = self._claim_values('P18')
        return filenames[0] if filenames else None

    @property
    def display_title(self):
        """``'<English label> (<QID>)'``, or just the QID without a label."""
        if 'en' not in self.entity['labels']:
            return self.qid
        return f'{self.en_title} ({self.qid})'

    @property
    def url(self):
        """URL of the item's page on wikidata.org."""
        return 'https://www.wikidata.org/wiki/' + self.qid

    def get_artist_entities(self):
        """Fetch and store the entity of every creator (P170) on the item."""
        self.artist_entities = [
            mediawiki.get_entity(artist['id']) for artist in self.artists_claim
        ]

    def artist_labels(self):
        """Return one label per artist, fetching the artists on first use.

        The English label is used when present; otherwise the artist's id,
        mirroring the fallback ``en_title`` applies to the artwork itself.
        """
        if not hasattr(self, 'artist_entities'):
            self.get_artist_entities()
        labels = []
        for artist in self.artist_entities:
            english = artist['labels'].get('en')
            labels.append(english['value'] if english else artist['id'])
        return labels

    @property
    def commons_cats(self):
        """Commons category names from the item's P373 claims."""
        return self._claim_values('P373')

    @property
    def commons_sitelink(self):
        """Title of the item's Commons sitelink, or None.

        Sitelinks are keyed by site id and carry a ``title`` (the same shape
        the ``enwiki`` property reads), so ``commonswiki`` is checked first.
        The ``commons``/``value`` lookup this property used before is kept
        as a fallback.
        """
        sitelinks = self.sitelinks
        if 'commonswiki' in sitelinks:
            return sitelinks['commonswiki']['title']
        if 'commons' in sitelinks:
            return sitelinks['commons']['value']
        return None

    @property
    def en_title(self):
        """English label of the item, falling back to its QID."""
        labels = self.entity['labels']
        if 'en' in labels:
            return labels['en']['value']
        return self.qid

    @property
    def artists_claim(self):
        """Values of the item's creator (P170) claims that have a value."""
        return self._claim_values('P170')

    @property
    def artists(self):
        """Creator claim values, each annotated with the artist's labels.

        The ``'labels'`` key is written onto the claim value dicts held in
        ``self.entity``, as the original implementation did.
        """
        if not hasattr(self, 'artist_entities'):
            self.get_artist_entities()
        items = self.artists_claim
        lookup = {artist['id']: artist['labels'] for artist in self.artist_entities}
        for item in items:
            item['labels'] = lookup[item['id']]
        return items

    @property
    def qid(self):
        """The item id prefixed with ``Q``."""
        return f'Q{self.item_id}'

    @property
    def commons_filenames(self):
        """Filenames of every image (P18) claim on the item."""
        return self._claim_values('P18')

    def commons_cat_from_sitelink(self):
        """Category name from the Commons sitelink, without its namespace.

        Returns None when there is no Commons sitelink or it does not point
        at a category page.
        """
        ns = 'Category:'
        sitelink = self.commons_sitelink
        if not sitelink or not sitelink.startswith(ns):
            return None
        return sitelink[len(ns):]

    @property
    def enwiki_url(self):
        """URL of the English Wikipedia article, or None without one."""
        enwiki = self.enwiki
        if not enwiki:
            return None
        return 'https://en.wikipedia.org/wiki/' + enwiki.replace(' ', '_')

    @property
    def sitelinks(self):
        """The entity's ``sitelinks`` mapping."""
        return self.entity['sitelinks']

    @property
    def claims(self):
        """The entity's ``claims`` mapping."""
        return self.entity['claims']

    @property
    def enwiki(self):
        """Title of the English Wikipedia article, or None without one."""
        sitelinks = self.sitelinks
        return sitelinks['enwiki']['title'] if 'enwiki' in sitelinks else None

    def get_categories(self):
        """Collect ``(title, categories)`` pairs and fill parent_categories.

        Returns an empty list when the item has no Commons files, Commons
        categories or Commons sitelink. Otherwise the list currently holds
        only the English Wikipedia article's categories, because the Commons
        lookup is disabled below.
        """
        titles = {'File:' + filename for filename in self.commons_filenames}
        for commons_cat in self.commons_cats:
            titles.add('Category:' + commons_cat)
        if self.commons_sitelink:
            titles.add(self.commons_sitelink)
        if not titles:
            # NOTE(review): this also skips the enwiki categories for items
            # that have an article but no Commons data - confirm intended.
            return []

        # Commons lookup disabled; the loops below stay in place so it can
        # be re-enabled by restoring this call.
        # cat_list = mediawiki.get_categories(titles, 'commons')
        cat_list = []

        for title, cats in cat_list:
            for cat in cats:
                cat.set_item(self)
            if not title.startswith('Category:'):
                continue
            self.parent_categories['commons'][utils.drop_category_ns(title)] = cats

        commons_parents = self.parent_categories['commons']
        get_more_cats = []
        for cats in commons_parents.values():
            for cat in cats:
                # Compare against the Commons titles already collected; the
                # outer parent_categories dict is keyed by site name.
                if cat.title not in commons_parents:
                    get_more_cats.append('Category:' + cat.title)

        for title, cats in mediawiki.get_categories(get_more_cats, 'commons'):
            for cat in cats:
                cat.set_item(self)
            commons_parents[utils.drop_category_ns(title)] = cats

        if self.enwiki:
            cat_list.append((self.enwiki, self.enwiki_categories))
            get_more_cats = ['Category:' + cat.title for cat in self.enwiki_categories]
            for title, cats in mediawiki.get_categories(get_more_cats, 'enwiki'):
                self.parent_categories['enwiki'][utils.drop_category_ns(title)] = cats

        return cat_list

    def depicts_from_enwiki_content(self):
        """Return the first "depicts ..." phrase in the article, or None.

        The article text is split into paragraphs on blank lines and each is
        searched with ``re_from_article``.
        """
        # enwiki_content is None whenever there is no article; checking it
        # directly also covers an article whose content could not be loaded.
        if not self.enwiki_content:
            return None
        for par in self.enwiki_content.split('\n\n'):
            m = re_from_article.search(par)
            if m:
                return m.group(1)
        return None

    def query_variables(self):
        """Build the value lists substituted into the SPARQL template.

        Returns a dict with ``commons_cat``, ``cat_url`` and ``keywords``
        lists. Keywords come from category names plus the nouns found in
        the article's "depicts ..." phrase.
        """
        commons_cat = []
        cat_url = []
        keywords = []
        for _, categories in self.categories:
            for cat in categories:
                names = cat.names_for_wikidata()
                keywords += names
                if cat.site == 'commons':
                    commons_cat += names
                # Category URLs are collected for every site, not only
                # Commons, so enwiki categories feed the CAT_URL lookup.
                cat_url += cat.urls_for_wikidata()

        text = self.depicts_from_enwiki_content()
        if text:
            # The module-level nltk import is commented out, so import it
            # here, only on the path that needs it.
            import nltk

            for sentence in nltk.sent_tokenize(text):
                for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
                    if not utils.word_contains_letter(word):
                        continue
                    if not pos.startswith('NN'):  # keep nouns only
                        continue
                    # A token may be several words joined by wiki-link
                    # pipes; treat each piece as its own keyword.
                    for part in word.strip('|').split('|'):
                        if utils.word_contains_letter(part):
                            keywords += utils.also_singular(part)

        keywords = [k for k in keywords if utils.word_contains_letter(k)]

        return {
            'commons_cat': commons_cat,
            'cat_url': cat_url,
            'keywords': keywords,
        }

    def build_query(self):
        """Return the SPARQL query text for this artwork."""
        query_vars = self.query_variables()
        sitelinks = [utils.wiki_url(title, 'enwiki') for title in query_vars['keywords']]
        sitelinks = [url for url in sitelinks if url]

        substitutions = {
            'COMMONS_CAT': wdqs.quote_list(query_vars['commons_cat']),
            'CAT_URL': wdqs.url_list(query_vars['cat_url']),
            'QID': self.qid,
            'SITELINK': wdqs.url_list(sitelinks),
        }
        # Substitute every placeholder in a single pass so that inserted
        # text (for example a category name containing "QID") is never
        # mistaken for a placeholder by a later replacement.
        placeholder = '|'.join(re.escape(name) for name in substitutions)
        return re.sub(placeholder, lambda m: substitutions[m.group(0)], query)

    def run_query(self):
        """Run the query and return the hits sorted by item id.

        Rows for ignored items are dropped; rows for the same item are
        merged into one ``QueryResultRow``.
        """
        rows = wdqs.run_query_with_cache(self.build_query())
        by_id = {}
        for row in rows:
            item_id = wdqs.row_id(row)
            if item_id in ignore_for_depicts:
                continue
            if item_id in by_id:
                by_id[item_id].update(row)
            else:
                by_id[item_id] = QueryResultRow(row)
        return sorted(by_id.values(), key=lambda hit: hit.item_id)