309 lines
9.9 KiB
Python
309 lines
9.9 KiB
Python
from . import utils, wdqs, mediawiki
|
|
import nltk
|
|
import re
|
|
|
|
# Matches "portrays", "depicts", "depictions of" or "it shows"
# (case-insensitively) followed by a space, and captures the text that
# follows up to, but not including, the next full stop.
re_from_article = re.compile(
    r'(?:portrays|depicts|depictions of|it shows) (.+?)\.',
    flags=re.IGNORECASE,
)
|
|
|
|
# Numeric Wikidata item IDs (the QID without its leading "Q") that are
# dropped from query results instead of being offered as depicts candidates.
ignore_for_depicts = {
    # Too generic; a more specific item should be used instead.
    43445,      # female organism - use: female (Q6581072)
    44148,      # male organism - use: male (Q6581097)
    21075684,   # children - use: child (Q7569)

    # Items that are filtered out unconditionally.
    180788,     # National Gallery
    780294,     # human physical appearance
    2472587,    # people
    33659,      # People
}
|
|
|
|
# SPARQL template used by Artwork.build_query. Four upper-case placeholder
# tokens are substituted before the query is run:
#
#   the artwork's own item id  - excluded from the results via the first filter
#   COMMONS_CAT                - VALUES rows of Commons category names
#   CAT_URL                    - VALUES rows of category page URLs
#   SITELINK                   - VALUES rows of article URLs
#
# The union branches find candidate items by: a direct Commons category
# (P373) match, the P301 target of a category item matched by Commons
# category name, the P301 target of a category item matched by page URL, and
# a direct sitelink match.
#
# Keep the placeholder tokens out of any other text inside the string: the
# substitution is purely textual.
query = '''
select distinct ?item ?itemLabel ?commonscat ?cat_url ?sitelink
where {
  service wikibase:label { bd:serviceParam wikibase:language "en" }
  filter (?item != wd:QID)

  {
    VALUES (?commonscat) { COMMONS_CAT }
    ?item wdt:P373 ?commonscat .
    filter not exists { ?item wdt:P31 wd:Q4167836 }  # Wikimedia category
    filter not exists { ?item wdt:P31 wd:Q4167410 }  # Wikimedia disambiguation page
    filter not exists { ?item wdt:P31 wd:Q24046192 } # Wikimedia category of stubs
    filter not exists { ?item wdt:P31 wd:Q13406463 } # Wikimedia list article
    filter not exists { ?item wdt:P31 wd:Q4663903 }  # Wikimedia portal
  } union {
    VALUES (?commonscat) { COMMONS_CAT }
    ?cat_item wdt:P373 ?commonscat .
    ?cat_item wdt:P301 ?item .
  } union {
    VALUES (?cat_url) { CAT_URL }
    ?cat_url schema:about ?cat_item .
    ?cat_item wdt:P301 ?item .
  } union {
    VALUES (?sitelink) { SITELINK }
    ?sitelink schema:about ?item .
    filter not exists { ?item wdt:P31 wd:Q4167410 }
  }
}'''
|
|
|
|
class QueryResultRow:
    """One candidate item from the query, merged across its result rows.

    The same item can come back in several result rows (one per matching
    source binding). The first row creates the object; later rows for the
    same item are folded in with update().
    """

    def __init__(self, row):
        """Create a result from the first row seen for an item.

        Bindings whose name starts with ``item`` are stored as-is. Every
        other binding is wrapped in a list so update() can append to it.
        """
        self.row = {
            key: (value if key.startswith('item') else [value])
            for key, value in row.items()
        }
        self.item_id = wdqs.row_id(row)
        self.label = wdqs.get_row_value(row, 'itemLabel')

    def update(self, row):
        """Append the non-``item`` bindings of another row for the same item."""
        for key, value in row.items():
            if key.startswith('item'):
                continue
            self.row.setdefault(key, []).append(value)

    @property
    def url(self):
        """Value of the ``item`` binding."""
        return self.row['item']['value']

    @property
    def qid(self):
        """Item id formatted as a QID string, e.g. ``'Q42'``."""
        return f'Q{self.item_id}'

    def sources(self):
        """Return the raw non-``item`` bindings as ``{name: [binding, ...]}``."""
        return {k: v for k, v in self.row.items() if not k.startswith('item')}

    def sources_list(self):
        """Return ``[(name, [display value, ...]), ...]`` for non-``item`` bindings.

        Non-URI bindings are shown verbatim. URI bindings are reduced to the
        text after the last ``/wiki/``; a URI with no ``/wiki/`` segment is
        returned unchanged rather than being truncated.
        """

        def get_value(binding):
            value = binding['value']
            if binding['type'] != 'uri':
                return value
            # rpartition yields the whole string as the tail when the
            # separator is absent, so such URIs pass through intact.
            return value.rpartition('/wiki/')[2]

        return [
            (name, [get_value(binding) for binding in bindings])
            for name, bindings in self.row.items()
            if not name.startswith('item')
        ]
|
|
|
|
class Artwork:
    """A Wikidata artwork item plus the data used to suggest what it depicts.

    Wraps the entity JSON returned by ``mediawiki.get_entity_with_cache`` and,
    when the item has an English Wikipedia sitelink, that article's content
    and categories. run_query() turns this into a SPARQL query and returns
    the candidate items.
    """

    def __init__(self, qid):
        """Load the entity for *qid* (a string such as ``'Q123'``).

        Also fetches the English Wikipedia article content and categories
        when the item has an ``enwiki`` sitelink, then collects categories
        via get_categories().
        """
        self.entity = mediawiki.get_entity_with_cache(qid)
        self.item_id = int(qid[1:])

        if self.enwiki:
            content, cats = mediawiki.get_content_and_categories(self.enwiki, 'enwiki')
            self.enwiki_content = content
            self.enwiki_categories = mediawiki.process_cats(cats, 'enwiki')
            for cat in self.enwiki_categories:
                cat.set_item(self)
        else:
            self.enwiki_content = None
            self.enwiki_categories = None

        sites = ['commons', 'enwiki']
        # Per site: category name (namespace dropped) -> its categories.
        self.parent_categories = {site: {} for site in sites}

        self.categories = self.get_categories()

    def _claim_values(self, prop):
        """Return the main-snak value of every *prop* claim that has one.

        Claims whose main snak carries no ``datavalue`` (Wikibase "unknown
        value" / "no value" snaks) are skipped instead of raising KeyError.
        """
        return [
            claim['mainsnak']['datavalue']['value']
            for claim in self.entity['claims'].get(prop, [])
            if 'datavalue' in claim['mainsnak']
        ]

    @property
    def image_filename(self):
        """First P18 value, or None when the item has none."""
        filenames = self._claim_values('P18')
        return filenames[0] if filenames else None

    @property
    def display_title(self):
        """``'<English label> (<QID>)'``, or just the QID without a label."""
        if 'en' not in self.entity['labels']:
            return self.qid
        return f'{self.en_title} ({self.qid})'

    @property
    def url(self):
        """Wikidata page URL for this item."""
        return 'https://www.wikidata.org/wiki/' + self.qid

    def get_artist_entities(self):
        """Fetch the entity of every P170 value into ``self.artist_entities``."""
        self.artist_entities = [
            mediawiki.get_entity(artist['id']) for artist in self.artists_claim
        ]

    def artist_labels(self):
        """Return one label per artist entity.

        Uses the English label; an artist without one is represented by its
        id, mirroring how en_title falls back to the QID.
        """
        if not hasattr(self, 'artist_entities'):
            self.get_artist_entities()

        labels = []
        for artist in self.artist_entities:
            en_label = artist['labels'].get('en')
            labels.append(en_label['value'] if en_label else artist['id'])
        return labels

    @property
    def commons_cats(self):
        """List of P373 values."""
        return self._claim_values('P373')

    @property
    def commons_sitelink(self):
        """Title of the Commons sitelink, or None.

        Reads the ``commonswiki`` sitelink's ``title``, the same shape the
        ``enwiki`` property relies on. The previously used ``commons`` key and
        ``value`` field are still honoured as a fallback.
        NOTE(review): confirm which key the entity source actually provides.
        """
        sitelinks = self.sitelinks
        link = sitelinks.get('commonswiki') or sitelinks.get('commons')
        if not link:
            return None
        return link['title'] if 'title' in link else link['value']

    @property
    def en_title(self):
        """English label of the item, or its QID when there is none."""
        if 'en' in self.entity['labels']:
            return self.entity['labels']['en']['value']
        return self.qid

    @property
    def artists_claim(self):
        """List of P170 values (dicts that carry at least an ``id``)."""
        return self._claim_values('P170')

    @property
    def artists(self):
        """Return the P170 values, each with a ``labels`` key added.

        ``labels`` is the full labels mapping of the matching artist entity.
        Copies are returned so the entity's own claim data is not modified.
        """
        if not hasattr(self, 'artist_entities'):
            self.get_artist_entities()

        lookup = {artist['id']: artist['labels'] for artist in self.artist_entities}
        return [
            {**item, 'labels': lookup[item['id']]}
            for item in self.artists_claim
        ]

    @property
    def qid(self):
        """Item id formatted as a QID string, e.g. ``'Q123'``."""
        return f'Q{self.item_id}'

    @property
    def commons_filenames(self):
        """List of P18 values."""
        return self._claim_values('P18')

    def commons_cat_from_sitelink(self):
        """Return the Commons sitelink without its ``Category:`` prefix.

        Returns None when there is no Commons sitelink or it is not in the
        Category namespace.
        """
        ns = 'Category:'
        sitelink = self.commons_sitelink
        if not sitelink or not sitelink.startswith(ns):
            return None
        return sitelink[len(ns):]

    @property
    def enwiki_url(self):
        """English Wikipedia article URL, or None without an enwiki sitelink."""
        enwiki = self.enwiki
        if not enwiki:
            return None
        return 'https://en.wikipedia.org/wiki/' + enwiki.replace(' ', '_')

    @property
    def sitelinks(self):
        """The entity's ``sitelinks`` mapping."""
        return self.entity['sitelinks']

    @property
    def claims(self):
        """The entity's ``claims`` mapping."""
        return self.entity['claims']

    @property
    def enwiki(self):
        """Title of the English Wikipedia sitelink, or None."""
        return self.sitelinks['enwiki']['title'] if 'enwiki' in self.sitelinks else None

    def get_categories(self):
        """Collect categories for this item and fill ``parent_categories``.

        Returns a list of ``(title, categories)`` pairs. An item with no
        Commons file, Commons category or Commons sitelink yields an empty
        list straight away.
        """
        titles = {'File:' + filename for filename in self.commons_filenames}
        titles.update('Category:' + commons_cat for commons_cat in self.commons_cats)
        if self.commons_sitelink:
            titles.add(self.commons_sitelink)
        if not titles:
            # NOTE(review): this also skips the enwiki categories below for
            # items that have an article but no Commons data - confirm that
            # is intended.
            return []

        # NOTE: the Commons lookup is currently switched off, so cat_list
        # starts empty and the loop below does nothing until it is restored:
        # cat_list = mediawiki.get_categories(titles, 'commons')
        cat_list = []
        commons_parents = self.parent_categories['commons']

        for title, cats in cat_list:
            for cat in cats:
                cat.set_item(self)
            if not title.startswith('Category:'):
                continue
            commons_parents[utils.drop_category_ns(title)] = cats

        # Categories seen as parents that have not been looked up themselves.
        # The membership test is against the Commons map; the top-level
        # parent_categories dict is keyed by site name, not category title.
        get_more_cats = [
            'Category:' + cat.title
            for cats in commons_parents.values()
            for cat in cats
            if cat.title not in commons_parents
        ]

        if get_more_cats:  # nothing to ask for otherwise
            for title, cats in mediawiki.get_categories(get_more_cats, 'commons'):
                for cat in cats:
                    cat.set_item(self)
                commons_parents[utils.drop_category_ns(title)] = cats

        if self.enwiki:
            cat_list.append((self.enwiki, self.enwiki_categories))

            get_more_cats = ['Category:' + cat.title for cat in self.enwiki_categories]
            if get_more_cats:
                enwiki_parents = self.parent_categories['enwiki']
                for title, cats in mediawiki.get_categories(get_more_cats, 'enwiki'):
                    enwiki_parents[utils.drop_category_ns(title)] = cats

        return cat_list

    def depicts_from_enwiki_content(self):
        """Return the first phrase in the article matched by re_from_article.

        Paragraphs (split on blank lines) are searched in order. Returns None
        when there is no article, no content, or no match.
        """
        if not self.enwiki_url or not self.enwiki_content:
            return None
        for par in self.enwiki_content.split('\n\n'):
            m = re_from_article.search(par)
            if m:
                return m.group(1)
        return None

    def query_variables(self):
        """Gather the values substituted into the SPARQL template.

        Returns a dict with ``commons_cat``, ``cat_url`` and ``keywords``
        lists. Keywords come from every category's names plus the nouns
        found in the phrase from depicts_from_enwiki_content().
        """
        commons_cat = []
        cat_url = []
        keywords = []
        for _, categories in self.categories:
            for cat in categories:
                names = cat.names_for_wikidata()
                keywords += names
                if cat.site == 'commons':
                    commons_cat += names
                    # NOTE(review): nesting reconstructed from a copy of the
                    # file that had lost its indentation - confirm URLs are
                    # meant to be collected for Commons categories only.
                    cat_url += cat.urls_for_wikidata()

        text = self.depicts_from_enwiki_content()
        if text:
            for sentence in nltk.sent_tokenize(text):
                for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
                    if not utils.word_contains_letter(word):
                        continue
                    if not pos.startswith('NN'):  # keep noun tags only
                        continue
                    # A token may hold several '|'-separated parts.
                    for k in word.strip('|').split('|'):
                        if utils.word_contains_letter(k):
                            keywords += utils.also_singular(k)

        keywords = [k for k in keywords if utils.word_contains_letter(k)]

        return {
            'commons_cat': commons_cat,
            'cat_url': cat_url,
            'keywords': keywords,
        }

    def build_query(self):
        """Return the SPARQL query text for this artwork.

        All placeholders in the module-level template are filled in a single
        pass, so text inserted for one placeholder is never rescanned for
        another placeholder's token.
        """
        query_vars = self.query_variables()
        sitelinks = [utils.wiki_url(title, 'enwiki') for title in query_vars['keywords']]
        sitelinks = [url for url in sitelinks if url]

        substitutions = {
            'COMMONS_CAT': wdqs.quote_list(query_vars['commons_cat']),
            'CAT_URL': wdqs.url_list(query_vars['cat_url']),
            'QID': self.qid,
            'SITELINK': wdqs.url_list(sitelinks),
        }
        placeholder = re.compile('|'.join(re.escape(token) for token in substitutions))
        # A callable replacement also keeps backslashes in the values literal.
        return placeholder.sub(lambda m: substitutions[m.group(0)], query)

    def run_query(self):
        """Run the query and return QueryResultRow objects sorted by item id.

        Rows for the same item are merged into one result; items listed in
        ignore_for_depicts are dropped.
        """
        rows = wdqs.run_query_with_cache(self.build_query())

        by_id = {}
        for row in rows:
            item_id = wdqs.row_id(row)
            if item_id in ignore_for_depicts:
                continue
            hit = by_id.get(item_id)
            if hit is None:
                by_id[item_id] = QueryResultRow(row)
            else:
                hit.update(row)

        return sorted(by_id.values(), key=lambda hit: hit.item_id)
|