Remove some unused code.
This commit is contained in:
parent
93df781115
commit
97ca0aaa34
|
@ -1,101 +1,12 @@
|
||||||
from . import utils, wdqs, mediawiki
|
from . import mediawiki
|
||||||
# import nltk
|
|
||||||
import re
|
|
||||||
|
|
||||||
# Captures the phrase that follows "portrays", "depicts", "depictions of" or
# "it shows", up to the next full stop (case-insensitive).
re_from_article = re.compile(r'(?:portrays|depicts|depictions of|it shows) (.+?)\.', re.I)

# Numeric Wikidata item IDs that are dropped from query results.
ignore_for_depicts = {
    43445,     # female organism - use: female (Q6581072)
    44148,     # male organism - use: male (Q6581097)
    21075684,  # children - use: child (Q7569)
    180788,    # National Gallery
    780294,    # human physical appearance
    2472587,   # people
    33659,     # People
}

# SPARQL template. COMMONS_CAT, CAT_URL, SITELINK and QID are placeholders
# that are substituted with concrete values before the query is run.
query = '''
select distinct ?item ?itemLabel ?commonscat ?cat_url ?sitelink
where {
  service wikibase:label { bd:serviceParam wikibase:language "en" }
  filter (?item != wd:QID)

  {
    VALUES (?commonscat) { COMMONS_CAT }
    ?item wdt:P373 ?commonscat .
    filter not exists { ?item wdt:P31 wd:Q4167836 }  # Wikimedia category
    filter not exists { ?item wdt:P31 wd:Q4167410 }  # Wikimedia disambiguation page
    filter not exists { ?item wdt:P31 wd:Q24046192 } # Wikimedia category of stubs
    filter not exists { ?item wdt:P31 wd:Q13406463 } # Wikimedia list article
    filter not exists { ?item wdt:P31 wd:Q4663903 }  # Wikimedia portal
  } union {
    VALUES (?commonscat) { COMMONS_CAT }
    ?cat_item wdt:P373 ?commonscat .
    ?cat_item wdt:P301 ?item .
  } union {
    VALUES (?cat_url) { CAT_URL }
    ?cat_url schema:about ?cat_item .
    ?cat_item wdt:P301 ?item .
  } union {
    VALUES (?sitelink) { SITELINK }
    ?sitelink schema:about ?item .
    filter not exists { ?item wdt:P31 wd:Q4167410 }
  }
}'''
|
|
||||||
|
|
||||||
class QueryResultRow:
    """One candidate item from the query results, merged across result rows.

    Bindings whose key starts with ``item`` are stored as returned. Every
    other binding is stored as a list so that further rows for the same item
    can be appended with :meth:`update`.
    """

    def __init__(self, row):
        """Build the hit from the first result *row* (a dict of bindings)."""
        self.row = {k: (v if k.startswith('item') else [v]) for k, v in row.items()}
        self.item_id = wdqs.row_id(row)
        self.label = wdqs.get_row_value(row, 'itemLabel')

    def update(self, row):
        """Append the non-``item`` bindings of another *row* for this item."""
        for key, value in row.items():
            if key.startswith('item'):
                continue
            self.row.setdefault(key, []).append(value)

    @property
    def url(self):
        """Value of the ``item`` binding."""
        return self.row['item']['value']

    @property
    def qid(self):
        """Item ID formatted as ``Q<item_id>``."""
        return f'Q{self.item_id}'

    def sources(self):
        """Return the bindings that are not ``item*`` keys, as key -> list."""
        return {k: v for k, v in self.row.items() if not k.startswith('item')}

    def sources_list(self):
        """Return ``(key, [display value, ...])`` pairs for non-``item`` bindings.

        URI bindings are reduced to the text after the last ``/wiki/``; other
        bindings are returned unchanged.
        """

        def get_value(i):
            if i['type'] != 'uri':
                return i['value']
            # rpartition leaves the whole string in the last slot when the
            # marker is absent, so a URI without '/wiki/' is returned intact
            # rather than being cut at an arbitrary offset.
            return i['value'].rpartition('/wiki/')[2]

        return [(k, [get_value(i) for i in v])
                for k, v in self.row.items()
                if not k.startswith('item')]
|
|
||||||
|
|
||||||
class Artwork:
|
class Artwork:
|
||||||
def __init__(self, qid):
    # Load the entity for *qid* and derive the numeric item ID by dropping
    # the first character of the ID string.
    self.entity = mediawiki.get_entity_with_cache(qid)
    self.item_id = int(qid[1:])

    # When the item has an English Wikipedia title, fetch the article text
    # and its categories, and point each category back at this artwork.
    if self.enwiki:
        content, cats = mediawiki.get_content_and_categories(self.enwiki, 'enwiki')
        self.enwiki_content = content
        self.enwiki_categories = mediawiki.process_cats(cats, 'enwiki')
        for cat in self.enwiki_categories:
            cat.set_item(self)
    else:
        self.enwiki_content = None
        self.enwiki_categories = None

    # parent_categories maps site -> {category title: parent categories}.
    # It must exist before get_categories() runs, because that method fills it.
    sites = ['commons', 'enwiki']
    self.parent_categories = {site: {} for site in sites}

    self.categories = self.get_categories()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -194,115 +105,3 @@ class Artwork:
|
||||||
@property
def enwiki(self):
    """Title of the item's ``enwiki`` sitelink, or None when it has none."""
    if 'enwiki' not in self.sitelinks:
        return None
    return self.sitelinks['enwiki']['title']
|
||||||
|
|
||||||
def get_categories(self):
    """Collect this artwork's categories and record their parent categories.

    Returns a list of ``(title, categories)`` tuples. As a side effect,
    ``self.parent_categories`` is filled per site with
    ``{category title: parent categories}``.

    Returns an empty list when the item has no Commons file names, Commons
    categories or Commons sitelink.
    """
    titles = {'File:' + filename for filename in self.commons_filenames}
    titles.update('Category:' + commons_cat for commons_cat in self.commons_cats)
    if self.commons_sitelink:
        titles.add(self.commons_sitelink)
    if not titles:
        return []

    # NOTE: the Commons lookup is disabled, so the list starts empty:
    # cat_list = mediawiki.get_categories(titles, 'commons')
    cat_list = []
    commons_parents = self.parent_categories['commons']

    for title, cats in cat_list:
        for cat in cats:
            cat.set_item(self)
        if not title.startswith('Category:'):
            continue
        commons_parents[utils.drop_category_ns(title)] = cats

    # Only ask for categories whose parents are not known yet. The check is
    # made against the per-site mapping; the outer dict is keyed by site
    # name, so testing it would match (almost) nothing.
    get_more_cats = ['Category:' + cat.title
                     for cats in commons_parents.values()
                     for cat in cats
                     if cat.title not in commons_parents]

    # Skip the request entirely when there is nothing to look up.
    if get_more_cats:
        for title, cats in mediawiki.get_categories(get_more_cats, 'commons'):
            for cat in cats:
                cat.set_item(self)
            commons_parents[utils.drop_category_ns(title)] = cats

    if self.enwiki:
        cat_list.append((self.enwiki, self.enwiki_categories))

        get_more_cats = ['Category:' + cat.title for cat in self.enwiki_categories]
        for title, cats in mediawiki.get_categories(get_more_cats, 'enwiki'):
            self.parent_categories['enwiki'][utils.drop_category_ns(title)] = cats

    return cat_list
|
|
||||||
|
|
||||||
def depicts_from_enwiki_content(self):
    """Return the first "depicts ..." phrase found in the article text.

    The text is scanned paragraph by paragraph (split on blank lines) with
    ``re_from_article``. Returns None when ``self.enwiki_url`` is falsy or
    no paragraph matches.
    """
    if not self.enwiki_url:
        return None
    paragraphs = self.enwiki_content.split('\n\n')
    matches = (re_from_article.search(par) for par in paragraphs)
    return next((m.group(1) for m in matches if m), None)
|
|
||||||
|
|
||||||
def query_variables(self):
    # Gather the values used to fill the query template: Commons category
    # names, category URLs and free-text keywords.
    commons_cat = []
    cat_url = []
    keywords = []
    for _, categories in self.categories:
        for cat in categories:
            names = cat.names_for_wikidata()
            keywords += names
            if cat.site == 'commons':
                commons_cat += names
            # NOTE(review): the original indentation is lost in this view, so
            # it is unclear whether this line belongs inside the
            # `if cat.site == 'commons'` branch - confirm against the repository.
            cat_url += cat.urls_for_wikidata()

    text = self.depicts_from_enwiki_content()
    if text:
        # NOTE(review): `nltk` is used here, but the only import visible in
        # the file is commented out (`# import nltk`), so this branch
        # presumably raises NameError whenever a "depicts" phrase is found.
        sentences = nltk.sent_tokenize(text)

        for sentence in sentences:
            for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
                if not utils.word_contains_letter(word):
                    continue
                # keep only tokens whose part-of-speech tag starts with 'NN'
                if not pos.startswith('NN'):
                    continue
                word = word.strip('|')
                # a token may hold several '|'-separated parts
                for k in word.strip('|').split('|'):
                    if utils.word_contains_letter(k):
                        keywords += utils.also_singular(k)

    # drop keywords that contain no alphabetic character
    keywords = [k for k in keywords if utils.word_contains_letter(k)]

    return {
        'commons_cat': commons_cat,
        'cat_url': cat_url,
        'keywords': keywords,
    }
|
|
||||||
|
|
||||||
def build_query(self):
    """Fill the module-level ``query`` template and return the query text.

    The COMMONS_CAT, CAT_URL, QID and SITELINK placeholders are replaced
    with values derived from :meth:`query_variables` and ``self.qid``.
    """
    query_vars = self.query_variables()
    candidate_urls = (utils.wiki_url(title, 'enwiki') for title in query_vars['keywords'])
    sitelinks = [url for url in candidate_urls if url]

    substitutions = {
        'COMMONS_CAT': wdqs.quote_list(query_vars['commons_cat']),
        'CAT_URL': wdqs.url_list(query_vars['cat_url']),
        'QID': self.qid,
        'SITELINK': wdqs.url_list(sitelinks),
    }

    # Substitute every placeholder in one pass over the template. Chained
    # str.replace calls would rescan text that was just inserted, so a
    # category name containing e.g. "QID" would be rewritten as well.
    placeholder = re.compile('|'.join(map(re.escape, substitutions)))
    return placeholder.sub(lambda m: substitutions[m.group(0)], query)
|
|
||||||
|
|
||||||
def run_query(self):
    """Run the built query and return one hit per item, ordered by item ID.

    Rows whose item ID is in ``ignore_for_depicts`` are dropped; repeated
    rows for the same item are merged into that item's existing hit.
    """
    rows = wdqs.run_query_with_cache(self.build_query())

    hits = {}
    for row in rows:
        item_id = wdqs.row_id(row)
        if item_id in ignore_for_depicts:
            continue
        if item_id not in hits:
            hits[item_id] = QueryResultRow(row)
        else:
            hits[item_id].update(row)

    return sorted(hits.values(), key=lambda hit: hit.item_id)
|
|
||||||
|
|
|
@ -1,110 +0,0 @@
|
||||||
from . import utils
|
|
||||||
import re
|
|
||||||
import calendar
|
|
||||||
|
|
||||||
# Alternation of the non-empty entries of calendar.month_name
# (index 0 of that sequence is an empty string).
month_pattern = '|'.join(filter(None, calendar.month_name))

# Matches titles that begin with a date-like prefix followed by a space:
# an ISO date, "<month> <year>", a year or decade, or "<n>th-century".
re_date_based = re.compile(
    r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) '
)

# Namespace prefix carried by category page titles.
ns_cat = 'Category:'
|
|
||||||
|
|
||||||
class Category:
    """A wiki category, optionally linked to the item it was found on."""

    def __init__(self, title, site):
        """Store *title* without its ``Category:`` prefix, and the *site* code."""
        if title.startswith(ns_cat):
            title = title[len(ns_cat):]
        self.title = title
        self.site = site
        self.item = None

    def __repr__(self):
        return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'

    def set_item(self, item):
        """Attach the item this category belongs to."""
        self.item = item

    @property
    def url(self):
        """URL of the category page on ``self.site``."""
        return utils.wiki_url(self.title, self.site, ns='Category')

    def date_based(self):
        """Return True when the title starts with a date-like prefix."""
        return bool(re_date_based.match(self.title))

    def contains_artist_name(self):
        """Return True when any artist label of the item occurs in the title.

        The comparison is case-insensitive. Returns False when no item is
        attached.
        """
        if not self.item:
            return False
        lc_title = self.title.lower()
        return any(artist.lower() in lc_title
                   for artist in self.item.artist_labels())

    def parents(self):
        """Return the parent categories recorded on the item for this title."""
        if not self.item:
            return []
        return self.item.parent_categories[self.site].get(self.title, [])

    def is_exhibition(self):
        """Return True when a parent category title starts with 'Art exhibitions '."""
        return any(parent.title.startswith('Art exhibitions ')
                   for parent in self.parents())

    def names_for_wikidata(self):
        """Return candidate names derived from the title; always a list.

        When :meth:`check` highlights nothing, the whole title is used unless
        it is date-based, names the artist or is an exhibition category.
        Otherwise the first non-empty highlighted fragment is used, and is
        additionally split around ' with ', ' at ' or ' wearing '.
        """
        highlight = self.check()
        interesting = len(highlight) > 1

        if not interesting:
            if self.date_based() or self.contains_artist_name() or self.is_exhibition():
                return []
            return utils.also_singular(self.title)

        for significant, text in highlight:
            if not significant:
                continue
            title = text.strip()
            if not title:
                # The highlighted part can be empty (e.g. a title that starts
                # with "of "); indexing title[0] would raise IndexError.
                continue
            title = title[0].upper() + title[1:]
            for sep in ' with ', ' at ', ' wearing ':
                if sep in title:
                    before, _, after = title.partition(sep)
                    names = []
                    for x in title, before, after:
                        names += utils.also_singular(x)
                    return names
            return utils.also_singular(title)

        # No usable fragment: return an empty list rather than None so that
        # callers can always iterate the result.
        return []

    def urls_for_wikidata(self):
        """Return category URLs on ``self.site`` for each candidate name."""
        return [utils.wiki_url(name, self.site, ns='Category')
                for name in self.names_for_wikidata()]

    def check(self):
        """Split the title into ``(significant, text)`` fragments.

        Returns a list of tuples whose texts concatenate back to the title.
        A single ``(False, title)`` entry means nothing was highlighted.
        """
        cat = self.title
        lc_cat = cat.lower()
        by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
                      'medium', 'year', 'painter']

        if self.item:
            # Endings are compared against the lower-cased title, so artist
            # labels must be lower-cased as well or they can never match.
            by_endings += [label.lower() for label in self.item.artist_labels()]

        for after in ('in art', 'in portrait paintings', 'in landscape paintings',
                      'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2',
                      'looking at viewer'):
            pos = lc_cat.find(after)
            if pos == -1:
                continue
            # don't highlight "1512 in art"
            if pos > 0 and cat[:pos - 1].isdigit():
                continue
            return [(True, cat[:pos]), (False, cat[pos:])]

        for before in ('paintings of', 'portraits of', 'landscapes of',
                       'portraits with', 'paintings with', 'paintings depicting',
                       'portraits depicting', 'landscapes depicting', 'works about'):
            pos = lc_cat.find(before)
            if pos == -1:
                continue
            pos += len(before)
            for by_ending in by_endings:
                ending = ' by ' + by_ending
                if lc_cat.endswith(ending):
                    return [(False, cat[:pos]),
                            (True, cat[pos:-len(ending)]),
                            (False, cat[-len(ending):])]

            return [(False, cat[:pos]), (True, cat[pos:])]

        pos = lc_cat.find('of ')
        if pos != -1:
            return [(True, cat[:pos]), (False, cat[pos:])]

        return [(False, cat)]
|
|
|
@ -2,7 +2,6 @@ import requests
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import hashlib
|
import hashlib
|
||||||
from .category import Category
|
|
||||||
from . import utils
|
from . import utils
|
||||||
|
|
||||||
wikidata_url = 'https://www.wikidata.org/w/api.php'
|
wikidata_url = 'https://www.wikidata.org/w/api.php'
|
||||||
|
@ -145,26 +144,6 @@ def get_content_and_categories(title, site):
|
||||||
def host_from_site(site):
    """Look up *site* in ``hosts``; an unknown site raises KeyError."""
    hostname = hosts[site]
    return hostname
|
||||||
|
|
||||||
def process_cats(cats, site):
    """Wrap each category record's ``title`` in a ``Category`` for *site*."""
    wrapped = []
    for record in cats:
        wrapped.append(Category(record['title'], site))
    return wrapped
|
|
||||||
|
|
||||||
def get_categories(titles, site):
    """Return ``(page title, categories)`` pairs for *titles* on *site*.

    Requests the ``categories`` property with ``clshow='!hidden'`` and
    ``cllimit='max'``. Pages without a ``categories`` entry, or with an empty
    one, are left out of the result.
    """
    query_params = dict(prop='categories', clshow='!hidden', cllimit='max')
    pages = mediawiki_query(titles, query_params, site)

    found = []
    for page in pages:
        cats = process_cats(page['categories'], site) if 'categories' in page else None
        if cats:
            found.append((page['title'], cats))
    return found
|
|
||||||
|
|
||||||
def get_history(title, site):
|
def get_history(title, site):
|
||||||
params = {
|
params = {
|
||||||
'prop': 'revisions',
|
'prop': 'revisions',
|
||||||
|
|
|
@ -2,7 +2,6 @@ from flask import request
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import inflect
|
|
||||||
|
|
||||||
hosts = {
|
hosts = {
|
||||||
'commons': 'commons.wikimedia.org',
|
'commons': 'commons.wikimedia.org',
|
||||||
|
@ -10,8 +9,6 @@ hosts = {
|
||||||
'wikidata': 'www.wikidata.org',
|
'wikidata': 'www.wikidata.org',
|
||||||
}
|
}
|
||||||
|
|
||||||
engine = inflect.engine()
|
|
||||||
|
|
||||||
skip_names = {
|
skip_names = {
|
||||||
'National Gallery'
|
'National Gallery'
|
||||||
}
|
}
|
||||||
|
@ -30,38 +27,12 @@ def drop_start(s, start):
|
||||||
def drop_category_ns(s):
    """Pass *s* to ``drop_start`` with the ``Category:`` prefix."""
    prefix = 'Category:'
    return drop_start(s, prefix)
|
||||||
|
|
||||||
|
def parse_sitelink(s, start):
    """Turn a sitelink URL into a page title.

    *start* is handed to ``drop_start`` together with *s*; the result is
    percent-decoded and its underscores become spaces.
    """
    encoded_title = drop_start(s, start)
    decoded_title = urllib.parse.unquote(encoded_title)
    return decoded_title.replace('_', ' ')
|
||||||
|
|
||||||
def word_contains_letter(word):
    """Return True when at least one character of *word* is alphabetic."""
    for char in word:
        if char.isalpha():
            return True
    return False
|
||||||
|
|
||||||
def also_singular(name):
    """Return *name*, its singular form where one exists, and extra keywords.

    Starts from ``also_singular_main(name)``, then adds 'girl' / 'boy' when
    those words occur, 'woman' for female/females/women and 'man' for
    male/males/men. Entries listed in ``skip_names`` are removed.
    """
    female_words = {'female', 'females', 'women'}
    male_words = {'male', 'males', 'men'}

    names = also_singular_main(name)
    extra = []
    for candidate in names:
        words = set(candidate.lower().split())
        extra.extend(word for word in ('girl', 'boy') if word in words)
        if not female_words.isdisjoint(words):
            extra.append('woman')
        if not male_words.isdisjoint(words):
            extra.append('man')

    return [n for n in names + extra if n not in skip_names]
|
|
||||||
|
|
||||||
def also_singular_main(name):
    '''
    Given a plural name, return a list holding the name and its singular form.

    Only the name is returned when ``engine.singular_noun`` yields nothing,
    when the singular equals the name ignoring case, when the two differ only
    by "paintings" / "painting", or for the "venus" -> "venu" mis-singularisation.
    '''
    singular = engine.singular_noun(name.strip('|'))
    if singular:
        n, s = name.lower(), singular.lower()
        unchanged = n == s
        only_paintings_differs = n.replace('paintings', '') == s.replace('painting', '')
        venus_mistake = n == 'venus' and s == 'venu'
        if not (unchanged or only_paintings_differs or venus_mistake):
            return [name, singular]
    return [name]
|
|
||||||
|
|
||||||
def wiki_url(title, site, ns=None):
|
def wiki_url(title, site, ns=None):
|
||||||
host = hosts[site]
|
host = hosts[site]
|
||||||
url_ns = ns + ':' if ns else ''
|
url_ns = ns + ':' if ns else ''
|
||||||
|
|
Loading…
Reference in a new issue