Refactor
This commit is contained in:
parent
9299909e2f
commit
dc9989157c
|
@ -7,7 +7,6 @@ class Artwork:
|
||||||
|
|
||||||
sites = ['commons', 'enwiki']
|
sites = ['commons', 'enwiki']
|
||||||
self.parent_categories = {site: {} for site in sites}
|
self.parent_categories = {site: {} for site in sites}
|
||||||
self.categories = self.get_categories()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def image_filename(self):
|
def image_filename(self):
|
||||||
|
|
110
depicts/category.py
Normal file
110
depicts/category.py
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
from . import utils
|
||||||
|
import re
|
||||||
|
import calendar
|
||||||
|
|
||||||
|
# Regex alternation of every non-empty entry of calendar.month_name
# (index 0 of that sequence is an empty string and is dropped).
month_pattern = '|'.join(filter(None, calendar.month_name))

# Matches category titles that begin with a date-like prefix followed by a
# space: an ISO date (YYYY-MM-DD), "<month> YYYY", a year or decade
# ("1850" / "1850s"), or an ordinal century such as "17th-century".
re_date_based = re.compile(
    r'^(\d{4}-\d{2}-\d{2}|('
    + month_pattern
    + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) '
)

# Namespace prefix that Category.__init__ removes from titles.
ns_cat = 'Category:'
|
||||||
|
|
||||||
|
class Category:
    '''
    A wiki category attached to a site, optionally linked to an item.

    Attributes:
        title: category title without the leading ``Category:`` prefix.
        site: site key the category belongs to (passed to utils.wiki_url).
        item: owning item set via set_item(), or None. The methods below
            call ``item.artist_labels()`` and read
            ``item.parent_categories`` on it.
    '''

    def __init__(self, title, site):
        '''Store title (with any ``Category:`` prefix removed) and site.'''
        if title.startswith(ns_cat):
            title = title[len(ns_cat):]
        self.title = title
        self.site = site
        self.item = None

    def __repr__(self):
        return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'

    def set_item(self, item):
        '''Attach the item used for artist-label and parent lookups.'''
        self.item = item

    @property
    def url(self):
        '''URL of this category page, built by utils.wiki_url.'''
        return utils.wiki_url(self.title, self.site, ns='Category')

    def date_based(self):
        '''Return True if the title starts with a date-like prefix.'''
        return bool(re_date_based.match(self.title))

    def contains_artist_name(self):
        '''
        Return True if any artist label of the item occurs in the title.

        The comparison is case-insensitive. Always returns a bool; False
        when no item is attached.
        '''
        if not self.item:
            return False
        lc_title = self.title.lower()
        return any(artist.lower() in lc_title
                   for artist in self.item.artist_labels())

    def parents(self):
        '''
        Return the parent categories recorded on the item for this title.

        Looks up ``item.parent_categories[site][title]``; returns an empty
        list when no item is attached or the title has no entry.
        '''
        if not self.item:
            return []
        return self.item.parent_categories[self.site].get(self.title, [])

    def is_exhibition(self):
        '''Return True if a parent's title starts with 'Art exhibitions '.'''
        return any(parent.title.startswith('Art exhibitions ')
                   for parent in self.parents())

    def names_for_wikidata(self):
        '''
        Return candidate names derived from the category title.

        When check() highlights nothing, the whole title is used unless the
        category is date based, contains an artist name or is an
        exhibition (then the result is empty). Otherwise the first
        non-empty highlighted part is used; if it contains ' with ',
        ' at ' or ' wearing ', the text on each side of the first such
        separator found is added as well. Every candidate is expanded
        through utils.also_singular.
        '''
        highlight = self.check()
        interesting = len(highlight) > 1

        if not interesting:
            if (self.date_based() or self.contains_artist_name()
                    or self.is_exhibition()):
                return []

            return utils.also_singular(self.title)

        for significant, text in highlight:
            if not significant:
                continue
            title = text.strip()
            if not title:
                # e.g. the title "Paintings of": the highlighted part is
                # empty, so there is nothing to name (and title[0] would
                # raise IndexError).
                continue
            title = title[0].upper() + title[1:]
            for sep in ' with ', ' at ', ' wearing ':
                if sep in title:
                    before, _, after = title.partition(sep)
                    names = []
                    for x in title, before, after:
                        names += utils.also_singular(x)
                    return names
            return utils.also_singular(title)

        return []

    def urls_for_wikidata(self):
        '''Return a category URL for each name from names_for_wikidata().'''
        return [utils.wiki_url(name, self.site, ns='Category')
                for name in self.names_for_wikidata()]

    def check(self):
        '''
        Split the title into (significant, text) segments.

        Returns a list of tuples whose texts concatenate back to the
        title; ``significant`` is True for the part considered to be the
        subject. A single ``(False, title)`` tuple means nothing was
        highlighted.
        '''
        cat = self.title
        lc_cat = cat.lower()
        by_endings = ['title', 'technique', 'period', 'century', 'country',
                      'movement', 'medium', 'year', 'painter']

        if self.item:
            # Endings are compared against the lower-cased title, so the
            # artist labels must be lower-cased too or they never match.
            by_endings.extend(label.lower()
                              for label in self.item.artist_labels())

        for after in ('in art', 'in portrait paintings',
                      'in landscape paintings', 'in culture',
                      'in popular culture', 'in painting', 'in 1', 'in 2',
                      'looking at viewer'):
            pos = lc_cat.find(after)
            # don't highlight "1512 in art"; slicing up to pos (not
            # pos - 1) avoids wrapping to cat[:-1] when pos is 0
            if pos == -1 or cat[:pos].rstrip().isdigit():
                continue
            return [(True, cat[:pos]), (False, cat[pos:])]

        for before in ('paintings of', 'portraits of', 'landscapes of',
                       'portraits with', 'paintings with',
                       'paintings depicting', 'portraits depicting',
                       'landscapes depicting', 'works about'):
            pos = lc_cat.find(before)
            if pos == -1:
                continue
            pos += len(before)
            for by_ending in by_endings:
                ending = ' by ' + by_ending
                if lc_cat.endswith(ending):
                    return [(False, cat[:pos]),
                            (True, cat[pos:-len(ending)]),
                            (False, cat[-len(ending):])]

            return [(False, cat[:pos]), (True, cat[pos:])]

        pos = lc_cat.find('of ')
        if pos != -1:
            return [(True, cat[:pos]), (False, cat[pos:])]

        return [(False, cat)]
|
|
@ -2,6 +2,7 @@ import requests
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import hashlib
|
import hashlib
|
||||||
|
from .category import Category
|
||||||
from . import utils
|
from . import utils
|
||||||
|
|
||||||
wikidata_url = 'https://www.wikidata.org/w/api.php'
|
wikidata_url = 'https://www.wikidata.org/w/api.php'
|
||||||
|
@ -148,6 +149,26 @@ def get_content_and_categories(title, site):
|
||||||
def host_from_site(site):
    '''
    Return the entry stored under `site` in the module-level `hosts`.

    An unknown site raises whatever `hosts[site]` raises (KeyError if
    `hosts` is a dict).
    '''
    host = hosts[site]
    return host
|
||||||
|
|
||||||
|
def process_cats(cats, site):
    '''
    Build a Category for each raw category record.

    `cats` is an iterable of mappings with a 'title' key; every title is
    wrapped in a Category bound to `site`, in the same order.
    '''
    titles = (entry['title'] for entry in cats)
    return [Category(title, site) for title in titles]
|
||||||
|
|
||||||
|
def get_categories(titles, site):
    '''
    Fetch the categories of the given pages.

    Calls mediawiki_query with prop=categories, clshow=!hidden and
    cllimit=max, then returns a list of (page title, [Category, ...])
    tuples. Pages whose result has no 'categories' key, or an empty
    category list, are left out.
    '''
    query_params = {
        'prop': 'categories',
        'clshow': '!hidden',
        'cllimit': 'max',
    }
    results = []
    for page in mediawiki_query(titles, query_params, site):
        if 'categories' in page:
            page_cats = process_cats(page['categories'], site)
            if page_cats:
                results.append((page['title'], page_cats))
    return results
|
||||||
|
|
||||||
def get_history(title, site):
|
def get_history(title, site):
|
||||||
params = {
|
params = {
|
||||||
'prop': 'revisions',
|
'prop': 'revisions',
|
||||||
|
|
|
@ -2,6 +2,7 @@ from flask import request
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
import inflect
|
||||||
|
|
||||||
hosts = {
|
hosts = {
|
||||||
'commons': 'commons.wikimedia.org',
|
'commons': 'commons.wikimedia.org',
|
||||||
|
@ -9,6 +10,8 @@ hosts = {
|
||||||
'wikidata': 'www.wikidata.org',
|
'wikidata': 'www.wikidata.org',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
engine = inflect.engine()
|
||||||
|
|
||||||
skip_names = {
|
skip_names = {
|
||||||
'National Gallery'
|
'National Gallery'
|
||||||
}
|
}
|
||||||
|
@ -33,6 +36,35 @@ def parse_sitelink(s, start):
|
||||||
def word_contains_letter(word):
    '''Return True if at least one character of `word` is alphabetic.'''
    for char in word:
        if char.isalpha():
            return True
    return False
|
||||||
|
|
||||||
|
def also_singular(name):
    '''
    Return name variants plus generic person words implied by the name.

    Starts from also_singular_main(name). For each variant, adds 'girl' or
    'boy' if that word appears in it, 'woman' if it contains 'female',
    'females' or 'women', and 'man' if it contains 'male', 'males' or
    'men' (whole words, case-insensitive). Entries listed in skip_names
    are dropped. Each result appears once, in first-seen order; the extra
    words were previously repeated once per matching variant.
    '''
    names = also_singular_main(name)
    extra = []
    for n in names:
        words = set(n.lower().split())
        extra.extend(word for word in ('girl', 'boy') if word in words)
        if {'female', 'females', 'women'} & words:
            extra.append('woman')
        if {'male', 'males', 'men'} & words:
            extra.append('man')

    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique = dict.fromkeys(names + extra)
    return [n for n in unique if n not in skip_names]
|
||||||
|
|
||||||
|
def also_singular_main(name):
    '''
    Given a plural name, return a list of the name and its singular form.

    Returns just [name] when engine.singular_noun gives a falsy result, or
    when the singular form is not a useful variant: it equals the name
    ignoring case, it differs only by 'paintings' vs 'painting', or it is
    the bogus 'Venus' -> 'Venu'.
    '''
    # '|' characters at either end are removed before singularising only;
    # the returned name keeps them.
    singular = engine.singular_noun(name.strip('|'))
    if not singular:
        return [name]

    lower_name = name.lower()
    lower_singular = singular.lower()

    unchanged = lower_name == lower_singular
    only_paintings_differs = (lower_name.replace('paintings', '')
                              == lower_singular.replace('painting', ''))
    venus_mangled = lower_name == 'venus' and lower_singular == 'venu'

    if unchanged or only_paintings_differs or venus_mangled:
        return [name]
    return [name, singular]
|
||||||
|
|
||||||
def wiki_url(title, site, ns=None):
|
def wiki_url(title, site, ns=None):
|
||||||
host = hosts[site]
|
host = hosts[site]
|
||||||
url_ns = ns + ':' if ns else ''
|
url_ns = ns + ':' if ns else ''
|
||||||
|
|
Loading…
Reference in a new issue