# Alternation of the English month names; calendar.month_name[0] is '' and is skipped.
month_pattern = '|'.join(m for m in calendar.month_name if m)

# Matches titles that open with a date-like token followed by a space:
# an ISO date, "<Month> <year>", a year or decade ("1890", "1890s"),
# or an ordinal century ("17th-century").
re_date_based = re.compile(r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) ')

ns_cat = 'Category:'


class Category:
    '''
    A category title on one wiki site.

    Holds the title without its "Category:" prefix, the site key and,
    once set_item() has been called, the item the category was found on.
    The item is expected to provide artist_labels() and a
    parent_categories mapping of site -> {category title: [Category]}.
    '''

    def __init__(self, title, site):
        # Store the bare title so comparisons and URL building are uniform.
        if title.startswith(ns_cat):
            title = title[len(ns_cat):]
        self.title = title
        self.site = site
        self.item = None

    def __repr__(self):
        return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'

    def set_item(self, item):
        '''Attach the item that supplies artist labels and parent categories.'''
        self.item = item

    @property
    def url(self):
        '''URL of this category page on its site.'''
        return utils.wiki_url(self.title, self.site, ns='Category')

    def date_based(self):
        '''Return True if the title starts with a date, year, decade or century.'''
        return bool(re_date_based.match(self.title))

    def contains_artist_name(self):
        '''
        Return True if any artist label of the item occurs in the title
        (case-insensitive). Always False when no item has been set.
        '''
        if not self.item:
            return False
        lc_title = self.title.lower()
        return any(artist.lower() in lc_title
                   for artist in self.item.artist_labels())

    def parents(self):
        '''Return the parent categories recorded on the item for this title, or [].'''
        if not self.item:
            return []
        return self.item.parent_categories[self.site].get(self.title, [])

    def is_exhibition(self):
        '''Return True if a parent category title starts with "Art exhibitions ".'''
        return any(parent.title.startswith('Art exhibitions ')
                   for parent in self.parents())

    def names_for_wikidata(self):
        '''
        Return candidate names to look up for this category.

        If check() highlights part of the title, the names come from that
        part, and additionally from both sides of " with ", " at " or
        " wearing " when one of them is present. Otherwise the whole title
        is used, unless the category is date based, names the artist or is
        an exhibition, in which case there are no candidates.
        '''
        highlight = self.check()
        interesting = len(highlight) > 1

        if not interesting:
            if self.date_based() or self.contains_artist_name() or self.is_exhibition():
                return []

            return utils.also_singular(self.title)

        for significant, text in highlight:
            title = text.strip()
            # check() can yield an empty highlighted part (e.g. the title
            # "Portraits of"); there is nothing to capitalise or look up.
            if not significant or not title:
                continue
            title = title[0].upper() + title[1:]
            for sep in ' with ', ' at ', ' wearing ':
                if sep in title:
                    before, _, after = title.partition(sep)
                    names = []
                    for x in title, before, after:
                        names += utils.also_singular(x)
                    return names
            return utils.also_singular(title)

        # No usable highlighted text: return an empty list, not None,
        # because urls_for_wikidata() iterates over the result.
        return []

    def urls_for_wikidata(self):
        '''Return a category URL on this site for every candidate name.'''
        return [utils.wiki_url(name, self.site, ns='Category')
                for name in self.names_for_wikidata()]

    def check(self):
        '''
        Split the title into (significant, text) pairs.

        The texts concatenate back to the title. The pair flagged True is
        the part taken to be the subject of the category. A title that
        matches none of the patterns comes back as one non-significant pair.
        '''
        cat = self.title
        lc_cat = cat.lower()
        by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
                      'medium', 'year', 'painter']

        if self.item:
            # Endings are compared against the lower-cased title, so the
            # labels must be lower-cased too or " by <Artist>" never matches.
            by_endings += [label.lower() for label in self.item.artist_labels()]

        # "<subject> in art" and similar: the subject precedes the marker.
        for after in ('in art', 'in portrait paintings', 'in landscape paintings', 'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2', 'looking at viewer'):
            pos = lc_cat.find(after)
            # don't highlight "1512 in art"
            if pos == -1 or cat[:pos - 1].isdigit():
                continue
            return [(True, cat[:pos]), (False, cat[pos:])]

        # "Paintings of <subject>" and similar: the subject follows the
        # marker, minus a trailing " by <ending>" qualifier when present.
        for before in ('paintings of', 'portraits of', 'landscapes of',
                       'portraits with', 'paintings with', 'paintings depicting',
                       'portraits depicting', 'landscapes depicting', 'works about'):
            pos = lc_cat.find(before)
            if pos == -1:
                continue
            pos += len(before)
            for by_ending in by_endings:
                ending = ' by ' + by_ending
                if lc_cat.endswith(ending):
                    return [(False, cat[:pos]),
                            (True, cat[pos:-len(ending)]),
                            (False, cat[-len(ending):])]

            return [(False, cat[:pos]), (True, cat[pos:])]

        # Fallback: whatever comes before the first "of " is the subject.
        pos = lc_cat.find('of ')
        if pos != -1:
            return [(True, cat[:pos]), (False, cat[pos:])]

        return [(False, cat)]
# depicts/mediawiki.py

def process_cats(cats, site):
    '''Wrap each category record (a mapping with a 'title' key) in a Category.'''
    return [Category(cat['title'], site) for cat in cats]


def get_categories(titles, site):
    '''
    Query the categories of the given page titles on one site.

    Returns a list of (page title, [Category, ...]) tuples. Pages whose
    result has no 'categories' entry, or an empty one, are left out.
    '''
    params = {
        'prop': 'categories',
        'clshow': '!hidden',  # presumably excludes hidden categories; see the MediaWiki API docs
        'cllimit': 'max',
    }
    title_and_cats = []
    for page in mediawiki_query(titles, params, site):
        if 'categories' not in page:
            continue
        cats = process_cats(page['categories'], site)
        if cats:
            title_and_cats.append((page['title'], cats))
    return title_and_cats


# depicts/utils.py

def also_singular(name):
    '''
    Return the name, its singular form if it is plural, and generic
    person terms implied by its words.

    'girl' and 'boy' are added when they appear as words. 'woman' is added
    for female/females/women and 'man' for male/males/men. The result keeps
    first-seen order, contains no duplicates and omits entries listed in
    skip_names.
    '''
    names = also_singular_main(name)
    extra = []
    for n in names:
        words = set(n.lower().split())
        for word in 'girl', 'boy':
            if word in words:
                extra.append(word)
        if {'female', 'females', 'women'} & words:
            extra.append('woman')
        if {'male', 'males', 'men'} & words:
            extra.append('man')
    # The plural and singular variants usually imply the same extras, so
    # drop repeats while keeping the original ordering.
    unique = dict.fromkeys(names + extra)
    return [n for n in unique if n not in skip_names]


def also_singular_main(name):
    '''
    given a plural name return a list of both the
    plural and singular versions
    just return the name if it isn't plural
    '''
    # singular_noun() returns a falsy value when the input is not plural.
    singular = engine.singular_noun(name.strip('|'))
    if not singular:
        return [name]
    n, s = name.lower(), singular.lower()
    # Reject singular forms that add nothing: identical apart from case,
    # differing only by paintings/painting, or the bogus 'venus' -> 'venu'.
    if (n == s or
            n.replace('paintings', '') == s.replace('painting', '') or
            (n == 'venus' and s == 'venu')):
        return [name]
    return [name, singular]