Refactor

2020-06-30 09:01:57 +01:00 · 2020-06-30 09:01:57 +01:00 · dc9989157c
commit dc9989157c
parent 9299909e2f
4 changed files with 163 additions and 1 deletions
--- a/depicts/artwork.py
+++ b/depicts/artwork.py
@ -7,7 +7,6 @@ class Artwork:

        sites = ['commons', 'enwiki']
        self.parent_categories = {site: {} for site in sites}
-        self.categories = self.get_categories()

    @property
    def image_filename(self):
--- a/depicts/category.py
+++ b/depicts/category.py
@ -0,0 +1,110 @@
+from . import utils
+import re
+import calendar
+
+month_pattern = '|'.join(m for m in calendar.month_name if m)
+re_date_based = re.compile(r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) ')
+
+ns_cat = 'Category:'
+
+class Category:
+    def __init__(self, title, site):
+        if title.startswith(ns_cat):
+            title = title[len(ns_cat):]
+        self.title = title
+        self.site = site
+        self.item = None
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'
+
+    def set_item(self, item):
+        self.item = item
+
+    @property
+    def url(self):
+        return utils.wiki_url(self.title, self.site, ns='Category')
+
+    def date_based(self):
+        return bool(re_date_based.match(self.title))
+
+    def contains_artist_name(self):
+        if not self.item:
+            return
+        return any(artist.lower() in self.title.lower()
+                   for artist in self.item.artist_labels())
+
+    def parents(self):
+        if not self.item:
+            return []
+        return self.item.parent_categories[self.site].get(self.title, [])
+
+    def is_exhibition(self):
+        return any(parent.title.startswith('Art exhibitions ')
+                   for parent in self.parents())
+
+    def names_for_wikidata(self):
+        highlight = self.check()
+        interesting = len(highlight) > 1
+
+        if not interesting:
+            if self.date_based() or self.contains_artist_name() or self.is_exhibition():
+                return []
+
+            return utils.also_singular(self.title)
+
+        for significant, text in highlight:
+            if not significant:
+                continue
+            title = text.strip()
+            title = title[0].upper() + title[1:]
+            for sep in ' with ', ' at ', ' wearing ':
+                if sep in title:
+                    before, _, after = title.partition(sep)
+                    names = []
+                    for x in title, before, after:
+                        names += utils.also_singular(x)
+                    return names
+            return utils.also_singular(title)
+
+    def urls_for_wikidata(self):
+        return [utils.wiki_url(name, self.site, ns='Category')
+                for name in self.names_for_wikidata()]
+
+    def check(self):
+        cat = self.title
+        lc_cat = cat.lower()
+        by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
+                      'medium', 'year', 'painter']
+
+        if self.item:
+            by_endings += self.item.artist_labels()
+
+        for after in ('in art', 'in portrait paintings', 'in landscape paintings', 'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2', 'looking at viewer'):
+            pos = lc_cat.find(after)
+            # don't highlight "1512 in art"
+            if pos == -1 or cat[:pos - 1].isdigit():
+                continue
+            return [(True, cat[:pos]), (False, cat[pos:])]
+
+        for before in ('paintings of', 'portraits of', 'landscapes of',
+                       'portraits with', 'paintings with', 'paintings depicting',
+                       'portraits depicting', 'landscapes depicting', 'works about'):
+            pos = lc_cat.find(before)
+            if pos == -1:
+                continue
+            pos += len(before)
+            for by_ending in by_endings:
+                ending = ' by ' + by_ending
+                if lc_cat.endswith(ending):
+                    return [(False, cat[:pos]),
+                            (True, cat[pos:-len(ending)]),
+                            (False, cat[-len(ending):])]
+
+            return [(False, cat[:pos]), (True, cat[pos:])]
+
+        pos = lc_cat.find('of ')
+        if pos != -1:
+            return [(True, cat[:pos]), (False, cat[pos:])]
+
+        return [(False, cat)]
--- a/depicts/mediawiki.py
+++ b/depicts/mediawiki.py
@ -2,6 +2,7 @@ import requests
 import os
 import json
 import hashlib
+from .category import Category
 from . import utils

 wikidata_url = 'https://www.wikidata.org/w/api.php'
@ -148,6 +149,26 @@ def get_content_and_categories(title, site):
 def host_from_site(site):
    return hosts[site]

+def process_cats(cats, site):
+    return [Category(cat['title'], site) for cat in cats]
+
+def get_categories(titles, site):
+    params = {
+        'prop': 'categories',
+        'clshow': '!hidden',
+        'cllimit': 'max',
+    }
+    from_wiki = mediawiki_query(titles, params, site)
+    title_and_cats = []
+    for i in from_wiki:
+        if 'categories' not in i:
+            continue
+        cats = process_cats(i['categories'], site)
+        if not cats:
+            continue
+        title_and_cats.append((i['title'], cats))
+    return title_and_cats
+
 def get_history(title, site):
    params = {
        'prop': 'revisions',
--- a/depicts/utils.py
+++ b/depicts/utils.py
@ -2,6 +2,7 @@ from flask import request
 from itertools import islice
 from datetime import datetime
 import urllib.parse
+import inflect

 hosts = {
    'commons': 'commons.wikimedia.org',
@ -9,6 +10,8 @@ hosts = {
    'wikidata': 'www.wikidata.org',
 }

+engine = inflect.engine()
+
 skip_names = {
    'National Gallery'
 }
@ -33,6 +36,35 @@ def parse_sitelink(s, start):
 def word_contains_letter(word):
    return any(c.isalpha() for c in word)

+def also_singular(name):
+    names = also_singular_main(name)
+    extra = []
+    for n in names:
+        words = set(n.lower().split())
+        for word in 'girl', 'boy':
+            if word in words:
+                extra.append(word)
+        if {'female', 'females', 'women'} & words:
+            extra.append('woman')
+        if {'male', 'males', 'men'} & words:
+            extra.append('man')
+    return [n for n in names + extra if n not in skip_names]
+
+def also_singular_main(name):
+    '''
+    given a singular name return a list of both the plural and singular versions
+    just return the name if it isn't singular
+    '''
+    singular = engine.singular_noun(name.strip('|'))
+    if not singular:
+        return [name]
+    n, s = name.lower(), singular.lower()
+    if (n == s or
+            n.replace('paintings', '') == s.replace('painting', '') or
+            n == 'venus' and s == 'venu'):
+        return [name]
+    return [name, singular]
+
 def wiki_url(title, site, ns=None):
    host = hosts[site]
    url_ns = ns + ':' if ns else ''