Remove some unused code.

2020-01-16 14:01:43 +00:00 · 2020-01-16 14:01:43 +00:00 · 97ca0aaa34
parent 93df781115
commit 97ca0aaa34
4 changed files with 4 additions and 365 deletions
--- a/depicts/artwork.py
+++ b/depicts/artwork.py
@@ -1,101 +1,12 @@
-from . import utils, wdqs, mediawiki
-# import nltk
-import re
-
-re_from_article = re.compile(r'(?:portrays|depicts|depictions of|it shows) (.+?)\.', re.I)
-
-ignore_for_depicts = {
-    43445,     # female organism - use: female (Q6581072)
-    44148,     # male organism   - use: male (Q6581097)
-    21075684,  # children        - use: child (Q7569)
-    180788,    # National Gallery
-    780294,    # human physical appearance
-    2472587,   # people
-    33659,     # People
-}
-
-query = '''
-select distinct ?item ?itemLabel ?commonscat ?cat_url ?sitelink
-where {
-  service wikibase:label { bd:serviceParam wikibase:language "en" }
-  filter (?item != wd:QID)
-
-  {
-    VALUES (?commonscat) { COMMONS_CAT }
-    ?item wdt:P373 ?commonscat .
-    filter not exists { ?item wdt:P31 wd:Q4167836 }  # Wikimedia category
-    filter not exists { ?item wdt:P31 wd:Q4167410 }  # Wikimedia disambiguation page
-    filter not exists { ?item wdt:P31 wd:Q24046192 } # Wikimedia category of stubs
-    filter not exists { ?item wdt:P31 wd:Q4167836 }  # Wikimedia list article
-    filter not exists { ?item wdt:P31 wd:Q4663903 }  # Wikimedia portal
-  } union {
-    VALUES (?commonscat) { COMMONS_CAT }
-    ?cat_item wdt:P373 ?commonscat .
-    ?cat_item wdt:P301 ?item .
-  } union {
-    VALUES (?cat_url) { CAT_URL }
-    ?cat_url schema:about ?cat_item .
-    ?cat_item wdt:P301 ?item .
-  } union {
-    VALUES (?sitelink) { SITELINK }
-    ?sitelink schema:about ?item .
-    filter not exists { ?item wdt:P31 wd:Q4167410 }
-  }
-}'''
-
-class QueryResultRow:
-    def __init__(self, row):
-        self.row = {k: (v if k.startswith('item') else [v]) for k, v in row.items()}
-        self.item_id = wdqs.row_id(row)
-        self.label = wdqs.get_row_value(row, 'itemLabel')
-
-    def update(self, row):
-        for key, value in row.items():
-            if key.startswith('item'):
-                continue
-            self.row.setdefault(key, []).append(value)
-
-    @property
-    def url(self):
-        return self.row['item']['value']
-
-    @property
-    def qid(self):
-        return f'Q{self.item_id}'
-
-    def sources(self):
-        return {k: v for k, v in self.row.items() if not k.startswith('item')}
-
-    def sources_list(self):
-
-        def get_value(i):
-            if i['type'] != 'uri':
-                return i['value']
-            wiki_start = i['value'].rfind('/wiki/')
-            return i['value'][wiki_start + 6:]
-
-        return [(k, [get_value(i) for i in v])
-                for k, v in self.row.items()
-                if not k.startswith('item')]
+from . import mediawiki

 class Artwork:
    def __init__(self, qid):
        self.entity = mediawiki.get_entity_with_cache(qid)
        self.item_id = int(qid[1:])

-        if self.enwiki:
-            content, cats = mediawiki.get_content_and_categories(self.enwiki, 'enwiki')
-            self.enwiki_content = content
-            self.enwiki_categories = mediawiki.process_cats(cats, 'enwiki')
-            for cat in self.enwiki_categories:
-                cat.set_item(self)
-        else:
-            self.enwiki_content = None
-            self.enwiki_categories = None
-
        sites = ['commons', 'enwiki']
        self.parent_categories = {site: {} for site in sites}
-
        self.categories = self.get_categories()

    @property
@@ -194,115 +105,3 @@ class Artwork:
    @property
    def enwiki(self):
        return self.sitelinks['enwiki']['title'] if 'enwiki' in self.sitelinks else None
-
-    def get_categories(self):
-        titles = {'File:' + filename for filename in self.commons_filenames}
-        for commons_cat in self.commons_cats:
-            titles.add('Category:' + commons_cat)
-        if self.commons_sitelink:
-            titles.add(self.commons_sitelink)
-        if not titles:
-            return []
-
-        # cat_list = mediawiki.get_categories(titles, 'commons')
-        cat_list = []
-
-        for title, cats in cat_list:
-            for cat in cats:
-                cat.set_item(self)
-            if not title.startswith('Category:'):
-                continue
-            self.parent_categories['commons'][utils.drop_category_ns(title)] = cats
-
-        get_more_cats = []
-        for _, cats in self.parent_categories['commons'].items():
-            for cat in cats:
-                if cat.title not in self.parent_categories:
-                    get_more_cats.append('Category:' + cat.title)
-
-        for title, cats in mediawiki.get_categories(get_more_cats, 'commons'):
-            for cat in cats:
-                cat.set_item(self)
-            self.parent_categories['commons'][utils.drop_category_ns(title)] = cats
-
-        if self.enwiki:
-            cat_list.append((self.enwiki, self.enwiki_categories))
-
-            get_more_cats = ['Category:' + cat.title for cat in self.enwiki_categories]
-            for title, cats in mediawiki.get_categories(get_more_cats, 'enwiki'):
-                self.parent_categories['enwiki'][utils.drop_category_ns(title)] = cats
-
-        return cat_list
-
-    def depicts_from_enwiki_content(self):
-        if not self.enwiki_url:
-            return
-        for par in self.enwiki_content.split('\n\n'):
-            m = re_from_article.search(par)
-            if m:
-                return m.group(1)
-
-    def query_variables(self):
-        commons_cat = []
-        cat_url = []
-        keywords = []
-        for _, categories in self.categories:
-            for cat in categories:
-                names = cat.names_for_wikidata()
-                keywords += names
-                if cat.site == 'commons':
-                    commons_cat += names
-                cat_url += cat.urls_for_wikidata()
-
-        text = self.depicts_from_enwiki_content()
-        if text:
-            sentences = nltk.sent_tokenize(text)
-
-            for sentence in sentences:
-                for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
-                    if not utils.word_contains_letter(word):
-                        continue
-                    if not pos.startswith('NN'):
-                        continue
-                    word = word.strip('|')
-                    for k in word.strip('|').split('|'):
-                        if utils.word_contains_letter(k):
-                            keywords += utils.also_singular(k)
-
-        keywords = [k for k in keywords if utils.word_contains_letter(k)]
-
-        return {
-            'commons_cat': commons_cat,
-            'cat_url': cat_url,
-            'keywords': keywords,
-        }
-
-    def build_query(self):
-        query_vars = self.query_variables()
-        sitelinks = [utils.wiki_url(title, 'enwiki') for title in query_vars['keywords']]
-        sitelinks = [url for url in sitelinks if url]
-
-        q = query.replace('COMMONS_CAT', wdqs.quote_list(query_vars['commons_cat']))
-        q = q.replace('CAT_URL', wdqs.url_list(query_vars['cat_url']))
-        q = q.replace('QID', self.qid)
-        q = q.replace('SITELINK', wdqs.url_list(sitelinks))
-        return q
-
-    def run_query(self):
-        query = self.build_query()
-
-        rows = wdqs.run_query_with_cache(query)
-        by_id = {}
-        results = []
-        for row in rows:
-            item_id = wdqs.row_id(row)
-            if item_id in ignore_for_depicts:
-                continue
-            if item_id in by_id:
-                by_id[item_id].update(row)
-                continue
-            hit = QueryResultRow(row)
-            by_id[item_id] = hit
-            results.append(hit)
-
-        return sorted(results, key=lambda hit: hit.item_id)
--- a/depicts/category.py
+++ b/depicts/category.py
@@ -1,110 +0,0 @@
-from . import utils
-import re
-import calendar
-
-month_pattern = '|'.join(m for m in calendar.month_name if m)
-re_date_based = re.compile(r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) ')
-
-ns_cat = 'Category:'
-
-class Category:
-    def __init__(self, title, site):
-        if title.startswith(ns_cat):
-            title = title[len(ns_cat):]
-        self.title = title
-        self.site = site
-        self.item = None
-
-    def __repr__(self):
-        return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'
-
-    def set_item(self, item):
-        self.item = item
-
-    @property
-    def url(self):
-        return utils.wiki_url(self.title, self.site, ns='Category')
-
-    def date_based(self):
-        return bool(re_date_based.match(self.title))
-
-    def contains_artist_name(self):
-        if not self.item:
-            return
-        return any(artist.lower() in self.title.lower()
-                   for artist in self.item.artist_labels())
-
-    def parents(self):
-        if not self.item:
-            return []
-        return self.item.parent_categories[self.site].get(self.title, [])
-
-    def is_exhibition(self):
-        return any(parent.title.startswith('Art exhibitions ')
-                   for parent in self.parents())
-
-    def names_for_wikidata(self):
-        highlight = self.check()
-        interesting = len(highlight) > 1
-
-        if not interesting:
-            if self.date_based() or self.contains_artist_name() or self.is_exhibition():
-                return []
-
-            return utils.also_singular(self.title)
-
-        for significant, text in highlight:
-            if not significant:
-                continue
-            title = text.strip()
-            title = title[0].upper() + title[1:]
-            for sep in ' with ', ' at ', ' wearing ':
-                if sep in title:
-                    before, _, after = title.partition(sep)
-                    names = []
-                    for x in title, before, after:
-                        names += utils.also_singular(x)
-                    return names
-            return utils.also_singular(title)
-
-    def urls_for_wikidata(self):
-        return [utils.wiki_url(name, self.site, ns='Category')
-                for name in self.names_for_wikidata()]
-
-    def check(self):
-        cat = self.title
-        lc_cat = cat.lower()
-        by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
-                      'medium', 'year', 'painter']
-
-        if self.item:
-            by_endings += self.item.artist_labels()
-
-        for after in ('in art', 'in portrait paintings', 'in landscape paintings', 'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2', 'looking at viewer'):
-            pos = lc_cat.find(after)
-            # don't highlight "1512 in art"
-            if pos == -1 or cat[:pos - 1].isdigit():
-                continue
-            return [(True, cat[:pos]), (False, cat[pos:])]
-
-        for before in ('paintings of', 'portraits of', 'landscapes of',
-                       'portraits with', 'paintings with', 'paintings depicting',
-                       'portraits depicting', 'landscapes depicting', 'works about'):
-            pos = lc_cat.find(before)
-            if pos == -1:
-                continue
-            pos += len(before)
-            for by_ending in by_endings:
-                ending = ' by ' + by_ending
-                if lc_cat.endswith(ending):
-                    return [(False, cat[:pos]),
-                            (True, cat[pos:-len(ending)]),
-                            (False, cat[-len(ending):])]
-
-            return [(False, cat[:pos]), (True, cat[pos:])]
-
-        pos = lc_cat.find('of ')
-        if pos != -1:
-            return [(True, cat[:pos]), (False, cat[pos:])]
-
-        return [(False, cat)]
--- a/depicts/mediawiki.py
+++ b/depicts/mediawiki.py
@@ -2,7 +2,6 @@ import requests
 import os
 import json
 import hashlib
-from .category import Category
 from . import utils

 wikidata_url = 'https://www.wikidata.org/w/api.php'
@@ -145,26 +144,6 @@ def get_content_and_categories(title, site):
 def host_from_site(site):
    return hosts[site]

-def process_cats(cats, site):
-    return [Category(cat['title'], site) for cat in cats]
-
-def get_categories(titles, site):
-    params = {
-        'prop': 'categories',
-        'clshow': '!hidden',
-        'cllimit': 'max',
-    }
-    from_wiki = mediawiki_query(titles, params, site)
-    title_and_cats = []
-    for i in from_wiki:
-        if 'categories' not in i:
-            continue
-        cats = process_cats(i['categories'], site)
-        if not cats:
-            continue
-        title_and_cats.append((i['title'], cats))
-    return title_and_cats
-
 def get_history(title, site):
    params = {
        'prop': 'revisions',
--- a/depicts/utils.py
+++ b/depicts/utils.py
@@ -2,7 +2,6 @@ from flask import request
 from itertools import islice
 from datetime import datetime
 import urllib.parse
-import inflect

 hosts = {
    'commons': 'commons.wikimedia.org',
@@ -10,8 +9,6 @@ hosts = {
    'wikidata': 'www.wikidata.org',
 }

-engine = inflect.engine()
-
 skip_names = {
    'National Gallery'
 }
@@ -30,38 +27,12 @@ def drop_start(s, start):
 def drop_category_ns(s):
    return drop_start(s, 'Category:')

+def parse_sitelink(s, start):
+    return urllib.parse.unquote(drop_start(s, start)).replace('_', ' ')
+
 def word_contains_letter(word):
    return any(c.isalpha() for c in word)

-def also_singular(name):
-    names = also_singular_main(name)
-    extra = []
-    for n in names:
-        words = set(n.lower().split())
-        for word in 'girl', 'boy':
-            if word in words:
-                extra.append(word)
-        if {'female', 'females', 'women'} & words:
-            extra.append('woman')
-        if {'male', 'males', 'men'} & words:
-            extra.append('man')
-    return [n for n in names + extra if n not in skip_names]
-
-def also_singular_main(name):
-    '''
-    given a singular name return a list of both the plural and singular versions
-    just return the name if it isn't singular
-    '''
-    singular = engine.singular_noun(name.strip('|'))
-    if not singular:
-        return [name]
-    n, s = name.lower(), singular.lower()
-    if (n == s or
-            n.replace('paintings', '') == s.replace('painting', '') or
-            n == 'venus' and s == 'venu'):
-        return [name]
-    return [name, singular]
-
 def wiki_url(title, site, ns=None):
    host = hosts[site]
    url_ns = ns + ':' if ns else ''