From 97ca0aaa34b111330c4dda833c13a66b1ad5609d Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Thu, 16 Jan 2020 14:01:43 +0000
Subject: [PATCH] Remove some unused code.

---
 depicts/artwork.py   | 203 +------------------------------------------
 depicts/category.py  | 110 -----------------------
 depicts/mediawiki.py |  21 -----
 depicts/utils.py     |  35 +-------
 4 files changed, 4 insertions(+), 365 deletions(-)
 delete mode 100644 depicts/category.py

diff --git a/depicts/artwork.py b/depicts/artwork.py
index e8f8972..5536570 100644
--- a/depicts/artwork.py
+++ b/depicts/artwork.py
@@ -1,101 +1,12 @@
-from . import utils, wdqs, mediawiki
-# import nltk
-import re
-
-re_from_article = re.compile(r'(?:portrays|depicts|depictions of|it shows) (.+?)\.', re.I)
-
-ignore_for_depicts = {
-    43445,     # female organism - use: female (Q6581072)
-    44148,     # male organism - use: male (Q6581097)
-    21075684,  # children - use: child (Q7569)
-    180788,    # National Gallery
-    780294,    # human physical appearance
-    2472587,   # people
-    33659,     # People
-}
-
-query = '''
-select distinct ?item ?itemLabel ?commonscat ?cat_url ?sitelink
-where {
-  service wikibase:label { bd:serviceParam wikibase:language "en" }
-  filter (?item != wd:QID)
-
-  {
-    VALUES (?commonscat) { COMMONS_CAT }
-    ?item wdt:P373 ?commonscat .
-    filter not exists { ?item wdt:P31 wd:Q4167836 }   # Wikimedia category
-    filter not exists { ?item wdt:P31 wd:Q4167410 }   # Wikimedia disambiguation page
-    filter not exists { ?item wdt:P31 wd:Q24046192 }  # Wikimedia category of stubs
-    filter not exists { ?item wdt:P31 wd:Q4167836 }   # Wikimedia list article
-    filter not exists { ?item wdt:P31 wd:Q4663903 }   # Wikimedia portal
-  } union {
-    VALUES (?commonscat) { COMMONS_CAT }
-    ?cat_item wdt:P373 ?commonscat .
-    ?cat_item wdt:P301 ?item .
-  } union {
-    VALUES (?cat_url) { CAT_URL }
-    ?cat_url schema:about ?cat_item .
-    ?cat_item wdt:P301 ?item .
-  } union {
-    VALUES (?sitelink) { SITELINK }
-    ?sitelink schema:about ?item .
-    filter not exists { ?item wdt:P31 wd:Q4167410 }
-  }
-}'''
-
-class QueryResultRow:
-    def __init__(self, row):
-        self.row = {k: (v if k.startswith('item') else [v]) for k, v in row.items()}
-        self.item_id = wdqs.row_id(row)
-        self.label = wdqs.get_row_value(row, 'itemLabel')
-
-    def update(self, row):
-        for key, value in row.items():
-            if key.startswith('item'):
-                continue
-            self.row.setdefault(key, []).append(value)
-
-    @property
-    def url(self):
-        return self.row['item']['value']
-
-    @property
-    def qid(self):
-        return f'Q{self.item_id}'
-
-    def sources(self):
-        return {k: v for k, v in self.row.items() if not k.startswith('item')}
-
-    def sources_list(self):
-
-        def get_value(i):
-            if i['type'] != 'uri':
-                return i['value']
-            wiki_start = i['value'].rfind('/wiki/')
-            return i['value'][wiki_start + 6:]
-
-        return [(k, [get_value(i) for i in v])
-                for k, v in self.row.items()
-                if not k.startswith('item')]
+from . import mediawiki
 
 
 class Artwork:
     def __init__(self, qid):
         self.entity = mediawiki.get_entity_with_cache(qid)
         self.item_id = int(qid[1:])
 
-        if self.enwiki:
-            content, cats = mediawiki.get_content_and_categories(self.enwiki, 'enwiki')
-            self.enwiki_content = content
-            self.enwiki_categories = mediawiki.process_cats(cats, 'enwiki')
-            for cat in self.enwiki_categories:
-                cat.set_item(self)
-        else:
-            self.enwiki_content = None
-            self.enwiki_categories = None
-
         sites = ['commons', 'enwiki']
         self.parent_categories = {site: {} for site in sites}
-        self.categories = self.get_categories()
 
     @property
@@ -194,115 +105,3 @@ class Artwork:
     @property
     def enwiki(self):
         return self.sitelinks['enwiki']['title'] if 'enwiki' in self.sitelinks else None
-
-    def get_categories(self):
-        titles = {'File:' + filename for filename in self.commons_filenames}
-        for commons_cat in self.commons_cats:
-            titles.add('Category:' + commons_cat)
-        if self.commons_sitelink:
-            titles.add(self.commons_sitelink)
-        if not titles:
-            return []
-
-        # cat_list = mediawiki.get_categories(titles, 'commons')
-        cat_list = []
-
-        for title, cats in cat_list:
-            for cat in cats:
-                cat.set_item(self)
-            if not title.startswith('Category:'):
-                continue
-            self.parent_categories['commons'][utils.drop_category_ns(title)] = cats
-
-        get_more_cats = []
-        for _, cats in self.parent_categories['commons'].items():
-            for cat in cats:
-                if cat.title not in self.parent_categories:
-                    get_more_cats.append('Category:' + cat.title)
-
-        for title, cats in mediawiki.get_categories(get_more_cats, 'commons'):
-            for cat in cats:
-                cat.set_item(self)
-            self.parent_categories['commons'][utils.drop_category_ns(title)] = cats
-
-        if self.enwiki:
-            cat_list.append((self.enwiki, self.enwiki_categories))
-
-            get_more_cats = ['Category:' + cat.title for cat in self.enwiki_categories]
-            for title, cats in mediawiki.get_categories(get_more_cats, 'enwiki'):
-                self.parent_categories['enwiki'][utils.drop_category_ns(title)] = cats
-
-        return cat_list
-
-    def depicts_from_enwiki_content(self):
-        if not self.enwiki_url:
-            return
-        for par in self.enwiki_content.split('\n\n'):
-            m = re_from_article.search(par)
-            if m:
-                return m.group(1)
-
-    def query_variables(self):
-        commons_cat = []
-        cat_url = []
-        keywords = []
-        for _, categories in self.categories:
-            for cat in categories:
-                names = cat.names_for_wikidata()
-                keywords += names
-                if cat.site == 'commons':
-                    commons_cat += names
-                    cat_url += cat.urls_for_wikidata()
-
-        text = self.depicts_from_enwiki_content()
-        if text:
-            sentences = nltk.sent_tokenize(text)
-
-            for sentence in sentences:
-                for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
-                    if not utils.word_contains_letter(word):
-                        continue
-                    if not pos.startswith('NN'):
-                        continue
-                    word = word.strip('|')
-                    for k in word.strip('|').split('|'):
-                        if utils.word_contains_letter(k):
-                            keywords += utils.also_singular(k)
-
-        keywords = [k for k in keywords if utils.word_contains_letter(k)]
-
-        return {
-            'commons_cat': commons_cat,
-            'cat_url': cat_url,
-            'keywords': keywords,
-        }
-
-    def build_query(self):
-        query_vars = self.query_variables()
-        sitelinks = [utils.wiki_url(title, 'enwiki') for title in query_vars['keywords']]
-        sitelinks = [url for url in sitelinks if url]
-
-        q = query.replace('COMMONS_CAT', wdqs.quote_list(query_vars['commons_cat']))
-        q = q.replace('CAT_URL', wdqs.url_list(query_vars['cat_url']))
-        q = q.replace('QID', self.qid)
-        q = q.replace('SITELINK', wdqs.url_list(sitelinks))
-        return q
-
-    def run_query(self):
-        query = self.build_query()
-
-        rows = wdqs.run_query_with_cache(query)
-        by_id = {}
-        results = []
-        for row in rows:
-            item_id = wdqs.row_id(row)
-            if item_id in ignore_for_depicts:
-                continue
-            if item_id in by_id:
-                by_id[item_id].update(row)
-                continue
-            hit = QueryResultRow(row)
-            by_id[item_id] = hit
-            results.append(hit)
-
-        return sorted(results, key=lambda hit: hit.item_id)
diff --git a/depicts/category.py b/depicts/category.py
deleted file mode 100644
index 2d350e0..0000000
--- a/depicts/category.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from . import utils
-import re
-import calendar
-
-month_pattern = '|'.join(m for m in calendar.month_name if m)
-re_date_based = re.compile(r'^(\d{4}-\d{2}-\d{2}|(' + month_pattern + r') \d{4}|\d{4}s?|\d{1,2}(st|nd|rd|th)-century) ')
-
-ns_cat = 'Category:'
-
-class Category:
-    def __init__(self, title, site):
-        if title.startswith(ns_cat):
-            title = title[len(ns_cat):]
-        self.title = title
-        self.site = site
-        self.item = None
-
-    def __repr__(self):
-        return f'{self.__class__.__name__}({self.title!r}, {self.site!r})'
-
-    def set_item(self, item):
-        self.item = item
-
-    @property
-    def url(self):
-        return utils.wiki_url(self.title, self.site, ns='Category')
-
-    def date_based(self):
-        return bool(re_date_based.match(self.title))
-
-    def contains_artist_name(self):
-        if not self.item:
-            return
-        return any(artist.lower() in self.title.lower()
-                   for artist in self.item.artist_labels())
-
-    def parents(self):
-        if not self.item:
-            return []
-        return self.item.parent_categories[self.site].get(self.title, [])
-
-    def is_exhibition(self):
-        return any(parent.title.startswith('Art exhibitions ')
-                   for parent in self.parents())
-
-    def names_for_wikidata(self):
-        highlight = self.check()
-        interesting = len(highlight) > 1
-
-        if not interesting:
-            if self.date_based() or self.contains_artist_name() or self.is_exhibition():
-                return []
-
-            return utils.also_singular(self.title)
-
-        for significant, text in highlight:
-            if not significant:
-                continue
-            title = text.strip()
-            title = title[0].upper() + title[1:]
-            for sep in ' with ', ' at ', ' wearing ':
-                if sep in title:
-                    before, _, after = title.partition(sep)
-                    names = []
-                    for x in title, before, after:
-                        names += utils.also_singular(x)
-                    return names
-            return utils.also_singular(title)
-
-    def urls_for_wikidata(self):
-        return [utils.wiki_url(name, self.site, ns='Category')
-                for name in self.names_for_wikidata()]
-
-    def check(self):
-        cat = self.title
-        lc_cat = cat.lower()
-        by_endings = ['title', 'technique', 'period', 'century', 'country', 'movement',
-                      'medium', 'year', 'painter']
-
-        if self.item:
-            by_endings += self.item.artist_labels()
-
-        for after in ('in art', 'in portrait paintings', 'in landscape paintings', 'in culture', 'in popular culture', 'in painting', 'in 1', 'in 2', 'looking at viewer'):
-            pos = lc_cat.find(after)
-            # don't highlight "1512 in art"
-            if pos == -1 or cat[:pos - 1].isdigit():
-                continue
-            return [(True, cat[:pos]), (False, cat[pos:])]
-
-        for before in ('paintings of', 'portraits of', 'landscapes of',
-                       'portraits with', 'paintings with', 'paintings depicting',
-                       'portraits depicting', 'landscapes depicting', 'works about'):
-            pos = lc_cat.find(before)
-            if pos == -1:
-                continue
-            pos += len(before)
-            for by_ending in by_endings:
-                ending = ' by ' + by_ending
-                if lc_cat.endswith(ending):
-                    return [(False, cat[:pos]),
-                            (True, cat[pos:-len(ending)]),
-                            (False, cat[-len(ending):])]
-
-            return [(False, cat[:pos]), (True, cat[pos:])]
-
-        pos = lc_cat.find('of ')
-        if pos != -1:
-            return [(True, cat[:pos]), (False, cat[pos:])]
-
-        return [(False, cat)]
diff --git a/depicts/mediawiki.py b/depicts/mediawiki.py
index e0d7f9e..d01b669 100644
--- a/depicts/mediawiki.py
+++ b/depicts/mediawiki.py
@@ -2,7 +2,6 @@ import requests
 import os
 import json
 import hashlib
-from .category import Category
 from . import utils
 
 wikidata_url = 'https://www.wikidata.org/w/api.php'
@@ -145,26 +144,6 @@ def get_content_and_categories(title, site):
 def host_from_site(site):
     return hosts[site]
 
-def process_cats(cats, site):
-    return [Category(cat['title'], site) for cat in cats]
-
-def get_categories(titles, site):
-    params = {
-        'prop': 'categories',
-        'clshow': '!hidden',
-        'cllimit': 'max',
-    }
-    from_wiki = mediawiki_query(titles, params, site)
-    title_and_cats = []
-    for i in from_wiki:
-        if 'categories' not in i:
-            continue
-        cats = process_cats(i['categories'], site)
-        if not cats:
-            continue
-        title_and_cats.append((i['title'], cats))
-    return title_and_cats
-
 def get_history(title, site):
     params = {
         'prop': 'revisions',
diff --git a/depicts/utils.py b/depicts/utils.py
index 7c83f1e..0064b23 100644
--- a/depicts/utils.py
+++ b/depicts/utils.py
@@ -2,7 +2,6 @@ from flask import request
 from itertools import islice
 from datetime import datetime
 import urllib.parse
-import inflect
 
 hosts = {
     'commons': 'commons.wikimedia.org',
@@ -10,8 +9,6 @@ hosts = {
     'wikidata': 'www.wikidata.org',
 }
 
-engine = inflect.engine()
-
 skip_names = {
     'National Gallery'
 }
@@ -30,38 +27,12 @@ def drop_start(s, start):
 def drop_category_ns(s):
     return drop_start(s, 'Category:')
 
+def parse_sitelink(s, start):
+    return urllib.parse.unquote(drop_start(s, start)).replace('_', ' ')
+
 def word_contains_letter(word):
     return any(c.isalpha() for c in word)
 
-def also_singular(name):
-    names = also_singular_main(name)
-    extra = []
-    for n in names:
-        words = set(n.lower().split())
-        for word in 'girl', 'boy':
-            if word in words:
-                extra.append(word)
-        if {'female', 'females', 'women'} & words:
-            extra.append('woman')
-        if {'male', 'males', 'men'} & words:
-            extra.append('man')
-    return [n for n in names + extra if n not in skip_names]
-
-def also_singular_main(name):
-    '''
-    given a singular name return a list of both the plural and singular versions
-    just return the name if it isn't singular
-    '''
-    singular = engine.singular_noun(name.strip('|'))
-    if not singular:
-        return [name]
-    n, s = name.lower(), singular.lower()
-    if (n == s or
-        n.replace('paintings', '') == s.replace('painting', '') or
-        n == 'venus' and s == 'venu'):
-        return [name]
-    return [name, singular]
-
 def wiki_url(title, site, ns=None):
     host = hosts[site]
     url_ns = ns + ':' if ns else ''
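
The only code this patch adds is the parse_sitelink helper in depicts/utils.py. A minimal, self-contained sketch of how it could be used follows; the drop_start body, the example URL, and the printed output are illustrative assumptions, not part of the patch.

    import urllib.parse

    def drop_start(s, start):
        # assumed to mirror depicts.utils.drop_start: strip a required prefix
        assert s.startswith(start)
        return s[len(start):]

    def parse_sitelink(s, start):
        # helper added by this patch: drop the URL prefix, decode
        # percent-escapes, and turn underscores back into spaces
        return urllib.parse.unquote(drop_start(s, start)).replace('_', ' ')

    # hypothetical usage: recover an enwiki article title from a sitelink URL
    title = parse_sitelink('https://en.wikipedia.org/wiki/Caf%C3%A9_Terrace_at_Night',
                           'https://en.wikipedia.org/wiki/')
    print(title)  # Café Terrace at Night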