From d1ac75583bb0e76c49ecc0fa383179d809606477 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Thu, 10 Oct 2019 10:58:42 +0100 Subject: [PATCH] Refactor: move catalog code again --- app.py | 85 ++++------------------ depicts/parse_catalog.py | 71 ------------------ depicts/wd_catalog.py | 151 +++++++++++++++++++++++++++++++++++++++ templates/item.html | 10 +-- 4 files changed, 169 insertions(+), 148 deletions(-) delete mode 100644 depicts/parse_catalog.py diff --git a/app.py b/app.py index f19fdff..edf2d3e 100755 --- a/app.py +++ b/app.py @@ -1,10 +1,8 @@ #!/usr/bin/python3 from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session -from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database, - dia, rijksmuseum, npg, museodelprado, barnesfoundation, - wd_catalog, human, wikibase, wikidata_oauth, parse_catalog, - wikidata_edit) +from depicts import (utils, wdqs, commons, mediawiki, painting, database, + wd_catalog, human, wikibase, wikidata_oauth, wikidata_edit) from depicts.pager import Pagination, init_pager from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem, Language) @@ -14,8 +12,6 @@ from werkzeug.exceptions import InternalServerError from werkzeug.debug.tbtools import get_current_traceback from sqlalchemy import func, distinct from collections import defaultdict -import requests.exceptions -import requests import json import os import locale @@ -208,7 +204,6 @@ def random_painting(): @app.route('/oauth/start') def start_oauth(): - next_page = request.args.get('next') if next_page: session['after_login'] = next_page @@ -319,6 +314,14 @@ def existing_depicts_from_entity(entity): existing.append(d) return existing +def get_institution(entity, other): + if 'P276' in entity['claims']: + location = wikibase.first_datavalue(entity, 'P276')['id'] + return other[location] + elif 'P195' in entity['claims']: + collection = wikibase.first_datavalue(entity, 'P195')['id'] + return 
other[collection] + return '???' + @app.route("/item/Q<int:item_id>") def item_page(item_id): qid = f'Q{item_id}' @@ -342,74 +345,14 @@ def item_page(item_id): people = human.from_name(label) if label else None - if 'P276' in entity['claims']: - location = wikibase.first_datavalue(entity, 'P276')['id'] - institution = other[location] - elif 'P195' in entity['claims']: - collection = wikibase.first_datavalue(entity, 'P195')['id'] - institution = other[collection] - else: - institution = '???' - painting_item = PaintingItem.query.get(item_id) if painting_item is None: painting_item = PaintingItem(item_id=item_id, label=label, entity=entity) database.session.add(painting_item) - catalog_ids = wd_catalog.find_catalog_id(entity) - catalog_detail = [] - for property_id in sorted(catalog_ids): - value = wikibase.first_datavalue(entity, property_id) - detail = wd_catalog.lookup(property_id, value) - catalog_detail.append(detail) - - catalog_url = wikibase.first_datavalue(entity, 'P973') - - catalog = None - try: - if 'P4704' in entity['claims']: - saam_id = wikibase.first_datavalue(entity, 'P4704') - catalog = saam.get_catalog(saam_id) - elif 'P4709' in entity['claims']: - catalog_id = wikibase.first_datavalue(entity, 'P4709') - catalog = barnesfoundation.get_catalog(catalog_id) - elif catalog_url and 'www.dia.org' in catalog_url: - catalog = dia.get_catalog(catalog_url) - elif catalog_url and 'www.rijksmuseum.nl' in catalog_url: - catalog = rijksmuseum.get_catalog(catalog_url) - elif catalog_url and 'www.npg.org.uk' in catalog_url: - catalog = npg.get_catalog(catalog_url) - elif catalog_url and 'www.museodelprado.es' in catalog_url: - catalog = museodelprado.get_catalog(catalog_url) - - if not catalog and catalog_url: - html = parse_catalog.get_catalog_url(catalog_url) - description = parse_catalog.get_description_from_page(html) - if description: - catalog = { - 'institution': institution, - 'description': description, - } - - if not catalog and catalog_ids: - for property_id in 
sorted(catalog_ids): - if property_id == 'P350': - continue # RKDimages ID - value = wikibase.first_datavalue(entity, property_id) - detail = wd_catalog.lookup(property_id, value) - try: - html = parse_catalog.get_catalog_page(property_id, value) - except (requests.exceptions.ConnectionError, requests.exceptions.SSLError): - continue # ignore this error - description = parse_catalog.get_description_from_page(html) - if not description: - continue - catalog = { - 'institution': detail['label'], - 'description': description, - } - except requests.exceptions.ReadTimeout: - pass + catalog = wd_catalog.get_catalog_from_painting(entity) + if not catalog.get('institution'): + catalog['institution'] = get_institution(entity, other) label_languages = label_and_language['languages'] if label_and_language else [] show_translation_links = all(lang.code != 'en' for lang in label_languages) @@ -418,8 +361,6 @@ def item_page(item_id): item_id=item_id, item=item, catalog=catalog, - catalog_url=catalog_url, - catalog_detail=catalog_detail, labels=find_more_props, entity=item.entity, username=wikidata_oauth.get_username(), diff --git a/depicts/parse_catalog.py b/depicts/parse_catalog.py deleted file mode 100644 index 2dc3d22..0000000 --- a/depicts/parse_catalog.py +++ /dev/null @@ -1,71 +0,0 @@ -from depicts import wd_catalog, relaxed_ssl - -import lxml.html -import os.path -import requests -import hashlib - -user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0' - -def get_description_from_page(html): - root = lxml.html.fromstring(html) - div = root.find('.//div[@itemprop="description"]') - if div is not None: - return div.text - - div_list = root.find_class('item-description') - if len(div_list): - return div_list[0].text_content() - - meta_twitter_description = root.find('.//meta[@name="twitter:description"]') - if meta_twitter_description is None: - return - twitter_description = meta_twitter_description.get('content') - if not twitter_description: - 
return - twitter_description = twitter_description.strip() - - if not twitter_description: - return - - for element in root.getiterator(): - if not element.text: - continue - text = element.text.strip() - if not text: - continue - if text != twitter_description and text.startswith(twitter_description): - return text - - return twitter_description - -def get_catalog_page(property_id, value): - detail = wd_catalog.lookup(property_id, value) - url = detail['url'] - catalog_id = value.replace('/', '_') - - filename = f'cache/{property_id}_{catalog_id}.html' - - if os.path.exists(filename): - html = open(filename, 'rb').read() - else: - r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2) - html = r.content - open(filename, 'wb').write(html) - - return html - -def get_catalog_url(url): - md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html' - filename = 'cache/' + md5_filename - - if os.path.exists(filename): - html = open(filename, 'rb').read() - else: - r = relaxed_ssl.get(url, - headers={'User-Agent': user_agent}, - timeout=2) - html = r.content - open(filename, 'wb').write(html) - - return html diff --git a/depicts/wd_catalog.py b/depicts/wd_catalog.py index 9b13089..1e2dbc4 100644 --- a/depicts/wd_catalog.py +++ b/depicts/wd_catalog.py @@ -1,3 +1,13 @@ +from depicts import (wikibase, relaxed_ssl, saam, dia, rijksmuseum, npg, + museodelprado, barnesfoundation) +import requests +import requests.exceptions +import lxml.html +import os.path +import hashlib + +user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0' + table = { 'P347': ('Joconde ID', 'https://www.pop.culture.gouv.fr/notice/joconde/$1'), 'P350': ('RKDimages ID', 'https://rkd.nl/explore/images/$1'), @@ -108,3 +118,144 @@ def lookup(property_id, value): def find_catalog_id(entity): return table.keys() & entity['claims'].keys() + +def check_catalog(entity, catalog): + catalog_url = catalog['url'] + catalog_ids = catalog['ids'] + + if 'P4704' in 
entity['claims']: + saam_id = wikibase.first_datavalue(entity, 'P4704') + cat = saam.get_catalog(saam_id) + if cat: + catalog.update(cat) + return + + if 'P4709' in entity['claims']: + catalog_id = wikibase.first_datavalue(entity, 'P4709') + cat = barnesfoundation.get_catalog(catalog_id) + if cat: + catalog.update(cat) + return + + institutions = [ + ('www.dia.org', dia), + ('www.rijksmuseum.nl', rijksmuseum), + ('www.npg.org.uk', npg), + ('www.museodelprado.es', museodelprado), + ] + + if catalog_url: + for host, module in institutions: + if host in catalog_url: + cat = module.get_catalog(catalog_url) + if not cat: + continue + catalog.update(cat) + return + + html = get_catalog_url(catalog_url) + description = get_description_from_page(html) + if description: + catalog['description'] = description + return + + for property_id in sorted(catalog_ids): + if property_id == 'P350': + continue # RKDimages ID + value = wikibase.first_datavalue(entity, property_id) + detail = lookup(property_id, value) + try: + html = get_catalog_page(property_id, value) + except (requests.exceptions.ConnectionError, requests.exceptions.SSLError): + continue # ignore this error + description = get_description_from_page(html) + if not description: + continue + catalog['institution'] = detail['label'] + catalog['description'] = description + +def get_catalog_from_painting(entity): + catalog_ids = find_catalog_id(entity) + catalog_detail = [] + for property_id in sorted(catalog_ids): + value = wikibase.first_datavalue(entity, property_id) + detail = lookup(property_id, value) + catalog_detail.append(detail) + + catalog = { + 'url': wikibase.first_datavalue(entity, 'P973'), + 'detail': catalog_detail, + 'ids': catalog_ids, + } + + try: + check_catalog(entity, catalog) + except requests.exceptions.ReadTimeout: + pass + + return catalog + +def get_description_from_page(html): + root = lxml.html.fromstring(html) + div = root.find('.//div[@itemprop="description"]') + if div is not None: + 
return div.text + + div_list = root.find_class('item-description') + if len(div_list): + return div_list[0].text_content() + + meta_twitter_description = root.find('.//meta[@name="twitter:description"]') + if meta_twitter_description is None: + return + twitter_description = meta_twitter_description.get('content') + if not twitter_description: + return + twitter_description = twitter_description.strip() + + if not twitter_description: + return + + for element in root.getiterator(): + if not element.text: + continue + text = element.text.strip() + if not text: + continue + if text != twitter_description and text.startswith(twitter_description): + return text + + return twitter_description + +def get_catalog_page(property_id, value): + detail = lookup(property_id, value) + url = detail['url'] + catalog_id = value.replace('/', '_') + + filename = f'cache/{property_id}_{catalog_id}.html' + + if os.path.exists(filename): + html = open(filename, 'rb').read() + else: + r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2) + html = r.content + open(filename, 'wb').write(html) + + return html + +def get_catalog_url(url): + md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html' + filename = 'cache/' + md5_filename + + if os.path.exists(filename): + html = open(filename, 'rb').read() + else: + r = relaxed_ssl.get(url, + headers={'User-Agent': user_agent}, + timeout=2) + html = r.content + open(filename, 'wb').write(html) + + return html diff --git a/templates/item.html b/templates/item.html index dfaad7d..aa6e407 100644 --- a/templates/item.html +++ b/templates/item.html @@ -59,8 +59,8 @@ span.description { color: rgb(96, 96, 96); } {% endif %} {% endfor %} - {% if catalog_detail %} - {% for detail in catalog_detail %} + {% if catalog.detail %} + {% for detail in catalog.detail %}
{{ detail.label }}: {{ detail.value }} @@ -69,14 +69,14 @@ span.description { color: rgb(96, 96, 96); } {% endif %} - {% if catalog_url %} + {% if catalog.url %}

catalog URL: - {{ catalog_url }} + {{ catalog.url }}

{% endif %} - {% if catalog %} + {% if catalog.description or catalog.keywords %}

information from the {{ catalog.institution }} catalog

{% if catalog.description %}