Refactor: move catalog code again

This commit is contained in:
Edward Betts 2019-10-10 10:58:42 +01:00
parent 312cb255c4
commit d1ac75583b
4 changed files with 169 additions and 148 deletions

85
app.py
View file

@ -1,10 +1,8 @@
#!/usr/bin/python3 #!/usr/bin/python3
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database, from depicts import (utils, wdqs, commons, mediawiki, painting, database,
dia, rijksmuseum, npg, museodelprado, barnesfoundation, wd_catalog, human, wikibase, wikidata_oauth, wikidata_edit)
wd_catalog, human, wikibase, wikidata_oauth, parse_catalog,
wikidata_edit)
from depicts.pager import Pagination, init_pager from depicts.pager import Pagination, init_pager
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem, from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
Language) Language)
@ -14,8 +12,6 @@ from werkzeug.exceptions import InternalServerError
from werkzeug.debug.tbtools import get_current_traceback from werkzeug.debug.tbtools import get_current_traceback
from sqlalchemy import func, distinct from sqlalchemy import func, distinct
from collections import defaultdict from collections import defaultdict
import requests.exceptions
import requests
import json import json
import os import os
import locale import locale
@ -208,7 +204,6 @@ def random_painting():
@app.route('/oauth/start') @app.route('/oauth/start')
def start_oauth(): def start_oauth():
next_page = request.args.get('next') next_page = request.args.get('next')
if next_page: if next_page:
session['after_login'] = next_page session['after_login'] = next_page
@ -319,6 +314,14 @@ def existing_depicts_from_entity(entity):
existing.append(d) existing.append(d)
return existing return existing
def get_institution(entity, other):
    """Return the institution for a painting, looked up in ``other``.

    The location claim (P276) is preferred; the collection claim (P195) is
    used when the entity has no location.

    :param entity: Wikidata entity dict with a ``claims`` mapping.
    :param other: mapping keyed by item ID; the value for the location or
        collection item is returned (presumably its label - confirm against
        the caller).
    :returns: ``other[item_id]`` for the first matching claim, or ``'???'``
        when the entity has neither claim.
    :raises KeyError: if the referenced item is missing from ``other``.
    """
    for property_id in ('P276', 'P195'):
        if property_id in entity['claims']:
            item_id = wikibase.first_datavalue(entity, property_id)['id']
            return other[item_id]
    # Same placeholder the inline code in item_page used before this helper
    # was extracted; without it the template heading would show "None".
    return '???'
@app.route("/item/Q<int:item_id>") @app.route("/item/Q<int:item_id>")
def item_page(item_id): def item_page(item_id):
qid = f'Q{item_id}' qid = f'Q{item_id}'
@ -342,74 +345,14 @@ def item_page(item_id):
people = human.from_name(label) if label else None people = human.from_name(label) if label else None
if 'P276' in entity['claims']:
location = wikibase.first_datavalue(entity, 'P276')['id']
institution = other[location]
elif 'P195' in entity['claims']:
collection = wikibase.first_datavalue(entity, 'P195')['id']
institution = other[collection]
else:
institution = '???'
painting_item = PaintingItem.query.get(item_id) painting_item = PaintingItem.query.get(item_id)
if painting_item is None: if painting_item is None:
painting_item = PaintingItem(item_id=item_id, label=label, entity=entity) painting_item = PaintingItem(item_id=item_id, label=label, entity=entity)
database.session.add(painting_item) database.session.add(painting_item)
catalog_ids = wd_catalog.find_catalog_id(entity) catalog = wd_catalog.get_catalog_from_painting(entity)
catalog_detail = [] if not catalog.get('institution'):
for property_id in sorted(catalog_ids): catalog['institution'] = get_institution(entity, other)
value = wikibase.first_datavalue(entity, property_id)
detail = wd_catalog.lookup(property_id, value)
catalog_detail.append(detail)
catalog_url = wikibase.first_datavalue(entity, 'P973')
catalog = None
try:
if 'P4704' in entity['claims']:
saam_id = wikibase.first_datavalue(entity, 'P4704')
catalog = saam.get_catalog(saam_id)
elif 'P4709' in entity['claims']:
catalog_id = wikibase.first_datavalue(entity, 'P4709')
catalog = barnesfoundation.get_catalog(catalog_id)
elif catalog_url and 'www.dia.org' in catalog_url:
catalog = dia.get_catalog(catalog_url)
elif catalog_url and 'www.rijksmuseum.nl' in catalog_url:
catalog = rijksmuseum.get_catalog(catalog_url)
elif catalog_url and 'www.npg.org.uk' in catalog_url:
catalog = npg.get_catalog(catalog_url)
elif catalog_url and 'www.museodelprado.es' in catalog_url:
catalog = museodelprado.get_catalog(catalog_url)
if not catalog and catalog_url:
html = parse_catalog.get_catalog_url(catalog_url)
description = parse_catalog.get_description_from_page(html)
if description:
catalog = {
'institution': institution,
'description': description,
}
if not catalog and catalog_ids:
for property_id in sorted(catalog_ids):
if property_id == 'P350':
continue # RKDimages ID
value = wikibase.first_datavalue(entity, property_id)
detail = wd_catalog.lookup(property_id, value)
try:
html = parse_catalog.get_catalog_page(property_id, value)
except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
continue # ignore this error
description = parse_catalog.get_description_from_page(html)
if not description:
continue
catalog = {
'institution': detail['label'],
'description': description,
}
except requests.exceptions.ReadTimeout:
pass
label_languages = label_and_language['languages'] if label_and_language else [] label_languages = label_and_language['languages'] if label_and_language else []
show_translation_links = all(lang.code != 'en' for lang in label_languages) show_translation_links = all(lang.code != 'en' for lang in label_languages)
@ -418,8 +361,6 @@ def item_page(item_id):
item_id=item_id, item_id=item_id,
item=item, item=item,
catalog=catalog, catalog=catalog,
catalog_url=catalog_url,
catalog_detail=catalog_detail,
labels=find_more_props, labels=find_more_props,
entity=item.entity, entity=item.entity,
username=wikidata_oauth.get_username(), username=wikidata_oauth.get_username(),

View file

@ -1,71 +0,0 @@
from depicts import wd_catalog, relaxed_ssl
import lxml.html
import os.path
import requests
import hashlib
user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0'
def get_description_from_page(html):
    """Extract a description of an artwork from a catalog web page.

    Sources are tried in this order:

    1. ``<div itemprop="description">`` - its leading text is returned.
    2. The first element with class ``item-description`` - its full text
       content is returned.
    3. ``<meta name="twitter:description">`` - if some element's text starts
       with the meta description but is longer, that longer text is returned
       (presumably the meta tag holds a truncated copy); otherwise the
       stripped meta description itself.

    :param html: page markup, passed to ``lxml.html.fromstring``.
    :returns: the description, or ``None`` when no source yields one.
    """
    root = lxml.html.fromstring(html)

    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    div_list = root.find_class('item-description')
    if div_list:
        return div_list[0].text_content()

    meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
    if meta_twitter_description is None:
        return None

    twitter_description = meta_twitter_description.get('content')
    if not twitter_description:
        return None
    twitter_description = twitter_description.strip()
    if not twitter_description:
        return None

    # iter() replaces the deprecated getiterator(); both walk the tree in
    # document order.
    for element in root.iter():
        if not element.text:
            continue
        text = element.text.strip()
        if not text:
            continue
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description
def get_catalog_page(property_id, value):
    """Return the raw HTML of the catalog page for a catalog ID, cached on disk.

    The page URL comes from ``wd_catalog.lookup``. The response body is
    stored as ``cache/<property_id>_<value>.html`` (slashes in the value are
    replaced by underscores) and read back from there on later calls.

    :param property_id: Wikidata property ID of the catalog.
    :param value: catalog identifier string for the item.
    :returns: page content as bytes.
    :raises requests.exceptions.RequestException: on network failure; the
        request uses a two second timeout.
    """
    detail = wd_catalog.lookup(property_id, value)
    url = detail['url']
    catalog_id = value.replace('/', '_')
    filename = f'cache/{property_id}_{catalog_id}.html'

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly.
        with open(filename, 'rb') as f:
            return f.read()

    r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
    html = r.content
    with open(filename, 'wb') as f:
        f.write(html)
    return html
def get_catalog_url(url):
    """Return the raw HTML found at ``url``, cached on disk.

    The cache file is ``cache/<md5 of the URL>.html``. On a cache miss the
    page is fetched through ``relaxed_ssl.get`` with a two second timeout
    and the response body is written to the cache.

    :param url: catalog page URL.
    :returns: page content as bytes.
    """
    # md5 is only used to derive a filesystem-safe cache name.
    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    filename = 'cache/' + md5_filename

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly.
        with open(filename, 'rb') as f:
            return f.read()

    r = relaxed_ssl.get(url,
                        headers={'User-Agent': user_agent},
                        timeout=2)
    html = r.content
    with open(filename, 'wb') as f:
        f.write(html)
    return html

View file

@ -1,3 +1,13 @@
from depicts import (wikibase, relaxed_ssl, saam, dia, rijksmuseum, npg,
museodelprado, barnesfoundation)
import requests
import requests.exceptions
import lxml.html
import os.path
import hashlib
user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0'
table = { table = {
'P347': ('Joconde ID', 'https://www.pop.culture.gouv.fr/notice/joconde/$1'), 'P347': ('Joconde ID', 'https://www.pop.culture.gouv.fr/notice/joconde/$1'),
'P350': ('RKDimages ID', 'https://rkd.nl/explore/images/$1'), 'P350': ('RKDimages ID', 'https://rkd.nl/explore/images/$1'),
@ -108,3 +118,144 @@ def lookup(property_id, value):
def find_catalog_id(entity):
    """Return the set of property IDs from ``table`` that the entity has claims for."""
    known_properties = table.keys()
    claimed_properties = entity['claims'].keys()
    return known_properties & claimed_properties
def check_catalog(entity, catalog):
    """Fill ``catalog`` in place with information from the institution's catalog.

    Sources are tried in order; the function returns as soon as one of them
    yields data:

    1. The ``saam`` module for a P4704 claim, then the ``barnesfoundation``
       module for a P4709 claim.
    2. A dedicated parser module selected by host name in the catalog URL.
    3. A description scraped from the catalog URL page.
    4. A description scraped from the page of each catalog ID property, in
       sorted property order, skipping P350 (RKDimages ID).

    :param entity: Wikidata entity dict; its ``claims`` mapping is inspected.
    :param catalog: dict with ``url`` and ``ids`` keys; updated in place.
    :returns: ``None``.
    :raises requests.exceptions.ReadTimeout: not handled here; connection and
        SSL errors are only ignored for the catalog ID pages.
    """
    catalog_url = catalog['url']
    catalog_ids = catalog['ids']
    claims = entity['claims']

    if 'P4704' in claims:
        saam_id = wikibase.first_datavalue(entity, 'P4704')
        cat = saam.get_catalog(saam_id)
        if cat:
            catalog.update(cat)
            return

    if 'P4709' in claims:
        catalog_id = wikibase.first_datavalue(entity, 'P4709')
        cat = barnesfoundation.get_catalog(catalog_id)
        if cat:
            catalog.update(cat)
            return

    if catalog_url:
        # Host name fragment -> module providing get_catalog(url).
        institutions = [
            ('www.dia.org', dia),
            ('www.rijksmuseum.nl', rijksmuseum),
            ('www.npg.org.uk', npg),
            ('www.museodelprado.es', museodelprado),
        ]
        for host, module in institutions:
            if host not in catalog_url:
                continue
            cat = module.get_catalog(catalog_url)
            if not cat:
                continue
            catalog.update(cat)
            return

        html = get_catalog_url(catalog_url)
        description = get_description_from_page(html)
        if description:
            # Store the string itself; a trailing comma here would wrap it
            # in a one-element tuple.
            catalog['description'] = description
            return

    for property_id in sorted(catalog_ids):
        if property_id == 'P350':
            continue  # RKDimages ID
        value = wikibase.first_datavalue(entity, property_id)
        detail = lookup(property_id, value)
        try:
            html = get_catalog_page(property_id, value)
        except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
            continue  # ignore this error
        description = get_description_from_page(html)
        if not description:
            continue
        # Update the caller's dict: rebinding the local name would discard
        # the result. Stop at the first hit, like the branches above.
        catalog.update({
            'institution': detail['label'],
            'description': description,
        })
        return
def get_catalog_from_painting(entity):
    """Build the catalog dict for a painting entity.

    The result always has three keys: ``url`` (first P973 value), ``detail``
    (one ``lookup`` result per catalog ID property, in sorted property
    order) and ``ids`` (the catalog ID properties found on the entity).
    ``check_catalog`` may add further keys.

    :param entity: Wikidata entity dict.
    :returns: the catalog dict.
    """
    ids = find_catalog_id(entity)
    details = [
        lookup(pid, wikibase.first_datavalue(entity, pid))
        for pid in sorted(ids)
    ]
    url = wikibase.first_datavalue(entity, 'P973')
    result = dict(url=url, detail=details, ids=ids)

    try:
        check_catalog(entity, result)
    except requests.exceptions.ReadTimeout:
        # A slow catalog site just means no extra details are added.
        pass

    return result
def get_description_from_page(html):
    """Extract a description of an artwork from a catalog web page.

    Sources are tried in this order:

    1. ``<div itemprop="description">`` - its leading text is returned.
    2. The first element with class ``item-description`` - its full text
       content is returned.
    3. ``<meta name="twitter:description">`` - if some element's text starts
       with the meta description but is longer, that longer text is returned
       (presumably the meta tag holds a truncated copy); otherwise the
       stripped meta description itself.

    :param html: page markup, passed to ``lxml.html.fromstring``.
    :returns: the description, or ``None`` when no source yields one.
    """
    root = lxml.html.fromstring(html)

    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    div_list = root.find_class('item-description')
    if div_list:
        return div_list[0].text_content()

    meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
    if meta_twitter_description is None:
        return None

    twitter_description = meta_twitter_description.get('content')
    if not twitter_description:
        return None
    twitter_description = twitter_description.strip()
    if not twitter_description:
        return None

    # iter() replaces the deprecated getiterator(); both walk the tree in
    # document order.
    for element in root.iter():
        if not element.text:
            continue
        text = element.text.strip()
        if not text:
            continue
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description
def get_catalog_page(property_id, value):
    """Return the raw HTML of the catalog page for a catalog ID, cached on disk.

    The page URL comes from ``lookup``. The response body is stored as
    ``cache/<property_id>_<value>.html`` (slashes in the value are replaced
    by underscores) and read back from there on later calls.

    :param property_id: Wikidata property ID of the catalog.
    :param value: catalog identifier string for the item.
    :returns: page content as bytes.
    :raises requests.exceptions.RequestException: on network failure; the
        request uses a two second timeout.
    """
    detail = lookup(property_id, value)
    url = detail['url']
    catalog_id = value.replace('/', '_')
    filename = f'cache/{property_id}_{catalog_id}.html'

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly.
        with open(filename, 'rb') as f:
            return f.read()

    r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
    html = r.content
    with open(filename, 'wb') as f:
        f.write(html)
    return html
def get_catalog_url(url):
    """Return the raw HTML found at ``url``, cached on disk.

    The cache file is ``cache/<md5 of the URL>.html``. On a cache miss the
    page is fetched through ``relaxed_ssl.get`` with a two second timeout
    and the response body is written to the cache.

    :param url: catalog page URL.
    :returns: page content as bytes.
    """
    # md5 is only used to derive a filesystem-safe cache name.
    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    filename = 'cache/' + md5_filename

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly.
        with open(filename, 'rb') as f:
            return f.read()

    r = relaxed_ssl.get(url,
                        headers={'User-Agent': user_agent},
                        timeout=2)
    html = r.content
    with open(filename, 'wb') as f:
        f.write(html)
    return html

View file

@ -59,8 +59,8 @@ span.description { color: rgb(96, 96, 96); }
</div> </div>
{% endif %} {% endif %}
{% endfor %} {% endfor %}
{% if catalog_detail %} {% if catalog.detail %}
{% for detail in catalog_detail %} {% for detail in catalog.detail %}
<div> <div>
<strong>{{ detail.label }}</strong>: <strong>{{ detail.label }}</strong>:
<a href="{{ detail.url }}">{{ detail.value }}</a> <a href="{{ detail.url }}">{{ detail.value }}</a>
@ -69,14 +69,14 @@ span.description { color: rgb(96, 96, 96); }
{% endif %} {% endif %}
{% if catalog_url %} {% if catalog.url %}
<p> <p>
<strong>catalog URL</strong>: <strong>catalog URL</strong>:
<a href="{{ catalog_url }}">{{ catalog_url }}</a> <a href="{{ catalog.url }}">{{ catalog.url }}</a>
</p> </p>
{% endif %} {% endif %}
{% if catalog %} {% if catalog.description or catalog.keywords %}
<div class="mt-2"> <div class="mt-2">
<h4>information from the {{ catalog.institution }} catalog</h4> <h4>information from the {{ catalog.institution }} catalog</h4>
{% if catalog.description %} {% if catalog.description %}