Refactor: move catalog code again
This commit is contained in:
parent
312cb255c4
commit
d1ac75583b
85
app.py
85
app.py
|
@ -1,10 +1,8 @@
|
||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
|
|
||||||
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
|
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
|
||||||
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
|
from depicts import (utils, wdqs, commons, mediawiki, painting, database,
|
||||||
dia, rijksmuseum, npg, museodelprado, barnesfoundation,
|
wd_catalog, human, wikibase, wikidata_oauth, wikidata_edit)
|
||||||
wd_catalog, human, wikibase, wikidata_oauth, parse_catalog,
|
|
||||||
wikidata_edit)
|
|
||||||
from depicts.pager import Pagination, init_pager
|
from depicts.pager import Pagination, init_pager
|
||||||
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
|
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
|
||||||
Language)
|
Language)
|
||||||
|
@ -14,8 +12,6 @@ from werkzeug.exceptions import InternalServerError
|
||||||
from werkzeug.debug.tbtools import get_current_traceback
|
from werkzeug.debug.tbtools import get_current_traceback
|
||||||
from sqlalchemy import func, distinct
|
from sqlalchemy import func, distinct
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import requests.exceptions
|
|
||||||
import requests
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import locale
|
import locale
|
||||||
|
@ -208,7 +204,6 @@ def random_painting():
|
||||||
|
|
||||||
@app.route('/oauth/start')
|
@app.route('/oauth/start')
|
||||||
def start_oauth():
|
def start_oauth():
|
||||||
|
|
||||||
next_page = request.args.get('next')
|
next_page = request.args.get('next')
|
||||||
if next_page:
|
if next_page:
|
||||||
session['after_login'] = next_page
|
session['after_login'] = next_page
|
||||||
|
@ -319,6 +314,14 @@ def existing_depicts_from_entity(entity):
|
||||||
existing.append(d)
|
existing.append(d)
|
||||||
return existing
|
return existing
|
||||||
|
|
||||||
|
def get_institution(entity, other):
    """Pick the institution entry for *entity* out of *other*.

    The location claim (P276) is consulted first and the collection claim
    (P195) second.  The ``id`` of the first datavalue of whichever claim is
    present is used as the key into *other*.

    Returns ``None`` when the entity has neither claim.
    """
    claims = entity['claims']
    for property_id in ('P276', 'P195'):  # location, then collection
        if property_id not in claims:
            continue
        target_id = wikibase.first_datavalue(entity, property_id)['id']
        return other[target_id]
    return None
|
||||||
|
|
||||||
@app.route("/item/Q<int:item_id>")
|
@app.route("/item/Q<int:item_id>")
|
||||||
def item_page(item_id):
|
def item_page(item_id):
|
||||||
qid = f'Q{item_id}'
|
qid = f'Q{item_id}'
|
||||||
|
@ -342,74 +345,14 @@ def item_page(item_id):
|
||||||
|
|
||||||
people = human.from_name(label) if label else None
|
people = human.from_name(label) if label else None
|
||||||
|
|
||||||
if 'P276' in entity['claims']:
|
|
||||||
location = wikibase.first_datavalue(entity, 'P276')['id']
|
|
||||||
institution = other[location]
|
|
||||||
elif 'P195' in entity['claims']:
|
|
||||||
collection = wikibase.first_datavalue(entity, 'P195')['id']
|
|
||||||
institution = other[collection]
|
|
||||||
else:
|
|
||||||
institution = '???'
|
|
||||||
|
|
||||||
painting_item = PaintingItem.query.get(item_id)
|
painting_item = PaintingItem.query.get(item_id)
|
||||||
if painting_item is None:
|
if painting_item is None:
|
||||||
painting_item = PaintingItem(item_id=item_id, label=label, entity=entity)
|
painting_item = PaintingItem(item_id=item_id, label=label, entity=entity)
|
||||||
database.session.add(painting_item)
|
database.session.add(painting_item)
|
||||||
|
|
||||||
catalog_ids = wd_catalog.find_catalog_id(entity)
|
catalog = wd_catalog.get_catalog_from_painting(entity)
|
||||||
catalog_detail = []
|
if not catalog.get('institution'):
|
||||||
for property_id in sorted(catalog_ids):
|
catalog['institution'] = get_institution(entity, other)
|
||||||
value = wikibase.first_datavalue(entity, property_id)
|
|
||||||
detail = wd_catalog.lookup(property_id, value)
|
|
||||||
catalog_detail.append(detail)
|
|
||||||
|
|
||||||
catalog_url = wikibase.first_datavalue(entity, 'P973')
|
|
||||||
|
|
||||||
catalog = None
|
|
||||||
try:
|
|
||||||
if 'P4704' in entity['claims']:
|
|
||||||
saam_id = wikibase.first_datavalue(entity, 'P4704')
|
|
||||||
catalog = saam.get_catalog(saam_id)
|
|
||||||
elif 'P4709' in entity['claims']:
|
|
||||||
catalog_id = wikibase.first_datavalue(entity, 'P4709')
|
|
||||||
catalog = barnesfoundation.get_catalog(catalog_id)
|
|
||||||
elif catalog_url and 'www.dia.org' in catalog_url:
|
|
||||||
catalog = dia.get_catalog(catalog_url)
|
|
||||||
elif catalog_url and 'www.rijksmuseum.nl' in catalog_url:
|
|
||||||
catalog = rijksmuseum.get_catalog(catalog_url)
|
|
||||||
elif catalog_url and 'www.npg.org.uk' in catalog_url:
|
|
||||||
catalog = npg.get_catalog(catalog_url)
|
|
||||||
elif catalog_url and 'www.museodelprado.es' in catalog_url:
|
|
||||||
catalog = museodelprado.get_catalog(catalog_url)
|
|
||||||
|
|
||||||
if not catalog and catalog_url:
|
|
||||||
html = parse_catalog.get_catalog_url(catalog_url)
|
|
||||||
description = parse_catalog.get_description_from_page(html)
|
|
||||||
if description:
|
|
||||||
catalog = {
|
|
||||||
'institution': institution,
|
|
||||||
'description': description,
|
|
||||||
}
|
|
||||||
|
|
||||||
if not catalog and catalog_ids:
|
|
||||||
for property_id in sorted(catalog_ids):
|
|
||||||
if property_id == 'P350':
|
|
||||||
continue # RKDimages ID
|
|
||||||
value = wikibase.first_datavalue(entity, property_id)
|
|
||||||
detail = wd_catalog.lookup(property_id, value)
|
|
||||||
try:
|
|
||||||
html = parse_catalog.get_catalog_page(property_id, value)
|
|
||||||
except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
|
|
||||||
continue # ignore this error
|
|
||||||
description = parse_catalog.get_description_from_page(html)
|
|
||||||
if not description:
|
|
||||||
continue
|
|
||||||
catalog = {
|
|
||||||
'institution': detail['label'],
|
|
||||||
'description': description,
|
|
||||||
}
|
|
||||||
except requests.exceptions.ReadTimeout:
|
|
||||||
pass
|
|
||||||
|
|
||||||
label_languages = label_and_language['languages'] if label_and_language else []
|
label_languages = label_and_language['languages'] if label_and_language else []
|
||||||
show_translation_links = all(lang.code != 'en' for lang in label_languages)
|
show_translation_links = all(lang.code != 'en' for lang in label_languages)
|
||||||
|
@ -418,8 +361,6 @@ def item_page(item_id):
|
||||||
item_id=item_id,
|
item_id=item_id,
|
||||||
item=item,
|
item=item,
|
||||||
catalog=catalog,
|
catalog=catalog,
|
||||||
catalog_url=catalog_url,
|
|
||||||
catalog_detail=catalog_detail,
|
|
||||||
labels=find_more_props,
|
labels=find_more_props,
|
||||||
entity=item.entity,
|
entity=item.entity,
|
||||||
username=wikidata_oauth.get_username(),
|
username=wikidata_oauth.get_username(),
|
||||||
|
|
|
@ -1,71 +0,0 @@
|
||||||
from depicts import wd_catalog, relaxed_ssl
|
|
||||||
|
|
||||||
import lxml.html
|
|
||||||
import os.path
|
|
||||||
import requests
|
|
||||||
import hashlib
|
|
||||||
|
|
||||||
user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0'
|
|
||||||
|
|
||||||
def get_description_from_page(html):
    """Extract a description from the HTML of a catalog page.

    Three sources are tried in order:

    1. the ``text`` of the first ``<div itemprop="description">``;
    2. the text content of the first element with class
       ``item-description``;
    3. the ``twitter:description`` meta tag.  If some element's text starts
       with the meta description but is longer, that longer text is
       returned instead of the meta value.

    :param html: page markup as ``str`` or ``bytes``.
    :returns: the description, or ``None`` when nothing usable is found
        (including when *html* is empty or only whitespace).
    """
    if not html or not html.strip():
        # lxml raises ParserError ("Document is empty") for blank input,
        # which can happen when an empty response body was cached.
        return None

    root = lxml.html.fromstring(html)

    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    div_list = root.find_class('item-description')
    if len(div_list):
        return div_list[0].text_content()

    meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
    if meta_twitter_description is None:
        return None
    twitter_description = (meta_twitter_description.get('content') or '').strip()
    if not twitter_description:
        return None

    # iter() is the supported replacement for the deprecated getiterator().
    for element in root.iter():
        text = (element.text or '').strip()
        if not text:
            continue
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description
|
|
||||||
|
|
||||||
def get_catalog_page(property_id, value):
    """Return the raw HTML of the catalog page for a catalog identifier.

    The page URL comes from ``wd_catalog.lookup``.  Pages are cached on disk
    as ``cache/<property_id>_<value>.html`` (slashes in *value* become
    underscores); a cached copy is returned without any network access.

    :param property_id: property id of the catalog identifier.
    :param value: the identifier value (a string).
    :returns: page content as ``bytes``.
    :raises requests.exceptions.RequestException: when the download fails;
        the request uses a 2 second timeout.
    """
    detail = wd_catalog.lookup(property_id, value)
    url = detail['url']
    catalog_id = value.replace('/', '_')

    filename = f'cache/{property_id}_{catalog_id}.html'

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly.
        with open(filename, 'rb') as cached:
            return cached.read()

    r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
    html = r.content
    with open(filename, 'wb') as out:
        out.write(html)

    return html
|
|
||||||
|
|
||||||
def get_catalog_url(url):
    """Return the raw HTML found at *url*, using an on-disk cache.

    The cache file is ``cache/<md5 of the UTF-8 encoded url>.html``.  When it
    exists its bytes are returned without any network access; otherwise the
    page is fetched through ``relaxed_ssl.get`` with a 2 second timeout and
    the response body is stored before being returned.

    :param url: page address as a string.
    :returns: page content as ``bytes``.
    """
    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    filename = 'cache/' + md5_filename

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly.
        with open(filename, 'rb') as cached:
            return cached.read()

    r = relaxed_ssl.get(url,
                        headers={'User-Agent': user_agent},
                        timeout=2)
    html = r.content
    with open(filename, 'wb') as out:
        out.write(html)

    return html
|
|
|
@ -1,3 +1,13 @@
|
||||||
|
from depicts import (wikibase, relaxed_ssl, saam, dia, rijksmuseum, npg,
|
||||||
|
museodelprado, barnesfoundation)
|
||||||
|
import requests
|
||||||
|
import requests.exceptions
|
||||||
|
import lxml.html
|
||||||
|
import os.path
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0'
|
||||||
|
|
||||||
table = {
|
table = {
|
||||||
'P347': ('Joconde ID', 'https://www.pop.culture.gouv.fr/notice/joconde/$1'),
|
'P347': ('Joconde ID', 'https://www.pop.culture.gouv.fr/notice/joconde/$1'),
|
||||||
'P350': ('RKDimages ID', 'https://rkd.nl/explore/images/$1'),
|
'P350': ('RKDimages ID', 'https://rkd.nl/explore/images/$1'),
|
||||||
|
@ -108,3 +118,144 @@ def lookup(property_id, value):
|
||||||
|
|
||||||
def find_catalog_id(entity):
    """Return the property ids that are both keys of ``table`` and claims of *entity*."""
    known_properties = table.keys()
    claimed_properties = entity['claims'].keys()
    return known_properties & claimed_properties
|
||||||
|
|
||||||
|
def check_catalog(entity, catalog):
    """Add catalog information for *entity* to *catalog*, in place.

    *catalog* must already hold ``'url'`` (the catalog URL or a falsy value)
    and ``'ids'`` (the catalog property ids present on the entity).  Sources
    are tried in this order, stopping at the first that yields data:

    1. the SAAM module, when the entity has a P4704 claim;
    2. the Barnes Foundation module, when the entity has a P4709 claim;
    3. an institution module whose host name appears in the catalog URL;
    4. a description scraped from the catalog URL page;
    5. a description scraped from the page of each catalog identifier
       (P350, RKDimages ID, is skipped).

    Nothing is returned; the caller reads the result from *catalog*.

    :raises requests.exceptions.ReadTimeout: not handled here; connection and
        SSL errors are swallowed only for the per-identifier page fetches.
    """
    catalog_url = catalog['url']
    catalog_ids = catalog['ids']
    claims = entity['claims']

    if 'P4704' in claims:
        saam_id = wikibase.first_datavalue(entity, 'P4704')
        cat = saam.get_catalog(saam_id)
        if cat:
            catalog.update(cat)
            return

    if 'P4709' in claims:
        catalog_id = wikibase.first_datavalue(entity, 'P4709')
        cat = barnesfoundation.get_catalog(catalog_id)
        if cat:
            catalog.update(cat)
            return

    if catalog_url:
        institutions = [
            ('www.dia.org', dia),
            ('www.rijksmuseum.nl', rijksmuseum),
            ('www.npg.org.uk', npg),
            ('www.museodelprado.es', museodelprado),
        ]
        for host, module in institutions:
            if host not in catalog_url:
                continue
            cat = module.get_catalog(catalog_url)
            if cat:
                catalog.update(cat)
                return

        # No dedicated scraper produced data: try the generic extraction.
        html = get_catalog_url(catalog_url)
        description = get_description_from_page(html)
        if description:
            # Store the string itself; a trailing comma here would wrap it
            # in a 1-tuple.
            catalog['description'] = description
            return

    for property_id in sorted(catalog_ids):
        if property_id == 'P350':
            continue  # RKDimages ID
        value = wikibase.first_datavalue(entity, property_id)
        detail = lookup(property_id, value)
        try:
            html = get_catalog_page(property_id, value)
        except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
            continue  # ignore this error
        description = get_description_from_page(html)
        if not description:
            continue
        # Update the caller's dict; rebinding the local name would discard
        # the result.  There is no early exit, so the last identifier that
        # yields a description wins.
        catalog.update(
            institution=detail['label'],
            description=description,
        )
|
||||||
|
|
||||||
|
def get_catalog_from_painting(entity):
    """Build the catalog dict for the painting described by *entity*.

    The dict has three keys:

    * ``'url'``: first datavalue of the P973 claim;
    * ``'detail'``: one ``lookup()`` result per catalog property id, in
      sorted property-id order;
    * ``'ids'``: the catalog property ids found on the entity.

    ``check_catalog`` is then given the dict so it can add to it.  A read
    timeout during that step is ignored and the dict is returned as it
    stands.
    """
    found_ids = find_catalog_id(entity)
    details = [
        lookup(pid, wikibase.first_datavalue(entity, pid))
        for pid in sorted(found_ids)
    ]

    result = dict(
        url=wikibase.first_datavalue(entity, 'P973'),
        detail=details,
        ids=found_ids,
    )

    try:
        check_catalog(entity, result)
    except requests.exceptions.ReadTimeout:
        pass  # best effort: keep whatever was collected so far

    return result
|
||||||
|
|
||||||
|
def get_description_from_page(html):
    """Extract a description from the HTML of a catalog page.

    Three sources are tried in order:

    1. the ``text`` of the first ``<div itemprop="description">``;
    2. the text content of the first element with class
       ``item-description``;
    3. the ``twitter:description`` meta tag.  If some element's text starts
       with the meta description but is longer, that longer text is
       returned instead of the meta value.

    :param html: page markup as ``str`` or ``bytes``.
    :returns: the description, or ``None`` when nothing usable is found
        (including when *html* is empty or only whitespace).
    """
    if not html or not html.strip():
        # lxml raises ParserError ("Document is empty") for blank input,
        # which can happen when an empty response body was cached.
        return None

    root = lxml.html.fromstring(html)

    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    div_list = root.find_class('item-description')
    if len(div_list):
        return div_list[0].text_content()

    meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
    if meta_twitter_description is None:
        return None
    twitter_description = (meta_twitter_description.get('content') or '').strip()
    if not twitter_description:
        return None

    # iter() is the supported replacement for the deprecated getiterator().
    for element in root.iter():
        text = (element.text or '').strip()
        if not text:
            continue
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description
|
||||||
|
|
||||||
|
def get_catalog_page(property_id, value):
    """Return the raw HTML of the catalog page for a catalog identifier.

    The page URL comes from ``lookup``.  Pages are cached on disk as
    ``cache/<property_id>_<value>.html`` (slashes in *value* become
    underscores); a cached copy is returned without any network access.

    :param property_id: property id of the catalog identifier.
    :param value: the identifier value (a string).
    :returns: page content as ``bytes``.
    :raises requests.exceptions.RequestException: when the download fails;
        the request uses a 2 second timeout.
    """
    detail = lookup(property_id, value)
    url = detail['url']
    catalog_id = value.replace('/', '_')

    filename = f'cache/{property_id}_{catalog_id}.html'

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly.
        with open(filename, 'rb') as cached:
            return cached.read()

    r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
    html = r.content
    with open(filename, 'wb') as out:
        out.write(html)

    return html
|
||||||
|
|
||||||
|
def get_catalog_url(url):
    """Return the raw HTML found at *url*, using an on-disk cache.

    The cache file is ``cache/<md5 of the UTF-8 encoded url>.html``.  When it
    exists its bytes are returned without any network access; otherwise the
    page is fetched through ``relaxed_ssl.get`` with a 2 second timeout and
    the response body is stored before being returned.

    :param url: page address as a string.
    :returns: page content as ``bytes``.
    """
    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    filename = 'cache/' + md5_filename

    if os.path.exists(filename):
        # Context manager so the handle is closed promptly.
        with open(filename, 'rb') as cached:
            return cached.read()

    r = relaxed_ssl.get(url,
                        headers={'User-Agent': user_agent},
                        timeout=2)
    html = r.content
    with open(filename, 'wb') as out:
        out.write(html)

    return html
|
||||||
|
|
|
@ -59,8 +59,8 @@ span.description { color: rgb(96, 96, 96); }
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% if catalog_detail %}
|
{% if catalog.detail %}
|
||||||
{% for detail in catalog_detail %}
|
{% for detail in catalog.detail %}
|
||||||
<div>
|
<div>
|
||||||
<strong>{{ detail.label }}</strong>:
|
<strong>{{ detail.label }}</strong>:
|
||||||
<a href="{{ detail.url }}">{{ detail.value }}</a>
|
<a href="{{ detail.url }}">{{ detail.value }}</a>
|
||||||
|
@ -69,14 +69,14 @@ span.description { color: rgb(96, 96, 96); }
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
|
||||||
{% if catalog_url %}
|
{% if catalog.url %}
|
||||||
<p>
|
<p>
|
||||||
<strong>catalog URL</strong>:
|
<strong>catalog URL</strong>:
|
||||||
<a href="{{ catalog_url }}">{{ catalog_url }}</a>
|
<a href="{{ catalog.url }}">{{ catalog.url }}</a>
|
||||||
</p>
|
</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
{% if catalog %}
|
{% if catalog.description or catalog.keywords %}
|
||||||
<div class="mt-2">
|
<div class="mt-2">
|
||||||
<h4>information from the {{ catalog.institution }} catalog</h4>
|
<h4>information from the {{ catalog.institution }} catalog</h4>
|
||||||
{% if catalog.description %}
|
{% if catalog.description %}
|
||||||
|
|
Loading…
Reference in a new issue