Get description from catalog page.
This commit is contained in:
parent
514c55cd6e
commit
faf70a8811
55
app.py
55
app.py
|
@ -3,7 +3,7 @@
|
||||||
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
|
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
|
||||||
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
|
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
|
||||||
dia, rijksmuseum, npg, museodelprado, barnesfoundation,
|
dia, rijksmuseum, npg, museodelprado, barnesfoundation,
|
||||||
wd_catalog)
|
wd_catalog, relaxed_ssl)
|
||||||
from depicts.pager import Pagination, init_pager
|
from depicts.pager import Pagination, init_pager
|
||||||
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
|
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
|
||||||
Language)
|
Language)
|
||||||
|
@ -14,6 +14,7 @@ from werkzeug.exceptions import InternalServerError
|
||||||
from werkzeug.debug.tbtools import get_current_traceback
|
from werkzeug.debug.tbtools import get_current_traceback
|
||||||
from sqlalchemy import func, distinct
|
from sqlalchemy import func, distinct
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
import hashlib
|
||||||
import requests.exceptions
|
import requests.exceptions
|
||||||
import requests
|
import requests
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
@ -400,12 +401,47 @@ def get_catalog_page(property_id, value):
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
def get_catalog_url(url):
    """Return the HTML of a catalog page, downloading it at most once.

    Pages are cached on disk under ``cache/`` in a file named after the MD5
    hex digest of the URL. When a cached copy exists it is returned without
    any network access; otherwise the page is fetched with
    ``relaxed_ssl.get`` and written to the cache before being returned.

    :param url: address of the catalog page.
    :return: the page HTML as text.
    """
    # MD5 only serves to derive a fixed-length, filesystem-safe cache key.
    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    filename = 'cache/' + md5_filename

    if os.path.exists(filename):
        # Explicit encoding so the cache does not depend on the locale.
        with open(filename, encoding='utf-8') as f:
            return f.read()

    # user_agent is a module-level name defined outside this block.
    r = relaxed_ssl.get(url,
                        headers={'User-Agent': user_agent},
                        timeout=2)
    html = r.text

    # Make sure the cache directory exists before the first write.
    os.makedirs('cache', exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)

    return html
|
||||||
|
|
||||||
def get_description_from_page(html):
    """Extract a description from the HTML of a catalog page.

    Lookup order:

    1. The text of the first ``<div itemprop="description">``, if such a div
       exists (its ``.text`` is returned as-is, even when it is ``None``).
    2. The ``content`` of ``<meta name="twitter:description">``. If some
       element on the page has text that starts with that content but is
       longer, the element's text is returned instead.
       NOTE(review): presumably the meta value is a truncated copy of the
       full description - confirm against real pages.

    :param html: page markup as a string.
    :return: the description text, or ``None`` when nothing usable is found.
    """
    root = lxml.html.fromstring(html)
    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    meta_twitter_description = root.find('.//meta[@name="twitter:description"]')
    # Pages without the meta tag used to raise AttributeError on .get().
    if meta_twitter_description is None:
        return None

    twitter_description = (meta_twitter_description.get('content') or '').strip()
    if not twitter_description:
        return None

    # iter() replaces the deprecated getiterator().
    for element in root.iter():
        text = (element.text or '').strip()
        if not text:
            continue
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description
|
||||||
|
|
||||||
@app.route("/item/Q<int:item_id>")
|
@app.route("/item/Q<int:item_id>")
|
||||||
def item_page(item_id):
|
def item_page(item_id):
|
||||||
qid = f'Q{item_id}'
|
qid = f'Q{item_id}'
|
||||||
|
@ -421,6 +457,15 @@ def item_page(item_id):
|
||||||
label = label_and_language['label']
|
label = label_and_language['label']
|
||||||
other = get_other(item.entity)
|
other = get_other(item.entity)
|
||||||
|
|
||||||
|
if 'P276' in entity['claims']:
|
||||||
|
location = first_datavalue(entity, 'P276')['id']
|
||||||
|
institution = other[location]
|
||||||
|
elif 'P195' in entity['claims']:
|
||||||
|
collection = first_datavalue(entity, 'P195')['id']
|
||||||
|
institution = other[collection]
|
||||||
|
else:
|
||||||
|
institution = '???'
|
||||||
|
|
||||||
painting_item = PaintingItem.query.get(item_id)
|
painting_item = PaintingItem.query.get(item_id)
|
||||||
if painting_item is None:
|
if painting_item is None:
|
||||||
painting_item = PaintingItem(item_id=item_id, label=label, entity=entity)
|
painting_item = PaintingItem(item_id=item_id, label=label, entity=entity)
|
||||||
|
@ -452,6 +497,14 @@ def item_page(item_id):
|
||||||
elif catalog_url and 'www.museodelprado.es' in catalog_url:
|
elif catalog_url and 'www.museodelprado.es' in catalog_url:
|
||||||
catalog = museodelprado.get_catalog(catalog_url)
|
catalog = museodelprado.get_catalog(catalog_url)
|
||||||
|
|
||||||
|
if not catalog and catalog_url:
|
||||||
|
html = get_catalog_url(catalog_url)
|
||||||
|
description = get_description_from_page(html)
|
||||||
|
catalog = {
|
||||||
|
'institution': institution,
|
||||||
|
'description': description,
|
||||||
|
}
|
||||||
|
|
||||||
if not catalog and catalog_ids:
|
if not catalog and catalog_ids:
|
||||||
for property_id in sorted(catalog_ids):
|
for property_id in sorted(catalog_ids):
|
||||||
if property_id == 'P350':
|
if property_id == 'P350':
|
||||||
|
|
16
depicts/relaxed_ssl.py
Normal file
16
depicts/relaxed_ssl.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
import requests
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
|
||||||
|
|
||||||
|
# OpenSSL cipher string handed to create_urllib3_context() by HTTPSAdapter.
# NOTE(review): "@SECLEVEL=1" presumably lowers OpenSSL's security level so
# that servers with weaker TLS parameters can still be reached - confirm.
CIPHERS = 'DEFAULT@SECLEVEL=1'
|
||||||
|
|
||||||
|
class HTTPSAdapter(HTTPAdapter):
    """Transport adapter whose pool manager uses the CIPHERS cipher string."""

    def init_poolmanager(self, *args, **kwargs):
        """Build the pool manager with an SSL context created from CIPHERS.

        All arguments are forwarded to the parent implementation; only
        ``ssl_context`` is set (replacing any value passed by the caller).
        """
        kwargs.update(ssl_context=create_urllib3_context(ciphers=CIPHERS))
        return super().init_poolmanager(*args, **kwargs)
|
||||||
|
|
||||||
|
def get(*args, **kwargs):
    """Send a GET request through a session that uses HTTPSAdapter.

    A fresh session is created per call with HTTPSAdapter mounted for
    ``https://`` URLs. All arguments are passed straight to ``Session.get``.

    :return: the response object returned by ``Session.get``.
    """
    # The context manager closes the session (and its adapters) once the
    # request has completed instead of leaving it open until it is
    # garbage-collected.
    with requests.Session() as s:
        s.mount('https://', HTTPSAdapter())
        return s.get(*args, **kwargs)
|
Loading…
Reference in a new issue