Refactor: move more catalog code to module

Edward Betts 2019-10-09 16:15:21 +01:00
parent 8cbd0f246f
commit 6a6b5ad373
2 changed files with 41 additions and 36 deletions

app.py (39 changes)

@@ -3,8 +3,7 @@
 from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
 from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
                      dia, rijksmuseum, npg, museodelprado, barnesfoundation,
-                     wd_catalog, relaxed_ssl, human, wikibase, wikidata_oauth,
-                     parse_catalog)
+                     wd_catalog, human, wikibase, wikidata_oauth, parse_catalog)
 from depicts.pager import Pagination, init_pager
 from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
                            Language)
@@ -14,7 +13,6 @@ from werkzeug.exceptions import InternalServerError
 from werkzeug.debug.tbtools import get_current_traceback
 from sqlalchemy import func, distinct
 from collections import defaultdict
-import hashlib
 import requests.exceptions
 import requests
 import json
@@ -308,37 +306,6 @@ def image_with_cache(qid, image_filename, width):
     return detail[image_filename]
 
-def get_catalog_page(property_id, value):
-    detail = wd_catalog.lookup(property_id, value)
-    url = detail['url']
-    catalog_id = value.replace('/', '_')
-
-    filename = f'cache/{property_id}_{catalog_id}.html'
-
-    if os.path.exists(filename):
-        html = open(filename, 'rb').read()
-    else:
-        r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
-        html = r.content
-        open(filename, 'wb').write(html)
-
-    return html
-
-def get_catalog_url(url):
-    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
-    filename = 'cache/' + md5_filename
-
-    if os.path.exists(filename):
-        html = open(filename, 'rb').read()
-    else:
-        r = relaxed_ssl.get(url,
-                            headers={'User-Agent': user_agent},
-                            timeout=2)
-        html = r.content
-        open(filename, 'wb').write(html)
-
-    return html
-
 def existing_depicts_from_entity(entity):
     if 'P180' not in entity['claims']:
         return []
@@ -430,7 +397,7 @@ def item_page(item_id):
     catalog = museodelprado.get_catalog(catalog_url)
 
     if not catalog and catalog_url:
-        html = get_catalog_url(catalog_url)
+        html = parse_catalog.get_catalog_url(catalog_url)
         description = parse_catalog.get_description_from_page(html)
         if description:
             catalog = {
@@ -445,7 +412,7 @@ def item_page(item_id):
     value = wikibase.first_datavalue(entity, property_id)
     detail = wd_catalog.lookup(property_id, value)
     try:
-        html = get_catalog_page(property_id, value)
+        html = parse_catalog.get_catalog_page(property_id, value)
     except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
         continue  # ignore this error
     description = parse_catalog.get_description_from_page(html)

depicts/parse_catalog.py (38 changes)

@@ -1,4 +1,11 @@
+from depicts import wd_catalog, relaxed_ssl
 import lxml.html
+import os.path
+import requests
+import hashlib
+
+user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:32.0) Gecko/20160101 Firefox/32.0'
 
 def get_description_from_page(html):
     root = lxml.html.fromstring(html)
@@ -27,3 +34,34 @@ def get_description_from_page(html):
         return text
 
     return twitter_description
+
+def get_catalog_page(property_id, value):
+    detail = wd_catalog.lookup(property_id, value)
+    url = detail['url']
+    catalog_id = value.replace('/', '_')
+
+    filename = f'cache/{property_id}_{catalog_id}.html'
+
+    if os.path.exists(filename):
+        html = open(filename, 'rb').read()
+    else:
+        r = requests.get(url, headers={'User-Agent': user_agent}, timeout=2)
+        html = r.content
+        open(filename, 'wb').write(html)
+
+    return html
+
+def get_catalog_url(url):
+    md5_filename = hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
+    filename = 'cache/' + md5_filename
+
+    if os.path.exists(filename):
+        html = open(filename, 'rb').read()
+    else:
+        r = relaxed_ssl.get(url,
+                            headers={'User-Agent': user_agent},
+                            timeout=2)
+        html = r.content
+        open(filename, 'wb').write(html)
+
+    return html
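
For reference, a minimal usage sketch of the relocated helpers, assuming the depicts package is importable, a cache/ directory exists in the working directory, and wd_catalog.lookup() recognises the property being passed; the property ID, value, and URL below are purely illustrative, not taken from this commit:

    from depicts import parse_catalog

    # Fetch the catalogue page behind an external-ID claim; the page is cached
    # as cache/<property>_<value>.html and re-read from disk on later calls.
    html = parse_catalog.get_catalog_page('P347', '12345')  # illustrative ID/value
    description = parse_catalog.get_description_from_page(html)

    # Fetch an arbitrary catalogue URL via relaxed_ssl, cached under an MD5 of the URL.
    html = parse_catalog.get_catalog_url('https://example.org/collection/object/42')
    description = parse_catalog.get_description_from_page(html)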