Refactor: move catalog parse code to module

This commit is contained in:
Edward Betts 2019-10-09 16:11:43 +01:00
parent a5e5fa20df
commit 8cbd0f246f
2 changed files with 33 additions and 32 deletions

36
app.py
View file

@ -3,7 +3,8 @@
from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session from flask import Flask, render_template, url_for, redirect, request, g, jsonify, session
from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database, from depicts import (utils, wdqs, commons, mediawiki, painting, saam, database,
dia, rijksmuseum, npg, museodelprado, barnesfoundation, dia, rijksmuseum, npg, museodelprado, barnesfoundation,
wd_catalog, relaxed_ssl, human, wikibase, wikidata_oauth) wd_catalog, relaxed_ssl, human, wikibase, wikidata_oauth,
parse_catalog)
from depicts.pager import Pagination, init_pager from depicts.pager import Pagination, init_pager
from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem, from depicts.model import (DepictsItem, DepictsItemAltLabel, Edit, PaintingItem,
Language) Language)
@ -16,7 +17,6 @@ from collections import defaultdict
import hashlib import hashlib
import requests.exceptions import requests.exceptions
import requests import requests
import lxml.html
import json import json
import os import os
import locale import locale
@ -339,34 +339,6 @@ def get_catalog_url(url):
return html return html
def get_description_from_page(html):
    """Extract a description string from the HTML of a catalog page.

    Lookup order:

    1. If a ``<div itemprop="description">`` exists, its ``.text`` is
       returned as-is (this may be ``None`` or blank when the div has no
       leading text).
    2. Otherwise the ``content`` of ``<meta name="twitter:description">`` is
       used.  The document is then scanned for an element whose stripped text
       starts with that content but is longer; if one is found, the longer
       text is returned instead (presumably because the meta content can be
       a shortened copy of the full description).

    :param html: HTML source of the page (``str`` or ``bytes``).
    :return: The description, or ``None`` when nothing usable is found.
    """
    # lxml.html.fromstring() raises on empty documents (and on None); callers
    # already treat a falsy result as "no description", so bail out early.
    if not html or not html.strip():
        return None

    root = lxml.html.fromstring(html)

    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    meta = root.find('.//meta[@name="twitter:description"]')
    if meta is None:
        return None

    twitter_description = (meta.get('content') or '').strip()
    if not twitter_description:
        return None

    # iter() replaces the deprecated getiterator(); same traversal order.
    for element in root.iter():
        text = (element.text or '').strip()
        # An empty text can never start with the (non-empty) description,
        # so no separate emptiness check is needed here.
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description
def existing_depicts_from_entity(entity): def existing_depicts_from_entity(entity):
if 'P180' not in entity['claims']: if 'P180' not in entity['claims']:
return [] return []
@ -459,7 +431,7 @@ def item_page(item_id):
if not catalog and catalog_url: if not catalog and catalog_url:
html = get_catalog_url(catalog_url) html = get_catalog_url(catalog_url)
description = get_description_from_page(html) description = parse_catalog.get_description_from_page(html)
if description: if description:
catalog = { catalog = {
'institution': institution, 'institution': institution,
@ -476,7 +448,7 @@ def item_page(item_id):
html = get_catalog_page(property_id, value) html = get_catalog_page(property_id, value)
except (requests.exceptions.ConnectionError, requests.exceptions.SSLError): except (requests.exceptions.ConnectionError, requests.exceptions.SSLError):
continue # ignore this error continue # ignore this error
description = get_description_from_page(html) description = parse_catalog.get_description_from_page(html)
if not description: if not description:
continue continue
catalog = { catalog = {

29
depicts/parse_catalog.py Normal file
View file

@ -0,0 +1,29 @@
import lxml.html
def get_description_from_page(html):
    """Extract a description string from the HTML of a catalog page.

    Lookup order:

    1. If a ``<div itemprop="description">`` exists, its ``.text`` is
       returned as-is (this may be ``None`` or blank when the div has no
       leading text).
    2. Otherwise the ``content`` of ``<meta name="twitter:description">`` is
       used.  The document is then scanned for an element whose stripped text
       starts with that content but is longer; if one is found, the longer
       text is returned instead (presumably because the meta content can be
       a shortened copy of the full description).

    :param html: HTML source of the page (``str`` or ``bytes``).
    :return: The description, or ``None`` when nothing usable is found.
    """
    # lxml.html.fromstring() raises on empty documents (and on None); callers
    # already treat a falsy result as "no description", so bail out early.
    if not html or not html.strip():
        return None

    root = lxml.html.fromstring(html)

    div = root.find('.//div[@itemprop="description"]')
    if div is not None:
        return div.text

    meta = root.find('.//meta[@name="twitter:description"]')
    if meta is None:
        return None

    twitter_description = (meta.get('content') or '').strip()
    if not twitter_description:
        return None

    # iter() replaces the deprecated getiterator(); same traversal order.
    for element in root.iter():
        text = (element.text or '').strip()
        # An empty text can never start with the (non-empty) description,
        # so no separate emptiness check is needed here.
        if text != twitter_description and text.startswith(twitter_description):
            return text

    return twitter_description