sourcing/sourcing/url.py
2017-02-20 11:10:22 +00:00

43 lines
960 B
Python

import requests
from .model import Item
import os.path
import re
# Directory that contains this module; its parent is used as the project dir.
_package_dir = os.path.dirname(__file__)
project_dir = os.path.dirname(_package_dir)

# Path of the 'cache' directory directly under the project dir.
cache_location = os.path.join(project_dir, 'cache')

# Matches one or more consecutive '/' or ':' characters.
re_colon_slash = re.compile(r'[/:]+')
def url_filename(url):
    """Return *url* with each run of '/' and ':' characters collapsed to '_'.

    For example, 'http://example.com/a' becomes 'http_example.com_a'.
    All other characters are left untouched.
    """
    return re.sub(re_colon_slash, '_', url)
def get_text(url):
    """Fetch the text for *url* and return it together with some metadata.

    Returns a dict with these keys:
      'url'     -- the URL exactly as passed in
      'text'    -- the string returned by get_url(url)
      'heading' -- whatever follows the last '/' in the URL (the whole URL
                   when it contains no '/', an empty string when it ends
                   with '/')
      'length'  -- len() of the text
    """
    # assume UTF-8 (get_url decodes fetched bytes with the default codec)
    body = get_url(url)
    _, _, last_segment = url.rpartition('/')
    return dict(
        url=url,
        text=body,
        heading=last_segment,
        length=len(body),
    )
def get_url(url):
    """Return the text for *url*, using an existing Item when there is one.

    Item.from_external(url) is consulted first; when it returns a truthy
    item, that item's ``text`` attribute is returned and no HTTP request is
    made (presumably this is a previously stored copy -- confirm in .model).

    Otherwise the URL is fetched with requests and the raw response body is
    decoded with the default codec (UTF-8); bytes that cannot be decoded are
    replaced instead of raising UnicodeDecodeError.

    The file-based cache code that used to follow the final return statement
    could never execute and has been removed; runtime behaviour is unchanged.
    """
    item = Item.from_external(url)
    if item:
        return item.text

    # NOTE(review): the request has no timeout and the HTTP status is not
    # checked, so error pages are returned as text -- confirm this is intended.
    response = requests.get(url)
    return response.content.decode(errors='replace')