from .url import get_url, get_text
from .parse import get_span, parse_span, parse_link, parse_sourcedoc_facet, xanadoc_span_html, span_html, get_urls
from collections import defaultdict
from html import escape
from .utils import protect_start_spaces
import re
# Matches everything from the first '#' to the end of the line; parse_edl
# substitutes it away before interpreting a line.
re_comment = re.compile(r'#.*')

# Matches a line of the form "xanalink: <url>", capturing the URL (a run of
# non-space characters); trailing spaces are tolerated.
re_xanalink = re.compile(r'xanalink: +([^ ]+) *$')

# Source documents whose reported length exceeds this value are returned by
# fulfil_edl_with_sources without rendered text.
max_sourcedoc_size = 600000
def fulfil_edl(edl):
    """Yield the result of get_span() for every span pointer in an EDL.

    The EDL text is parsed with parse_edl() and its span pointers are visited
    in order.  Each distinct URL is fetched with get_text() only once; the
    fetched values are kept in a dict keyed by URL, and that dict is handed
    to get_span() together with the span's url, start and length.

    This is a generator, so nothing is parsed or fetched until iteration
    begins.
    """
    fetched = {}
    for pointer in parse_edl(edl)['spans']:
        url, start, length = pointer
        if url not in fetched:
            fetched[url] = get_text(url)
        yield get_span(fetched, url, start, length)
def parse_edl(edl_text):
    """Parse EDL text into its span pointers and links.

    Each line has everything from the first '#' onwards removed (this also
    cuts a '#' that occurs inside a URL) and is then stripped; lines left
    empty are skipped.  A line for which parse_span() returns a truthy value
    is recorded as a span.  Otherwise, a line matching re_xanalink is
    recorded as a link, and the link's URL is fetched immediately with
    get_url().  Lines matching neither form are ignored.

    Returns a dict with two keys:
      'spans': the parse_span() results, in order of appearance
      'links': dicts of the form {'url': ..., 'text': ...}, in order
    """
    spans = []
    links = []
    for raw_line in edl_text.splitlines():
        line = re_comment.sub('', raw_line).strip()
        if not line:
            continue

        span_pointer = parse_span(line)
        if span_pointer:
            spans.append(span_pointer)
            continue

        match = re_xanalink.match(line)
        if match is None:
            continue
        link_url = match.group(1)
        links.append({'url': link_url, 'text': get_url(link_url)})

    return {'spans': spans, 'links': links}
def fulfil_edl_with_sources(edl_text):
    """Render an EDL as HTML, together with its annotated source documents.

    The EDL is parsed with parse_edl().  Its links are interpreted with
    parse_link():

    * a link of type 'HideTransclusions' adds the source document named by
      its first facet to the set of documents that are neither highlighted
      in the rendered document nor rendered in the source list;
    * any other link with exactly two facets is kept as a two-facet link and
      numbered consecutively from 0.

    Returns a dict with:
      'source':     the get_text() results for every URL used by a span or by
                    a two-facet link.  Each entry's 'text' is replaced by
                    HTML in which transcluded and linked ranges are wrapped
                    in span_html() tags; entries that are hidden or whose
                    'length' exceeds max_sourcedoc_size have 'text' removed.
      'doc':        HTML for the document, one xanadoc_span_html() element
                    per span, with linked ranges wrapped in link tags.
      'span_count': number of spans in the EDL.
      'link_count': number of two-facet links.
    """
    # NOTE(review): the class/id names in link_open must agree with the
    # front-end stylesheet and script -- confirm before relying on them.
    link_open = '<span class="xanadoclink link" id="xanalink{}">'
    # assumes span_html() returns an opening <span ...> tag -- TODO confirm
    span_close = '</span>'
    line_break = '<br/>\n'

    edl = parse_edl(edl_text)
    spans = edl['spans']

    hide_transclusions = set()
    two_facet_links = []
    for link in edl['links']:
        link_detail = parse_link(link['text'])
        if link_detail['type'] == 'HideTransclusions':
            hide_transclusions.add(parse_sourcedoc_facet(link_detail['facets'][0]))
        elif len(link_detail['facets']) == 2:
            # Number two-facet links consecutively so that the numbers run
            # from 0 to link_count - 1.
            # NOTE(review): confirm the number is not meant to be the link's
            # position among *all* links in the EDL.
            link_num = len(two_facet_links)
            facets = [parse_span(facet[0]) for facet in link_detail['facets']]
            two_facet_links.append((link_num, facets))

    source = [get_text(url) for url in get_urls(spans)]
    source_text = {s['url']: s['text'] for s in source}

    # For every source document, the ranges that take part in a link.
    source_doc_links = defaultdict(list)
    for link_num, facets in two_facet_links:
        for url, start, length in facets:
            source_doc_links[url].append((start, length, link_num, 'link'))
            if url in source_text:
                continue
            # A link may point into a document that no span transcludes.
            s = get_text(url)
            source.append(s)
            source_text[s['url']] = s['text']

    for ranges in source_doc_links.values():
        ranges.sort()

    spans = list(enumerate(spans))

    doc_spans = []
    for num, (url, start, length) in spans:
        highlight = url not in hide_transclusions
        span_text = source_text[url]
        span_end = start + length
        parts = []
        pos = start
        for link_start, link_len, link_num, _ in source_doc_links[url]:
            if link_start >= span_end:
                break  # ranges are sorted: nothing further can overlap
            # Clamp the linked range to this span (and to what has already
            # been emitted) so that no text outside the span is rendered and
            # overlapping links do not repeat text.
            seg_start = max(link_start, pos)
            seg_end = min(link_start + link_len, span_end)
            if seg_start >= seg_end:
                continue
            parts.append(escape(span_text[pos:seg_start]))
            parts.append(link_open.format(link_num))
            parts.append(escape(span_text[seg_start:seg_end]))
            parts.append(span_close)
            pos = seg_end
        parts.append(escape(span_text[pos:span_end]))

        doc_spans.append(xanadoc_span_html(num, ''.join(parts), url, start, length,
                                           highlight=highlight))

    doc = ''.join(doc_spans)

    for s in source:
        # 'text' is always removed; it is put back only for rendered sources.
        raw_text = s.pop('text')
        if s['length'] > max_sourcedoc_size:
            continue
        if s['url'] in hide_transclusions:
            continue
        text = protect_start_spaces(raw_text)

        source_spans = [(start, length, num, 'transclusion')
                        for num, (url, start, length) in spans
                        if url == s['url']]
        source_spans += source_doc_links[s['url']]
        source_spans.sort()

        # NOTE(review): ranges that overlap (for example a link inside a
        # transcluded span) are emitted one after another, so the overlapped
        # text appears twice in the output.  Left unchanged here.
        parts = []
        pos = 0
        for start, length, num, span_type in source_spans:
            end = start + length
            parts.append(escape(text[pos:start]))
            parts.append(span_html(span_type, num))
            parts.append(escape(text[start:end]))
            parts.append(span_close)
            pos = end
        parts.append(escape(text[pos:]))

        s['text'] = ''.join(parts).replace('\n', line_break)

    return {
        'source': source,
        'doc': doc.replace('\n', line_break),
        'span_count': len(spans),
        'link_count': len(two_facet_links),
    }