140 lines
4.3 KiB
Python
140 lines
4.3 KiB
Python
from .url import get_url, get_text
|
|
from .parse import get_span, parse_span, parse_link, parse_sourcedoc_facet, xanadoc_span_html, span_html, get_urls
|
|
from collections import defaultdict
|
|
from html import escape
|
|
from .utils import protect_start_spaces
|
|
|
|
import re
|
|
|
|
# Removes a '#' comment (from the '#' to the end of the line) from an EDL line.
re_comment = re.compile(r'#.*')
# Matches an EDL line of the form "xanalink: <url>"; group 1 is the link URL.
re_xanalink = re.compile(r'xanalink: +([^ ]+) *$')
# A source document whose 'length' value exceeds this limit is not rendered
# to HTML by fulfil_edl_with_sources.
max_sourcedoc_size = 600000
|
|
|
|
def fulfil_edl(edl):
    """Yield the result of get_span() for every span listed in an EDL.

    The EDL text is parsed with parse_edl().  Each distinct source URL is
    fetched once with get_text() and kept in a per-call cache; the whole
    cache is handed to get_span() together with the span's URL, start and
    length.

    :param edl: EDL text, as accepted by parse_edl().
    :returns: a generator producing one get_span() result per span, in the
        order the spans appear in the EDL.
    """
    fetched = {}
    for source_url, offset, size in parse_edl(edl)['spans']:
        # Fetch lazily so a URL that occurs in several spans is only
        # requested the first time it is needed.
        if source_url not in fetched:
            fetched[source_url] = get_text(source_url)

        yield get_span(fetched, source_url, offset, size)
|
|
|
|
def parse_edl(edl_text):
    """Parse EDL text into its span pointers and xanalinks.

    Each line has any '#' comment removed and is stripped of surrounding
    whitespace; empty lines are skipped.  A line that parse_span() accepts
    is recorded as a span.  Otherwise, a line matching ``xanalink: <url>``
    is recorded as a link, and the link document is fetched immediately
    with get_url().  Lines matching neither form are ignored.

    :param edl_text: the EDL as a single string.
    :returns: a dict with two keys: 'spans', the list of truthy
        parse_span() results, and 'links', a list of
        ``{'url': ..., 'text': ...}`` dicts, both in file order.
    """
    span_pointers = []
    xanalinks = []

    for raw_line in edl_text.splitlines():
        content = re_comment.sub('', raw_line).strip()
        if not content:
            continue

        pointer = parse_span(content)
        if pointer:
            span_pointers.append(pointer)
            continue

        found = re_xanalink.match(content)
        if found is None:
            # Neither a span nor a xanalink: silently skipped.
            continue

        target = found.group(1)
        xanalinks.append({
            'url': target,
            'text': get_url(target),
        })

    return {
        'spans': span_pointers,
        'links': xanalinks,
    }
|
|
|
|
def _span_html_with_links(source_text, start, length, links):
    """Return escaped HTML for one transcluded span, with link markers.

    :param source_text: full text of the source document; ``start`` and the
        link offsets are absolute positions in this text.
    :param start: offset of the first character of the span.
    :param length: number of characters in the span.
    :param links: ``(link_start, link_len, link_num, span_type)`` tuples for
        this source document, sorted by ``link_start``.
    :returns: the escaped text of ``source_text[start:start + length]`` in
        which every part covered by a link is wrapped in
        ``<span class="xanadoclink link" id="xanalinkN">``.
    """
    span_end = start + length
    pieces = []
    pos = start
    for link_start, link_len, link_num, _span_type in links:
        if link_start >= span_end:
            break  # links are sorted by start, so none of the rest can overlap

        link_end = link_start + link_len
        if link_start < start and link_end <= start:
            continue  # lies wholly before this span

        # Clip the link to the span and to what has already been written, so
        # no text outside the span is emitted and nothing is emitted twice
        # when links overlap each other or straddle a span boundary.
        clipped_start = max(link_start, pos)
        clipped_end = min(link_end, span_end)
        if clipped_end < clipped_start:
            continue  # already covered by an earlier, overlapping link

        pieces.append(escape(source_text[pos:clipped_start]))
        pieces.append('<span class="xanadoclink link" id="xanalink{}">'.format(link_num))
        pieces.append(escape(source_text[clipped_start:clipped_end]))
        pieces.append('</span>')
        pos = clipped_end

    pieces.append(escape(source_text[pos:span_end]))
    return ''.join(pieces)


def fulfil_edl_with_sources(edl_text):
    """Build the HTML document for an EDL plus marked-up source documents.

    The EDL is parsed with parse_edl().  A link whose type is
    'HideTransclusions' adds the result of parse_sourcedoc_facet() on its
    first facet to the set of sources whose transclusions are not
    highlighted and whose source text is not rendered.  Any other link with
    exactly two facets is numbered consecutively from 0 and marked up both
    in the document and in the source documents.

    :param edl_text: the EDL as a single string.
    :returns: a dict with the keys

        * 'source' -- the get_text() results for every source referenced by
          a span or a two-facet link.  Each entry has its 'text' replaced by
          HTML, or removed altogether when the source is larger than
          max_sourcedoc_size or is listed in a HideTransclusions link.
        * 'doc' -- HTML of the assembled document.
        * 'span_count' -- number of spans in the EDL.
        * 'link_count' -- number of two-facet links.
    """
    edl = parse_edl(edl_text)
    spans = edl['spans']

    hide_transclusions = set()
    two_facet_links = []

    link_num = 0
    for link in edl['links']:
        link_detail = parse_link(link['text'])
        if link_detail['type'] == 'HideTransclusions':
            hide_transclusions.add(parse_sourcedoc_facet(link_detail['facets'][0]))
        elif len(link_detail['facets']) == 2:
            facets = [parse_span(facet[0]) for facet in link_detail['facets']]
            two_facet_links.append((link_num, facets))
            # Only two-facet links consume a number, so the ids stay
            # contiguous and agree with 'link_count'.
            link_num += 1

    source = [get_text(url) for url in get_urls(spans)]
    source_text = {s['url']: s['text'] for s in source}

    # url -> [(start, length, link number, 'link'), ...]
    source_doc_links = defaultdict(list)
    for facet_link_num, facets in two_facet_links:
        for url, start, length in facets:
            source_doc_links[url].append((start, length, facet_link_num, 'link'))
            if url in source_text:
                continue

            # A link may point into a document that no span transcludes;
            # fetch it so it can be shown as a source document as well.
            s = get_text(url)
            source.append(s)
            source_text[s['url']] = s['text']

    for links in source_doc_links.values():
        links.sort()

    spans = list(enumerate(spans))

    doc_spans = []
    for num, (url, start, length) in spans:
        highlight = url not in hide_transclusions
        span_markup = _span_html_with_links(source_text[url], start, length,
                                            source_doc_links[url])
        doc_spans.append(xanadoc_span_html(num, span_markup, url, start, length,
                                           highlight=highlight))

    doc = ''.join(doc_spans)

    for s in source:
        # The raw text is always removed from the entry; it is only replaced
        # by HTML when the source passes both checks below.
        text = protect_start_spaces(s.pop('text'))
        if s['length'] > max_sourcedoc_size:
            continue
        if s['url'] in hide_transclusions:
            continue

        source_spans = [(start, length, num, 'transclusion')
                        for num, (url, start, length) in spans
                        if url == s['url']]
        source_spans += source_doc_links[s['url']]
        source_spans.sort()

        # NOTE(review): the offsets index the output of protect_start_spaces();
        # this is only correct if that helper preserves character positions --
        # confirm.
        # NOTE(review): spans that overlap (e.g. a link inside a transclusion)
        # are emitted one after another, so the shared text appears twice.
        # Behaviour kept as-is here; fixing it needs nested or split markup and
        # depends on what span_html() emits.
        pieces = []
        pos = 0
        for start, length, num, span_type in source_spans:
            end = start + length
            pieces.append(escape(text[pos:start]))
            pieces.append(span_html(span_type, num))
            pieces.append(escape(text[start:end]))
            pieces.append('</span>')
            pos = end
        pieces.append(escape(text[pos:]))

        s['text'] = ''.join(pieces).replace('\n', '<br/>\n')

    return {
        'source': source,
        'doc': doc.replace('\n', '<br/>\n'),
        'span_count': len(spans),
        'link_count': len(two_facet_links),
    }
|