Initial commit
This commit is contained in:
commit
f8454fa295
274
parse.py
Executable file
274
parse.py
Executable file
|
@ -0,0 +1,274 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import typing
|
||||||
|
import urllib.parse
|
||||||
|
from datetime import time
|
||||||
|
|
||||||
|
import lxml.etree
|
||||||
|
|
||||||
|
# Running counters, both starting at 1.  build_event() stamps event_id on each
# <event> element and then increments it; presenter_id is not referenced
# anywhere else in the visible code.
event_id = presenter_id = 1
|
||||||
|
|
||||||
|
|
||||||
|
def md5sum(s: str) -> str:
    """Return the hexadecimal MD5 digest of *s* encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
# Heading line that opens each day's programme.  read_wikitext() only checks
# that the line matches; the capture group is never read.
re_day_heading = re.compile(
    r'<h2 style="border-bottom: 0px;"><b>'
    r'<span style="color: #2873b3;"> .*(.*?) *</span></b></h2>'
)

# Start time at the beginning of a table cell, e.g. "| 09:30": hours, minutes.
re_time = re.compile(r"^\| ?(\d{2}):(\d{2})")

# Not referenced elsewhere in the visible code; looks like a sample session
# page URL on Meta-Wiki.
url = "https://meta.wikimedia.org/wiki/GLAM_Wiki_2023/Program/The_CC_Certificate_for_GLAM:_learn_about_it_by_becoming_part_of_a_human_sculpture_collection"

# Earlier pattern, kept for reference:
# re_session = re.compile(r'\| colspan="\d+" rowspan="\d+" \|\[\[(.*)\|(.*)\]\]')

# Session cell: group 1 is the rowspan value, group 2 the rest of the line.
re_session = re.compile(r'\| (?:colspan="\d+" )?rowspan="(\d+)" \|(.*)')

# Leading colspan attribute of a table cell.
re_colspan = re.compile(r'\| colspan="(\d+)')

# Speaker text wrapped in <small>...</small>.
re_speaker = re.compile(r"<small>(.*)</small>")
|
||||||
|
|
||||||
|
|
||||||
|
class Session(typing.TypedDict):
    """One programme entry read from the wikitext schedule table."""

    # Text of the table cell, normally a wiki link such as "[[Page|Label]]".
    name: str
    # Start time taken from the table row the cell belongs to.
    start: time
    # Length in minutes.
    duration: int
    # Name of the room the session takes place in.
    room: str
    # Speaker names collected for this session.
    speakers: list[str]


# Sessions for one day, keyed by room name.
_SessionsByRoom = dict[str, list[Session]]

# All sessions, keyed by day index and then by room name.
GroupedSlots = dict[int, _SessionsByRoom]
|
||||||
|
|
||||||
|
|
||||||
|
def content_to_url(s: str) -> str:
    """Return the Meta-Wiki URL of the page a wikitext link points to.

    *s* is a wiki link such as ``[[GLAM Wiki 2023/Program/Foo|Foo]]``.  The
    page title is the text after ``[[`` up to the first ``|`` (or up to the
    closing ``]]`` when the link has no label).  Spaces are replaced by
    underscores and the title is percent-encoded.

    Raises:
        ValueError: if *s* does not start with ``[[GLAM``.
    """
    # Explicit check rather than ``assert`` so it still runs under ``-O``; the
    # offending text travels in the exception message.
    if not s.startswith("[[GLAM"):
        raise ValueError(f"not a GLAM wiki link: {s!r}")

    # Cut at "|" for piped links and at "]]" for un-piped ones, so the closing
    # brackets never leak into the URL.
    target = s[2:].partition("|")[0].partition("]]")[0]
    page_title = urllib.parse.quote(target.replace(" ", "_"))
    return "https://meta.wikimedia.org/wiki/" + page_title
|
||||||
|
|
||||||
|
|
||||||
|
# Conference-level fields.  generate_schedule() writes every (tag, value) pair
# as a child element of <conference>, converting the value with str().
meta = [
    ("title", "GLAM Wiki 2023"),
    ("subtitle", "Galleries, Libraries, Archives, Museums, etc."),
    ("venue", "University of the Republic of Uruguay"),
    ("city", "Montevideo, Uruguay"),
    ("start", "2023-11-16"),
    ("end", "2023-11-18"),
    ("days", 3),
    ("day_change", "08:00"),
    ("timeslot_duration", "00:15"),
    ("time_zone_name", "America/Montevideo"),
]

# Room names in table-column order: read_wikitext() looks a room up with
# rooms[col - 1].
rooms = [
    "Auditorium",
    "Posgrado 1 (110)",
    "Posgrado 2 (111)",
    "401",
    "410",
    "411",
]
|
||||||
|
|
||||||
|
|
||||||
|
def minutes_to_duration(mins: int) -> str:
    """Format a number of minutes as a zero-padded ``HH:MM`` string."""
    hours, minutes = divmod(mins, 60)
    return f"{hours:02d}:{minutes:02d}"
|
||||||
|
|
||||||
|
|
||||||
|
def build_event(item: Session, room: lxml.etree._Element) -> None:
    """Append an ``<event>`` element describing *item* to *room*.

    Entries whose name does not start with ``[`` (not a wiki link) or starts
    with ``[[Event:`` are skipped and no element is created.  Every event that
    is created takes its ``id`` from the module-level ``event_id`` counter,
    which is then incremented.

    Args:
        item: The parsed session.
        room: Parent element that receives the new ``<event>`` child.
    """
    global event_id

    link = item["name"]
    # startswith() also copes with an empty name, which indexing [0] would not.
    if not link.startswith("[") or link.startswith("[[Event:"):
        return None

    target, has_label, label = link.partition("|")
    # Piped link: the label without its closing "]]".  Un-piped link: fall back
    # to the page title so the event does not end up with an empty title.
    name: str = label[:-2] if has_label else target[2:-2]

    event_data = [
        ("start", str(item["start"])),
        ("duration", minutes_to_duration(item["duration"])),
        ("room", item["room"]),
        ("title", name),
        ("subtitle", ""),
        # ("track", item["track"] or ""),
        ("language", ""),
        # ("abstract", item["abstract"]),
        ("description", ""),
        ("url", content_to_url(link)),
    ]

    event = lxml.etree.SubElement(room, "event", id=str(event_id))
    event_id += 1

    for key, value in event_data:
        lxml.etree.SubElement(event, key).text = value

    # One <person> per speaker; the id is the MD5 hex digest of the name.
    persons = lxml.etree.SubElement(event, "persons")
    for speaker in item["speakers"]:
        lxml.etree.SubElement(persons, "person", id=md5sum(speaker)).text = speaker
|
||||||
|
|
||||||
|
|
||||||
|
def read_wikitext(filename: str) -> GroupedSlots:
    """Parse the programme wikitext in *filename* into sessions per day and room.

    The file is read line by line through a small state machine; ``expect``
    names the kind of line looked for next ("h2", "wikidatabe", "session",
    "time" or "heading").  Each day heading increments the day index, each time
    cell sets the start time for the cells that follow, and each session cell
    becomes a ``Session`` whose duration is 15 minutes per spanned row.

    Returns:
        A mapping of day index (1 to 4) to room name to the list of sessions
        found for that day and room.

    Raises:
        AssertionError: if a line does not have the layout expected in the
            current state.
    """
    group_by_day_and_room: GroupedSlots = {
        index: {room_name: [] for room_name in rooms} for index in range(1, 5)
    }

    expect = "h2"  # state machine
    start_time: time | None = None
    day_index = 0
    col: int = 0
    current: Session | None = None
    room: str | None = None

    # The context manager closes the file even when a layout assertion fails.
    with open(filename, encoding="utf-8") as wikitext:
        for line in wikitext:
            if expect == "h2" and line.startswith("<h2 "):
                assert re_day_heading.match(line), f"unexpected day heading: {line!r}"
                day_index += 1
                expect = "wikidatabe"
                continue

            if expect == "wikidatabe":
                # The line right after a day heading must open the table.
                assert line.startswith('{| class="wikitable"'), repr(line)
                expect = "session"
                continue

            # A row separator means the next line is the row's first cell.
            if expect in ("session", "heading") and line == "|-\n":
                expect = "time"
                continue

            if expect == "time":
                if line == "|\n" or line.startswith('| rowspan="3" |'):
                    expect = "heading"
                    continue
                if "Detailed program" in line:
                    expect = "session"
                    continue
                if line[0] == "!":
                    expect = "heading"
                    continue
                m = re_time.match(line)
                assert m, f"unrecognised time cell: {line!r}"
                start_time = time(int(m.group(1)), int(m.group(2)))
                col = 1  # the cells after the time cell start at the first room
                expect = "session"
                continue

            # An empty cell still occupies one room column.
            if expect == "session" and line == "|\n":
                col += 1

            if (
                expect == "session"
                and "small" in line
                and (line.startswith("''") or line.startswith("<small"))
            ):
                assert current and isinstance(current["speakers"], list)
                m = re_speaker.search(line)
                assert m, f"unrecognised speaker line: {line!r}"
                speaker_name = m.group(1).strip("'")
                if speaker_name.startswith("("):
                    # A parenthesised line is appended to the previous speaker.
                    assert current["speakers"]
                    current["speakers"][-1] += " " + speaker_name
                else:
                    current["speakers"].append(speaker_name)
                continue

            # NOTE(review): `and` binds tighter than `or`, so rowspan cells are
            # handled whatever the state.  Parenthesised to make the existing
            # behaviour explicit -- confirm this is intended.
            if (
                expect == "session" and line.startswith("| colspan")
            ) or line.startswith('| rowspan="'):
                room = rooms[col - 1]
                if line.startswith('| rowspan="'):
                    colspan = 1
                else:
                    m = re_colspan.match(line)
                    assert m, f"unrecognised colspan cell: {line!r}"
                    colspan = int(m.group(1))
                col += colspan

                # Cells without session content only advance the column.
                if re.match(r'\| rowspan="\d" \|\n', line):
                    continue
                if line == '| colspan="1" |\n':
                    continue
                if 'style="background-color:#' in line:
                    continue

                m = re_session.match(line)
                assert m, f"unrecognised session cell: {line!r}"
                content = m.group(2)
                if not content:
                    continue
                duration = 15 * int(m.group(1))  # one table row per 15 minutes

                assert start_time
                current = {
                    "name": content,
                    "start": start_time,
                    "duration": duration,
                    "room": room,
                    "speakers": [],
                }
                group_by_day_and_room[day_index][room].append(current)

            # End of the table: look for the next day heading.
            if expect == "session" and line == "|}\n":
                expect = "h2"

    return group_by_day_and_room
|
||||||
|
|
||||||
|
|
||||||
|
def generate_schedule(grouped: GroupedSlots) -> lxml.etree._Element:
    """Generate the schedule XML tree.

    The root ``<schedule>`` element gets a ``<conference>`` child holding every
    pair from ``meta``, followed by one ``<day>`` element per conference day.
    Each day contains one ``<room>`` element per entry in ``rooms``, filled
    with the events built from ``grouped[day index][room name]``.

    Args:
        grouped: Sessions keyed by 1-based day index and then by room name.

    Returns:
        The root ``<schedule>`` element.
    """
    # Local import: only ``time`` is imported from datetime at module level.
    from datetime import date, timedelta

    root = lxml.etree.Element("schedule")
    conf = lxml.etree.SubElement(root, "conference")

    for key, value in meta:
        lxml.etree.SubElement(conf, key).text = str(value)

    # Take the dates from the conference metadata so day 1 falls on the
    # declared start date (the previous hard-coded formula was one day late).
    conference = dict(meta)
    first_day = date.fromisoformat(str(conference["start"]))
    day_count = int(conference["days"])

    for index in range(1, day_count + 1):
        day = lxml.etree.SubElement(root, "day")
        day.set("index", str(index))
        day.set("date", (first_day + timedelta(days=index - 1)).isoformat())

        for room_name in rooms:
            room = lxml.etree.SubElement(day, "room", name=room_name)

            for slot in grouped[index][room_name]:
                build_event(slot, room)

    return root
|
||||||
|
|
||||||
|
|
||||||
|
def main(filename: str) -> None:
    """Convert the wikitext schedule in *filename* to XML on standard output."""
    grouped = read_wikitext(filename)
    schedule = generate_schedule(grouped)

    xml_bytes = lxml.etree.tostring(
        schedule,
        pretty_print=True,
        xml_declaration=True,
        encoding="utf-8",
    )

    # The serialised bytes are decoded and printed without an extra newline.
    print(xml_bytes.decode("utf-8"), end="")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Exit with a usage message instead of an IndexError traceback when the
    # wikitext filename is missing; extra arguments are still ignored.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} WIKITEXT_FILE")
    main(sys.argv[1])
|
Loading…
Reference in a new issue