#!/usr/bin/python3
"""Convert from wiki page schedule to XML schedule."""

import hashlib
import re
import sys
import typing
import urllib.parse
from datetime import date, time, timedelta

import lxml.etree

# Running counter for the ``id`` attribute of <event>; advanced by build_event().
event_id = 1
presenter_id = 1


def md5sum(s: str) -> str:
    """Generate hex md5sum of the UTF-8 encoding of *s*."""
    return hashlib.md5(s.encode("utf-8")).hexdigest()


# NOTE(review): the string literals of this pattern arrived with line breaks
# inside them and, apparently, with their markup stripped; only the text below
# survived. Restore the real pattern from version control before relying on it.
re_day_heading = re.compile(r" .*(.*?) *")
re_time = re.compile(r"^\| ?(\d{2}):(\d{2})")
re_session = re.compile(r'\| (?:colspan="\d+" )?rowspan="(\d+)" \|(.*)')
re_colspan = re.compile(r'\| colspan="(\d+)')
# NOTE(review): possibly affected by the same stripping as re_day_heading -- verify.
re_speaker = re.compile(r"(.*)")


class Session(typing.TypedDict):
    """Session."""

    # Wiki link text of the session; build_event() skips names that do not
    # start with "[" or that start with "[[Event:".
    name: str
    start: time
    # Length in minutes (formatted by minutes_to_duration()).
    duration: int
    room: str
    speakers: list[str]


# day index -> room name -> sessions in that room on that day
GroupedSlots = dict[int, dict[str, list[Session]]]

meta = [
    ("title", "GLAM Wiki 2023"),
    ("subtitle", "Galleries, Libraries, Archives, Museums, etc."),
    ("venue", "University of the Republic of Uruguay"),
    ("city", "Montevideo, Uruguay"),
    ("start", "2023-11-16"),
    ("end", "2023-11-18"),
    ("days", 3),
    ("day_change", "08:00"),
    ("timeslot_duration", "00:15"),
    ("time_zone_name", "America/Montevideo"),
]

rooms = [
    "Auditorium",
    "Posgrado 1 (110)",
    "Posgrado 2 (111)",
    "401",
    "410",
    "411",
]


def content_to_url(s: str) -> str:
    """Convert wiki link to URL.

    *s* is a wiki link such as ``[[GLAM Wiki 2023/Page|Label]]``; the part
    before ``|`` becomes the page title, with spaces turned into underscores
    and the result percent-encoded.

    Raises:
        ValueError: if *s* does not start with ``[[GLAM``.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not s.startswith("[[GLAM"):
        raise ValueError(f"not a GLAM wiki link: {s!r}")
    # A link without a "|label" part still carries its closing brackets here.
    target = s.partition("|")[0][2:].removesuffix("]]")
    page_title = urllib.parse.quote(target.replace(" ", "_"))
    return "https://meta.wikimedia.org/wiki/" + page_title


def minutes_to_duration(mins: int) -> str:
    """Convert minutes to duration string (``HH:MM``)."""
    return f"{mins // 60:02d}:{mins % 60:02d}"


def build_event(item: Session, room: lxml.etree._Element) -> None:
    """Build an event element.

    Appends an <event> child (with its <persons>) to *room* and advances the
    module-level ``event_id``. Sessions whose name is not a wiki link, or is
    an ``[[Event:`` link, are skipped and produce no element.
    """
    global event_id

    link = item["name"]
    # startswith() rather than link[0] so an empty name is skipped, not a crash.
    if not link.startswith("[") or link.startswith("[[Event:"):
        return

    # Label part of "[[target|label]]", minus the closing "]]".
    name: str = link.partition("|")[2][:-2]

    event_data = [
        ("start", str(item["start"])),
        ("duration", minutes_to_duration(item["duration"])),
        ("room", item["room"]),
        ("title", name),
        ("subtitle", ""),
        # ("track", item["track"] or ""),
        ("language", ""),
        # ("abstract", item["abstract"]),
        ("description", ""),
        ("url", content_to_url(link)),
    ]

    event = lxml.etree.SubElement(room, "event", id=str(event_id))
    event_id += 1
    for key, value in event_data:
        lxml.etree.SubElement(event, key).text = value

    persons = lxml.etree.SubElement(event, "persons")
    for speaker in item["speakers"]:
        # Person ids are the md5 of the speaker string.
        lxml.etree.SubElement(persons, "person", id=md5sum(speaker)).text = speaker


def read_wikitext(filename: str) -> GroupedSlots:
    """Parse schedule wiki page.

    NOTE(review): only the opening of this function survived. The source
    breaks off inside the first branch of the parsing loop::

        for line in open(filename):
            if expect == "h2" and line.startswith("

    Everything after that point is missing, so the function raises instead of
    guessing at the lost logic. Restore the body from version control.

    Raises:
        NotImplementedError: always, until the parsing loop is restored.
    """
    group_by_day_and_room: GroupedSlots = {
        index: {room: [] for room in rooms} for index in range(1, 5)
    }

    # Recovered initial state of the line-by-line state machine.
    expect = "h2"  # state machine
    start_time = None
    day_index = 0
    col: int = 0
    current: Session | None = None
    room: str | None = None

    raise NotImplementedError(
        "read_wikitext: the parsing loop is missing from this copy of the "
        "source; restore it from version control"
    )


# NOTE(review): the ``def`` line of this function was missing from the source;
# the name and parameter are reconstructed from the call in main(), the use of
# ``grouped`` in the body and the surviving return annotation. Confirm against
# version control.
def generate_schedule(grouped: GroupedSlots) -> lxml.etree._Element:
    """Generate the schedule XML.

    Builds <schedule> with a <conference> header taken from ``meta`` and one
    <day> per conference day, each holding a <room> per entry of ``rooms``
    filled through build_event().
    """
    root = lxml.etree.Element("schedule")
    conf = lxml.etree.SubElement(root, "conference")
    for key, value in meta:
        lxml.etree.SubElement(conf, key).text = str(value)

    # Day 1 is the conference start date. The previous hard-coded
    # ``16 + index`` produced 17..19 although meta says 2023-11-16..18.
    conference = dict(meta)
    first_day = date.fromisoformat(str(conference["start"]))
    day_count = int(conference["days"])

    for index in range(1, day_count + 1):
        day_date = first_day + timedelta(days=index - 1)
        day = lxml.etree.SubElement(root, "day")
        day.set("index", str(index))
        day.set("date", day_date.isoformat())
        for room_name in rooms:
            room = lxml.etree.SubElement(day, "room", name=room_name)
            for slot in grouped[index][room_name]:
                build_event(slot, room)

    return root


def main(filename: str) -> None:
    """Convert the wiki schedule page in *filename* to XML and print it."""
    root = generate_schedule(read_wikitext(filename))
    as_xml = lxml.etree.tostring(
        root, xml_declaration=True, encoding="utf-8", pretty_print=True
    )
    print(as_xml.decode("utf-8"), end="")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} WIKITEXT_FILE")
    main(sys.argv[1])