# glam-wiki-schedule/parse.py

#!/usr/bin/python3
"""Convert from wiki page schedule to XML schedule."""

import hashlib
import re
import sys
import typing
import urllib.parse
from datetime import time

import lxml.etree

# Next id to assign to an <event> element; build_event() increments it
# after every event it creates.
event_id = 1
# Companion counter for presenters; not read anywhere in the visible code
# (person ids are derived from an MD5 of the speaker name instead).
presenter_id = 1


def md5sum(s: str) -> str:
    """Return the hexadecimal MD5 digest of *s* encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()


# Styled <h2> line that introduces a day.  The parser only checks that the
# line matches; the capture group is not read.
re_day_heading = re.compile(
    r'<h2 style="border-bottom: 0px;"><b>'
    r'<span style="color: #2873b3;"> .*(.*?) *</span></b></h2>'
)

# First cell of a row holding a start time such as "| 09:30";
# groups are the two-digit hour and minute.
re_time = re.compile(r"^\| ?(\d{2}):(\d{2})")

# Session cell: optional colspan, mandatory rowspan (group 1), then the cell
# content after the second pipe (group 2).
re_session = re.compile(r'\| (?:colspan="\d+" )?rowspan="(\d+)" \|(.*)')
# Cell that starts with a colspan attribute; group 1 is the span width.
re_colspan = re.compile(r'\| colspan="(\d+)')

# Speaker line: the name is whatever sits inside <small>...</small>.
re_speaker = re.compile(r"<small>(.*)</small>")


class Session(typing.TypedDict):
    """One session parsed from a cell of the wiki schedule table."""

    # Raw cell content after the rowspan attribute; build_event() only
    # exports it when it is a wiki link.
    name: str
    # Start time read from the most recent time cell.
    start: time
    # Length in minutes: 15 for each table row the cell spans.
    duration: int
    # Name of the room whose column the cell occupies.
    room: str
    # Speaker names gathered from the <small> lines that follow the cell.
    speakers: list[str]


# Sessions grouped first by 1-based day index, then by room name.
GroupedSlots = dict[int, dict[str, list[Session]]]


# Conference-level settings as (element name, value) pairs, in the order
# they are written under <conference>.
meta = list(
    {
        "title": "GLAM Wiki 2023",
        "subtitle": "Galleries, Libraries, Archives, Museums, etc.",
        "venue": "University of the Republic of Uruguay",
        "city": "Montevideo, Uruguay",
        "start": "2023-11-16",
        "end": "2023-11-18",
        "days": 3,
        "day_change": "08:00",
        "timeslot_duration": "00:15",
        "time_zone_name": "America/Montevideo",
    }.items()
)

# Room names; the parser maps table column N to rooms[N - 1].
rooms = ["Auditorium", "Posgrado 1 (110)", "Posgrado 2 (111)", "401", "410", "411"]


def content_to_url(s: str) -> str:
    """Convert a wiki link to the URL of its target page on Meta-Wiki.

    *s* is a link such as ``[[GLAM Wiki 2023/Program/Foo|Foo]]`` or, without
    a label, ``[[GLAM Wiki 2023/Program/Foo]]``.  Spaces in the page title
    become underscores and the title is percent-encoded (``/`` is kept).

    Raises ``AssertionError`` if *s* does not start with ``[[GLAM``.
    """
    assert s.startswith("[[GLAM")
    # Cut at the closing brackets first so a link without a "|label" part
    # does not carry its "]]" into the page title.
    target = s[2:].partition("]]")[0].partition("|")[0]
    page_title = urllib.parse.quote(target.replace(" ", "_"))
    return "https://meta.wikimedia.org/wiki/" + page_title


def minutes_to_duration(mins: int) -> str:
    """Format a number of minutes as a zero-padded ``HH:MM`` string."""
    hours, minutes = divmod(mins, 60)
    return f"{hours:02d}:{minutes:02d}"


def build_event(item: Session, room: lxml.etree._Element) -> None:
    """Append an ``<event>`` element describing *item* to *room*.

    Only sessions whose name is a wiki link are exported: names that do not
    start with ``[`` (including an empty name) and ``[[Event:...]]`` links
    are skipped and nothing is added.

    The event gets the next value of the module-level ``event_id`` counter
    as its ``id`` attribute, child elements for start, duration, room,
    title, subtitle, language, description and url, and a ``<persons>``
    element with one ``<person>`` per speaker.

    Raises ``AssertionError`` (from ``content_to_url``) when the link does
    not start with ``[[GLAM``.
    """
    global event_id
    link = item["name"]
    # startswith() instead of link[0] so an empty name is skipped rather
    # than raising IndexError.
    if not link.startswith("[") or link.startswith("[[Event:"):
        return

    # "[[Page|Label]]" -> "Label"; a link without a label falls back to the
    # page title instead of producing an empty <title>.
    page, pipe, label = link.partition("|")
    name: str = label[:-2] if pipe else page[2:-2]

    # Session carries no track or abstract, so those elements are omitted.
    event_data = [
        ("start", str(item["start"])),
        ("duration", minutes_to_duration(item["duration"])),
        ("room", item["room"]),
        ("title", name),
        ("subtitle", ""),
        ("language", ""),
        ("description", ""),
        ("url", content_to_url(link)),
    ]

    # The element is created only after event_data is built, so a rejected
    # link neither consumes an id nor leaves a half-filled <event> behind.
    event = lxml.etree.SubElement(room, "event", id=str(event_id))
    event_id += 1

    for key, value in event_data:
        lxml.etree.SubElement(event, key).text = value

    persons = lxml.etree.SubElement(event, "persons")
    for speaker in item["speakers"]:
        # The person id is the MD5 of the name, so the same speaker gets the
        # same id in every event.
        lxml.etree.SubElement(persons, "person", id=md5sum(speaker)).text = speaker


def read_wikitext(filename: str) -> GroupedSlots:
    """Parse the schedule wiki page saved in *filename*.

    The page is read line by line by a small state machine; ``expect`` names
    what the parser is waiting for:

    * ``"h2"`` - the styled heading that starts the next day;
    * ``"wikidatabe"`` - the ``{| class="wikitable"`` line opening that
      day's table;
    * ``"time"`` - the first cell of a table row, normally its start time;
    * ``"heading"`` - the remainder of a row that carries no sessions,
      skipped up to the next ``|-`` row separator;
    * ``"session"`` - session cells and the speaker lines following them.

    Returns the sessions grouped by day index (1 to 4) and then by room
    name; every room has a (possibly empty) list for every day.

    Raises ``AssertionError`` when a line does not have the expected shape.
    """
    group_by_day_and_room: GroupedSlots = {
        index: {room: [] for room in rooms} for index in range(1, 5)
    }

    expect = "h2"  # state machine
    start_time = None
    day_index = 0
    col: int = 0
    current: Session | None = None
    room: str | None = None
    # Read as UTF-8 explicitly (not the locale default) and close the file
    # deterministically.
    with open(filename, encoding="utf-8") as wikitext:
        for line in wikitext:
            if expect == "h2" and line.startswith("<h2 "):
                m = re_day_heading.match(line)
                assert m
                day_index += 1
                expect = "wikidatabe"
                continue
            if expect == "wikidatabe":
                assert line.startswith('{| class="wikitable"')
                expect = "session"
                continue
            # A row separator means the next line is the row's first cell.
            if expect in ("session", "heading") and line == "|-\n":
                expect = "time"
                continue
            if expect == "time":
                # Empty or 3-row-spanning first cell: not a session row.
                if line == "|\n" or line.startswith('| rowspan="3" |'):
                    expect = "heading"
                    continue
                if "Detailed program" in line:
                    expect = "session"
                    continue
                if line[0] == "!":
                    expect = "heading"
                    continue
                m = re_time.match(line)
                if not m:
                    print(repr(line))
                assert m
                start_time = time(int(m.group(1)), int(m.group(2)))
                col = 1  # the first room column follows the time cell
                expect = "session"
                continue
            # An empty cell still occupies a room column.
            if expect == "session" and line == "|\n":
                col += 1

            if (
                expect == "session"
                and "small" in line
                and (line.startswith("''") or line.startswith("<small"))
            ):
                assert current and isinstance(current["speakers"], list)
                m = re_speaker.search(line)
                if not m:
                    print(line)
                assert m
                speaker_name = m.group(1).strip("'")
                if speaker_name.startswith("("):
                    # A parenthesised line is appended to the previous
                    # speaker rather than added as a new one.
                    assert current["speakers"]
                    current["speakers"][-1] += " " + speaker_name
                else:
                    current["speakers"].append(speaker_name)
                continue

            # NOTE(review): as written, a line starting with '| rowspan="'
            # takes this branch in every state, not only in "session"; the
            # grouping below just spells out the operator precedence the
            # code always had.  Confirm against the wiki page before
            # tightening it.
            if (expect == "session" and line.startswith("| colspan")) or (
                line.startswith('| rowspan="')
            ):
                room = rooms[col - 1]
                if line.startswith('| rowspan="'):
                    colspan = 1
                else:
                    m = re_colspan.match(line)
                    assert m
                    colspan = int(m.group(1))
                col += colspan

                # Cells without content, and coloured filler cells, take up
                # columns but are not sessions.
                if re.match(r'\| rowspan="\d" \|\n', line):
                    continue

                if line == '| colspan="1" |\n':
                    continue
                if 'style="background-color:#' in line:
                    continue
                m = re_session.match(line)
                if not m:
                    print(repr(line))
                assert m
                content = m.group(2)
                if not content:
                    continue
                # Each table row the cell spans counts as 15 minutes.
                duration = 15 * int(m.group(1))

                assert start_time
                current = {
                    "name": content,
                    "start": start_time,
                    "duration": duration,
                    "room": room,
                    "speakers": [],
                }
                group_by_day_and_room[day_index][room].append(current)

            # End of the day's table: wait for the next day heading.
            if expect == "session" and line == "|}\n":
                expect = "h2"

    return group_by_day_and_room


def generate_schedule(grouped: GroupedSlots) -> lxml.etree._Element:
    """Generate the schedule XML.

    Builds a ``<schedule>`` root holding a ``<conference>`` element filled
    from ``meta``, followed by one ``<day>`` per conference day (index 1 to
    3).  Every day contains a ``<room>`` element for each entry of
    ``rooms``, populated with the sessions from ``grouped[index][room]``.

    Returns the root element.
    """
    from datetime import date, timedelta

    root = lxml.etree.Element("schedule")
    conf = lxml.etree.SubElement(root, "conference")

    for key, value in meta:
        lxml.etree.SubElement(conf, key).text = str(value)

    # Day 1 is the conference start date from meta.  The dates were
    # previously hard-coded as 16 + index, which put the three days on
    # 17-19 November, one day after the 16-18 range declared in meta.
    first_day = date.fromisoformat(str(dict(meta)["start"]))
    days = [
        (index, (first_day + timedelta(days=index - 1)).isoformat())
        for index in range(1, 4)
    ]
    for index, d in days:
        day = lxml.etree.SubElement(root, "day")
        day.set("index", str(index))
        day.set("date", d)

        for room_name in rooms:
            room = lxml.etree.SubElement(day, "room", name=room_name)

            for slot in grouped[index][room_name]:
                build_event(slot, room)

    return root


def main(filename: str) -> None:
    """Convert the schedule wikitext in *filename* to XML on stdout."""
    sessions = read_wikitext(filename)
    schedule = generate_schedule(sessions)

    xml_bytes = lxml.etree.tostring(
        schedule, pretty_print=True, encoding="utf-8", xml_declaration=True
    )

    # Written as-is: nothing is appended after the serialized document.
    sys.stdout.write(xml_bytes.decode("utf-8"))


if __name__ == "__main__":
    # The path of the saved wikitext is required; give a usage message
    # instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} WIKITEXT_FILE")
    main(sys.argv[1])