Initial commit
This commit is contained in:
commit
f8454fa295
274
parse.py
Executable file
274
parse.py
Executable file
|
@ -0,0 +1,274 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
import sys
|
||||
import typing
|
||||
import urllib.parse
|
||||
from datetime import time
|
||||
|
||||
import lxml.etree
|
||||
|
||||
# Next value for the ``id`` attribute of an <event> element; build_event()
# reads it and increments it after each event it emits.
event_id = 1
# NOTE(review): not referenced anywhere else in the visible file.
presenter_id = 1
|
||||
|
||||
|
||||
def md5sum(s: str) -> str:
    """Return the MD5 digest of the UTF-8 encoding of ``s`` as hex text."""
    hasher = hashlib.md5()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()
|
||||
|
||||
|
||||
# Day heading line: an <h2> wrapping a bold, coloured <span>.
# read_wikitext() only checks that the line matches; the capture group is
# not read by any code in this file.
re_day_heading = re.compile(
    r'<h2 style="border-bottom: 0px;"><b>'
    + r'<span style="color: #2873b3;"> .*(.*?) *</span></b></h2>'
)

# Table cell that begins with a clock time such as "| 09:30";
# group 1 is the hour, group 2 the minute.
re_time = re.compile(r"^\| ?(\d{2}):(\d{2})")
# NOTE(review): looks like a sample session page URL; it is not referenced
# anywhere else in the visible file.
url = "https://meta.wikimedia.org/wiki/GLAM_Wiki_2023/Program/The_CC_Certificate_for_GLAM:_learn_about_it_by_becoming_part_of_a_human_sculpture_collection"

# re_session = re.compile(r'\| colspan="\d+" rowspan="\d+" \|\[\[(.*)\|(.*)\]\]')
# Session cell: optional colspan, mandatory rowspan (group 1), then the cell
# content (group 2, the rest of the line).
re_session = re.compile(r'\| (?:colspan="\d+" )?rowspan="(\d+)" \|(.*)')
# Leading colspan attribute of a cell; group 1 is the number of columns.
re_colspan = re.compile(r'\| colspan="(\d+)')

# Speaker text wrapped in <small>...</small>; group 1 is the inner text.
re_speaker = re.compile(r"<small>(.*)</small>")
|
||||
|
||||
|
||||
class Session(typing.TypedDict):
    """One programme cell parsed from the wikitext table by read_wikitext()."""

    # Raw cell content; build_event() expects a "[[target|label]]" wikilink.
    name: str
    # Start time taken from the time cell of the table row.
    start: time
    # Length in minutes (read_wikitext() stores 15 * the cell's rowspan).
    duration: int
    # Room name, picked from the module-level ``rooms`` list by column.
    room: str
    # Speaker names collected from the <small> lines that follow the cell.
    speakers: list[str]
|
||||
|
||||
|
||||
# Day index (1-based) -> room name -> sessions in the order they were parsed.
GroupedSlots = dict[int, dict[str, list[Session]]]
|
||||
|
||||
|
||||
def content_to_url(s: str) -> str:
    """Return the meta.wikimedia.org URL for the wikilink at the start of ``s``.

    ``s`` is table cell content that begins with a wikilink, either
    ``[[target|label]]`` or ``[[target]]``.  Spaces in the target become
    underscores and the result is percent-encoded (``/`` is kept).

    Raises:
        ValueError: if ``s`` does not start with ``[[GLAM``.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``
    # and the offending text ends up in the error, not in the XML on stdout.
    if not s.startswith("[[GLAM"):
        raise ValueError(f"not a GLAM wikilink: {s!r}")
    # Cut at the closing brackets first so a link without a "|label" part
    # does not carry its trailing "]]" into the page title.
    link_body = s[2:].partition("]]")[0]
    target = link_body.partition("|")[0]
    page_title = urllib.parse.quote(target.replace(" ", "_"))
    return "https://meta.wikimedia.org/wiki/" + page_title
|
||||
|
||||
|
||||
# Conference-level fields; generate_schedule() writes each pair as a child
# element of <conference>, converting the value with str().
meta = [
    ("title", "GLAM Wiki 2023"),
    ("subtitle", "Galleries, Libraries, Archives, Museums, etc."),
    ("venue", "University of the Republic of Uruguay"),
    ("city", "Montevideo, Uruguay"),
    ("start", "2023-11-16"),
    ("end", "2023-11-18"),
    ("days", 3),
    ("day_change", "08:00"),
    ("timeslot_duration", "00:15"),
    ("time_zone_name", "America/Montevideo"),
]

# Room names in table-column order: read_wikitext() maps column ``col`` to
# ``rooms[col - 1]``, so the order here is significant.
rooms = [
    "Auditorium",
    "Posgrado 1 (110)",
    "Posgrado 2 (111)",
    "401",
    "410",
    "411",
]
|
||||
|
||||
|
||||
def minutes_to_duration(mins: int) -> str:
    """Format a number of minutes as a zero-padded ``HH:MM`` string."""
    hours, remainder = divmod(mins, 60)
    return f"{hours:02d}:{remainder:02d}"
|
||||
|
||||
|
||||
def build_event(item: Session, room: lxml.etree._Element) -> None:
    """Append an ``<event>`` element describing ``item`` to ``room``.

    Sessions whose name does not start with ``[`` (i.e. is not a wikilink),
    or whose link starts with ``[[Event:``, are skipped and nothing is added.
    Each emitted event takes the current value of the module-level
    ``event_id`` counter as its ``id``, and the counter is then incremented.

    Args:
        item: the parsed session.
        room: the ``<room>`` element that receives the new ``<event>`` child.
    """
    global event_id

    link = item["name"]
    # startswith() rather than link[0] so an empty name is skipped instead of
    # raising IndexError.
    if not link.startswith("[") or link.startswith("[[Event:"):
        return None
    # Label part of "[[target|label]]": text after the pipe minus the "]]".
    name: str = link.partition("|")[2][:-2]
    event_data = [
        ("start", str(item["start"])),
        ("duration", minutes_to_duration(item["duration"])),
        ("room", item["room"]),
        ("title", name),
        ("subtitle", ""),
        ("language", ""),
        ("description", ""),
        ("url", content_to_url(link)),
    ]

    # The element is created only after event_data is built, so a failure in
    # content_to_url() leaves neither a partial event nor a consumed id.
    event = lxml.etree.SubElement(room, "event", id=str(event_id))
    event_id += 1

    for key, value in event_data:
        lxml.etree.SubElement(event, key).text = value

    persons = lxml.etree.SubElement(event, "persons")
    for s in item["speakers"]:
        # Person ids are the MD5 of the speaker text, so the same text always
        # yields the same id.
        lxml.etree.SubElement(persons, "person", id=md5sum(s)).text = s
|
||||
|
||||
|
||||
def read_wikitext(filename: str) -> GroupedSlots:
    """Parse the programme wikitext in ``filename`` into sessions by day and room.

    The file is read line by line with a small state machine (``expect``):

    * ``"h2"``         - waiting for a day heading (``<h2 ...>``).
    * ``"wikidatabe"`` - the next line must open the wikitable.
    * ``"session"``    - inside a table row, reading session/speaker cells.
    * ``"time"``       - just after a row separator (``|-``); the next line
      is a time cell, a header cell or an empty cell.
    * ``"heading"``    - skipping until the next row separator.

    Each session's duration is 15 minutes times the ``rowspan`` of its cell,
    and its room is chosen from the module-level ``rooms`` list by column.

    Args:
        filename: path to the saved wikitext, read as UTF-8.

    Returns:
        A mapping of day index (1-4) to room name to the list of sessions
        found, in file order.

    Raises:
        AssertionError: when a line does not have the shape the current
            state requires (the offending line is printed first).
    """
    group_by_day_and_room: GroupedSlots = {
        index: {room: [] for room in rooms} for index in range(1, 5)
    }

    expect = "h2"  # state machine
    start_time = None
    day_index = 0
    col: int = 0
    current: Session | None = None
    room: str | None = None
    # Context manager closes the handle; explicit encoding avoids depending
    # on the locale's default.
    with open(filename, encoding="utf-8") as wikitext:
        for line in wikitext:
            if expect == "h2" and line.startswith("<h2 "):
                m = re_day_heading.match(line)
                assert m
                day_index += 1
                expect = "wikidatabe"
                continue
            if expect == "wikidatabe":
                assert line.startswith('{| class="wikitable"')
                expect = "session"
                continue
            if expect in ("session", "heading") and line == "|-\n":
                # Row separator: the next line starts a new table row.
                expect = "time"
                continue
            if expect == "time":
                if line == "|\n" or line.startswith('| rowspan="3" |'):
                    expect = "heading"
                    continue
                if "Detailed program" in line:
                    expect = "session"
                    continue
                if line[0] == "!":
                    # Table header cell.
                    expect = "heading"
                    continue
                m = re_time.match(line)
                if not m:
                    print(repr(line))
                assert m
                start_time = time(int(m.group(1)), int(m.group(2)))
                col = 1  # the first session cell of the row is column 1
                expect = "session"
                continue
            if expect == "session" and line == "|\n":
                # Empty cell: nothing scheduled in this column.
                col += 1

            if (
                expect == "session"
                and "small" in line
                and (line.startswith("''") or line.startswith("<small"))
            ):
                # Speaker line belonging to the most recently parsed session.
                assert current and isinstance(current["speakers"], list)
                m = re_speaker.search(line)
                if not m:
                    print(line)
                assert m
                speaker_name = m.group(1).strip("'")
                if speaker_name.startswith("("):
                    # Parenthesised text is appended to the previous speaker.
                    assert current["speakers"]
                    current["speakers"][-1] += " " + speaker_name
                else:
                    current["speakers"].append(speaker_name)
                continue

            # NOTE(review): ``and`` binds tighter than ``or``, so rowspan
            # cells are handled in every state that reaches this point, not
            # only in "session".  The parentheses make that existing
            # behaviour explicit; confirm whether it is intended.
            if (
                expect == "session" and line.startswith("| colspan")
            ) or line.startswith('| rowspan="'):
                room = rooms[col - 1]
                if line.startswith('| rowspan="'):
                    colspan = 1
                else:
                    m = re_colspan.match(line)
                    assert m
                    colspan = int(m.group(1))
                col += colspan

                # Cells with no session content only advance the column.
                if re.match(r'\| rowspan="\d" \|\n', line):
                    continue
                if line == '| colspan="1" |\n':
                    continue
                if 'style="background-color:#' in line:
                    continue
                m = re_session.match(line)
                if not m:
                    print(repr(line))
                assert m
                content = m.group(2)
                if not content:
                    continue
                duration = 15 * int(m.group(1))

                assert start_time
                current = {
                    "name": content,
                    "start": start_time,
                    "duration": duration,
                    "room": room,
                    "speakers": [],
                }
                group_by_day_and_room[day_index][room].append(current)

            if expect == "session" and line == "|}\n":
                # End of the table: wait for the next day heading.
                expect = "h2"
                continue

    return group_by_day_and_room
|
||||
|
||||
|
||||
def generate_schedule(grouped: GroupedSlots) -> lxml.etree._Element:
    """Build the ``<schedule>`` XML tree from the grouped sessions.

    The tree has one ``<conference>`` element filled from the module-level
    ``meta`` list, followed by one ``<day>`` per conference day.  Every day
    contains a ``<room>`` for each entry of ``rooms``, and build_event() adds
    that room's sessions.

    Args:
        grouped: sessions keyed by 1-based day index, then by room name.

    Returns:
        The root ``<schedule>`` element.
    """
    from datetime import date, timedelta

    root = lxml.etree.Element("schedule")
    conf = lxml.etree.SubElement(root, "conference")

    for key, value in meta:
        lxml.etree.SubElement(conf, key).text = str(value)

    # Day dates come from the conference metadata so that day 1 is the
    # "start" date; the old ``16 + index`` formula made every day one date
    # too late (17-19 November instead of 16-18).
    conference = dict(meta)
    first_day = date.fromisoformat(str(conference["start"]))
    day_count = int(conference["days"])

    for index in range(1, day_count + 1):
        day = lxml.etree.SubElement(root, "day")
        day.set("index", str(index))
        day.set("date", (first_day + timedelta(days=index - 1)).isoformat())

        for room_name in rooms:
            room = lxml.etree.SubElement(day, "room", name=room_name)

            for slot in grouped[index][room_name]:
                build_event(slot, room)

    return root
|
||||
|
||||
|
||||
def main(filename: str) -> None:
    """Read the wikitext programme in ``filename`` and print it as schedule XML."""
    sessions = read_wikitext(filename)
    schedule = generate_schedule(sessions)

    xml_bytes = lxml.etree.tostring(
        schedule,
        xml_declaration=True,
        encoding="utf-8",
        pretty_print=True,
    )
    xml_text = xml_bytes.decode("utf-8")

    # The serialised document already ends with a newline.
    print(xml_text, end="")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exit with a usage hint rather than an IndexError traceback when the
    # wikitext path is missing; extra arguments are still ignored.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} WIKITEXT_FILE")
    main(sys.argv[1])
|
Loading…
Reference in a new issue