Refactor add-new-conference into agenda module

This commit is contained in:
Edward Betts 2026-03-20 10:36:08 +00:00
parent 2cc25cd203
commit 76360c25f3
3 changed files with 634 additions and 455 deletions

View file

@ -0,0 +1,467 @@
"""Helpers for adding conferences to the YAML data file."""
import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse
import html2text
import lxml.html # type: ignore[import-untyped]
import openai
import pycountry
import requests
import yaml
# User-Agent sent with page fetches so site operators can identify this tool.
USER_AGENT = "add-new-conference/0.1"

# Google Maps URL forms that embed a latitude,longitude pair, in the order
# they are tried: @lat,lon · ?q=lat,lon · ?ll=/?center=lat,lon ·
# !3d<lat>!4d<lon> path data · ?destination=lat,lon.
# Each pattern captures latitude in group 1 and longitude in group 2.
COORDINATE_PATTERNS = (
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)
def read_api_key() -> str:
    """Read the OpenAI API key from ~/.config/openai/config."""
    parser = configparser.ConfigParser()
    parser.read(os.path.expanduser("~/.config/openai/config"))
    return parser["openai"]["api_key"]
def build_prompt(
url: str,
source_text: str,
detected_coordinates: tuple[float, float] | None,
) -> str:
"""Build prompt with embedded YAML examples."""
examples = """
- name: Geomob London
topic: Maps
location: London
country: gb
start: 2026-01-28 18:00:00+00:00
end: 2026-01-28 23:00:00+00:00
url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
venue: Geovation Hub
address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
latitude: 51.5242464
longitude: -0.0997024
free: true
going: true
hashtag: '#geomobLON'
- name: DebConf 25
topic: Debian
location: Plouzané (Breast)
country: fr
start: 2025-07-07
end: 2025-07-20
url: https://wiki.debian.org/DebConf/25
going: true
cfp_url: https://debconf25.debconf.org/talks/new/
venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
campus de Brest
latitude: 48.35934
longitude: -4.569889
- name: Wikimedia Hackathon
topic: Wikimedia
location: Istanbul
country: tr
start: 2025-05-02
end: 2025-05-04
venue: Renaissance Polat Istanbul Hotel
address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
latitude: 40.959946
longitude: 28.838763
url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
going: true
free: true
hackathon: true
registered: true
"""
coordinate_note = ""
if detected_coordinates is not None:
coordinate_note = (
"\nDetected venue coordinates from a map link on the page:\n"
f"latitude: {detected_coordinates[0]}\n"
f"longitude: {detected_coordinates[1]}\n"
)
prompt = f"""
I keep a record of interesting conferences in a YAML file.
Here are some examples of the format I use:
{examples}
Now here is a new conference of interest:
Conference URL: {url}
Return the YAML representation for this conference following the
same style and keys as the examples. Only include keys if the
information is available. Do not invent details.
Important: the `country` field must always be a valid ISO 3166-1 alpha-2
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
Do not output full country names.
Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
{coordinate_note}
Wrap your answer in a JSON object with a single key "yaml".
===
{source_text}
"""
return prompt
def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Send *prompt* to OpenAI and return the parsed JSON-object reply."""
    api_client = openai.OpenAI(api_key=read_api_key())
    completion = api_client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        response_format={"type": "json_object"},
    )
    raw_reply = completion.choices[0].message.content
    assert isinstance(raw_reply, str)
    return typing.cast(dict[str, str], json.loads(raw_reply))
def fetch_webpage(url: str) -> lxml.html.HtmlElement:
    """Download *url* and return the parsed HTML root element."""
    page = requests.get(url, headers={"User-Agent": USER_AGENT})
    page.raise_for_status()
    return lxml.html.fromstring(page.content)
def webpage_to_text(root: lxml.html.HtmlElement) -> str:
    """Render parsed HTML as readable plain text, dropping scripts and styles."""
    # Serialise and re-parse so the caller's tree is never mutated.
    working_copy = lxml.html.fromstring(lxml.html.tostring(root))
    for node in working_copy.xpath("//script|//style"):
        node.drop_tree()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(lxml.html.tostring(working_copy, encoding="unicode"))
def parse_osm_url(url: str) -> tuple[float, float] | None:
"""Extract latitude/longitude from an OpenStreetMap URL."""
parsed = urlparse(url)
query = parse_qs(parsed.query)
mlat = query.get("mlat")
mlon = query.get("mlon")
if mlat and mlon:
return float(mlat[0]), float(mlon[0])
if parsed.fragment.startswith("map="):
parts = parsed.fragment.split("/")
if len(parts) >= 3:
return float(parts[-2]), float(parts[-1])
return None
def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL.

    Tries each known URL pattern in order; returns None when none match.
    """
    for found in (pattern.search(url) for pattern in COORDINATE_PATTERNS):
        if found is not None:
            return float(found.group(1)), float(found.group(2))
    return None
def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a (possibly short) Google Maps URL and extract coordinates.

    Follows redirects, tries the final URL first, then falls back to
    scanning the response body.  Returns None when nothing is found.
    """
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        # Use the module-wide User-Agent for consistency; the previous
        # "lookup.py/1.0" value was a leftover from the script this
        # helper was copied from.
        headers={"User-Agent": USER_AGENT},
    )
    response.raise_for_status()
    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates
    return extract_google_maps_latlon(response.text)
def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Dispatch coordinate extraction for supported map providers."""
    lowered = url.lower()
    if "openstreetmap.org" in lowered:
        return parse_osm_url(url)
    if "google." not in lowered and "maps.app.goo.gl" not in lowered:
        return None
    direct = extract_google_maps_latlon(url)
    if direct is not None:
        return direct
    try:
        # Short links need a network round-trip to resolve.
        return latlon_from_google_maps_url(url)
    except requests.RequestException:
        return None
def detect_page_coordinates(root: lxml.html.HtmlElement) -> tuple[float, float] | None:
    """Return coordinates from the first supported map link on the page."""
    for anchor in root.xpath("//a[@href]"):
        href = str(anchor.get("href", "")).strip()
        if not href:
            continue
        found = parse_coordinates_from_url(href)
        if found is not None:
            return found
    return None
def parse_date(date_str: str) -> datetime:
    """Parse an ISO date or datetime string into a naive datetime.

    Timezone-aware values are converted to UTC and stripped of tzinfo.
    If the full value will not parse, only the YYYY-MM-DD prefix is used.
    """
    try:
        parsed = datetime.fromisoformat(date_str)
    except ValueError:
        # Fall back to the date portion before any "T" separator.
        parsed = datetime.fromisoformat(date_str.split("T")[0])
    if parsed.tzinfo is None:
        return parsed
    return parsed.astimezone(timezone.utc).replace(tzinfo=None)
def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (treated as year-specific)."""
    return any(map(str.isdigit, url))
def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert *new_conf* in start-date order, skipping duplicate URLs.

    A URL containing a digit is treated as year-specific, so any exact URL
    match is a duplicate; otherwise only a same-URL, same-year entry is a
    duplicate and the URL may be reused for a different year.
    """
    target_url = new_conf.get("url")
    target_start = parse_date(str(new_conf["start"]))
    if target_url:
        url_is_year_specific = url_has_year_component(target_url)
        for existing in conferences:
            if existing.get("url") != target_url:
                continue
            existing_year = parse_date(str(existing["start"])).year
            if url_is_year_specific:
                print(f"⚠️ Conference with URL {target_url} already exists, skipping.")
                return conferences
            if existing_year == target_start.year:
                print(
                    f"⚠️ Conference already exists in YAML "
                    f"(url={target_url}, year={existing_year}), skipping."
                )
                return conferences
    # Find the first entry that starts later; default to appending.
    insert_at = len(conferences)
    for position, existing in enumerate(conferences):
        if target_start < parse_date(str(existing["start"])):
            insert_at = position
            break
    conferences.insert(insert_at, new_conf)
    return conferences
def validate_country(conf: dict[str, typing.Any]) -> None:
    """Normalise ``conf["country"]`` to a lowercase ISO 3166-1 alpha-2 code.

    Mutates *conf* in place; missing/empty country is left alone.
    Raises ValueError when the value cannot be resolved to a country.
    """
    raw = conf.get("country")
    if not raw:
        return
    raw = raw.strip()
    if len(raw) == 2:
        # Already looks like an alpha-2 code: just confirm it exists.
        if not pycountry.countries.get(alpha_2=raw.upper()):
            raise ValueError(f"❌ Invalid ISO 3166-1 code '{raw}'")
        conf["country"] = raw.lower()
        return
    country = pycountry.countries.get(name=raw)
    if not country:
        # Fuzzy search copes with e.g. "United States" vs the official name.
        try:
            country = pycountry.countries.search_fuzzy(raw)[0]
        except LookupError as exc:
            raise ValueError(
                f"❌ Country '{raw}' not recognised as ISO 3166-1"
            ) from exc
    conf["country"] = country.alpha_2.lower()
def parse_yaml_datetime(value: typing.Any) -> datetime | None:
"""Convert YAML date/datetime values to a datetime."""
if isinstance(value, datetime):
return value
if isinstance(value, date):
return datetime.combine(value, time())
if isinstance(value, str):
try:
return datetime.fromisoformat(value)
except ValueError:
return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
return None
def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Shape *new_dt* like *start_value* so start/end stay consistent.

    datetime start -> datetime end (tz stripped unless keep_timezone);
    date start -> date end unless prefer_datetime; string start -> ISO
    string matching the original's date/datetime granularity; anything
    else returns *new_dt* unchanged.
    """
    # datetime before date: datetime is a date subclass.
    if isinstance(start_value, datetime):
        return new_dt if keep_timezone else new_dt.replace(tzinfo=None)
    if isinstance(start_value, date):
        return new_dt if prefer_datetime else new_dt.date()
    if isinstance(start_value, str):
        looks_like_datetime = " " in start_value or "T" in start_value
        if prefer_datetime or looks_like_datetime:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()
    return new_dt
def maybe_extract_explicit_end_time(source_text: str) -> int | None:
"""Extract an explicit 12-hour clock end time for Geomob-style pages."""
lowered = source_text.lower()
if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
return 22
if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
return 23
return None
def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback.

    Mutates ``new_conf`` in place.  Non-Geomob events without an ``end``
    get one shaped like ``start``; Geomob events always get an explicit
    end datetime (22:00 unless the page text names a known later time).
    """
    start_value = new_conf.get("start")
    if start_value is None:
        return
    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        # start is present but not date-like; nothing sensible to derive.
        return
    name = str(new_conf.get("name", ""))
    url = str(new_conf.get("url", ""))
    is_geomob = "geomob" in name.lower() or "thegeomob.com" in url.lower()
    if is_geomob:
        end_hour = maybe_extract_explicit_end_time(source_text)
        if end_hour is None:
            end_hour = 22  # default Geomob end time, mirroring the prompt rules
        geomob_end = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
        # prefer_datetime=True: a Geomob end is always a datetime, even
        # when start was a bare date.  Overwrites any model-provided end.
        new_conf["end"] = same_type_as_start(
            start_value, geomob_end, prefer_datetime=True
        )
        return
    if "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)
def load_conferences(yaml_path: str) -> list[dict[str, typing.Any]]:
    """Load the conference list from *yaml_path*.

    Raises:
        TypeError: if the file does not contain a YAML list.  This was an
            ``assert`` before, but ``assert`` is stripped under ``-O`` and
            must not be used to validate external file contents.
    """
    with open(yaml_path) as file:
        loaded = yaml.safe_load(file)
    if not isinstance(loaded, list):
        raise TypeError(
            f"{yaml_path} must contain a YAML list, got {type(loaded).__name__}"
        )
    return typing.cast(list[dict[str, typing.Any]], loaded)
def dump_conferences(yaml_path: str, conferences: list[dict[str, typing.Any]]) -> None:
    """Write the conference list back to *yaml_path* as YAML."""
    serialised = yaml.dump(conferences, sort_keys=False, allow_unicode=True)
    # Blank line between entries keeps the hand-edited file readable.
    serialised = serialised.replace("\n- name:", "\n\n- name:")
    with open(yaml_path, "w") as file:
        file.write(serialised.lstrip())
def add_new_conference(url: str, yaml_path: str) -> bool:
    """Fetch *url*, generate a conference entry via the LLM and insert it.

    Args:
        url: Conference page to import.
        yaml_path: Path of the conferences YAML file to update.

    Returns:
        True when the YAML file was updated, False when the URL was
        already present.  Year-specific URLs (any digit in the URL) are
        checked before the API call to avoid a pointless request.

    Raises:
        TypeError: if the model reply is not a YAML mapping.
    """
    conferences = load_conferences(yaml_path)
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return False
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]
    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        # The model occasionally wraps the single entry in a list.
        new_conf = new_conf[0]
    if not isinstance(new_conf, dict):
        # Explicit raise instead of assert: this guards untrusted model
        # output and assert is stripped under -O.
        raise TypeError(
            f"Expected a YAML mapping from the model, got {type(new_conf).__name__}"
        )
    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)
    if detected_coordinates is not None:
        # Page-derived coordinates beat whatever the model produced.
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]
    updated = insert_sorted(conferences, new_conf)
    dump_conferences(yaml_path, updated)
    return True
def main(argv: list[str] | None = None) -> int:
    """CLI entrypoint: import the conference at the URL given as argv[0]."""
    arguments = sys.argv[1:] if argv is None else argv
    if not arguments:
        raise SystemExit("Usage: add-new-conference URL")
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")
    add_new_conference(arguments[0], yaml_path)
    return 0

View file

@ -1,463 +1,15 @@
#!/usr/bin/python3
import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse
import html2text
import openai
import pycountry
import requests
import yaml
from bs4 import BeautifulSoup
user_agent = "add-new-conference/0.1"
coordinate_patterns = (
re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)
def read_api_key() -> str:
"""Read API key from ~/.config/openai/config."""
config_path = os.path.expanduser("~/.config/openai/config")
parser = configparser.ConfigParser()
parser.read(config_path)
return parser["openai"]["api_key"]
def build_prompt(
url: str,
source_text: str,
detected_coordinates: tuple[float, float] | None,
) -> str:
"""Build prompt with embedded YAML examples."""
examples = """
- name: Geomob London
topic: Maps
location: London
country: gb
start: 2026-01-28 18:00:00+00:00
end: 2026-01-28 23:00:00+00:00
url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
venue: Geovation Hub
address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
latitude: 51.5242464
longitude: -0.0997024
free: true
going: true
hashtag: '#geomobLON'
- name: DebConf 25
topic: Debian
location: Plouzané (Breast)
country: fr
start: 2025-07-07
end: 2025-07-20
url: https://wiki.debian.org/DebConf/25
going: true
cfp_url: https://debconf25.debconf.org/talks/new/
venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
campus de Brest
latitude: 48.35934
longitude: -4.569889
- name: Wikimedia Hackathon
topic: Wikimedia
location: Istanbul
country: tr
start: 2025-05-02
end: 2025-05-04
venue: Renaissance Polat Istanbul Hotel
address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
latitude: 40.959946
longitude: 28.838763
url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
going: true
free: true
hackathon: true
registered: true
"""
coordinate_note = ""
if detected_coordinates is not None:
coordinate_note = (
"\nDetected venue coordinates from a map link on the page:\n"
f"latitude: {detected_coordinates[0]}\n"
f"longitude: {detected_coordinates[1]}\n"
)
prompt = f"""
I keep a record of interesting conferences in a YAML file.
Here are some examples of the format I use:
{examples}
Now here is a new conference of interest:
Conference URL: {url}
Return the YAML representation for this conference following the
same style and keys as the examples. Only include keys if the
information is available. Do not invent details.
Important: the `country` field must always be a valid ISO 3166-1 alpha-2
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
Do not output full country names.
Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
{coordinate_note}
Wrap your answer in a JSON object with a single key "yaml".
===
{source_text}
"""
return prompt
def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
"""Pass prompt to OpenAI and get reply."""
client = openai.OpenAI(api_key=read_api_key())
response = client.chat.completions.create(
messages=[{"role": "user", "content": prompt}],
model=model,
response_format={"type": "json_object"},
)
reply = response.choices[0].message.content
assert isinstance(reply, str)
return typing.cast(dict[str, str], json.loads(reply))
def fetch_webpage(url: str) -> BeautifulSoup:
"""Fetch webpage HTML and parse it."""
response = requests.get(url, headers={"User-Agent": user_agent})
response.raise_for_status()
return BeautifulSoup(response.content, "lxml")
def webpage_to_text(soup: BeautifulSoup) -> str:
"""Convert parsed HTML into readable text content."""
soup_copy = BeautifulSoup(str(soup), "lxml")
for script_or_style in soup_copy(["script", "style"]):
script_or_style.decompose()
text_maker = html2text.HTML2Text()
text_maker.ignore_links = True
text_maker.ignore_images = True
return text_maker.handle(str(soup_copy))
def parse_osm_url(url: str) -> tuple[float, float] | None:
"""Extract latitude/longitude from an OpenStreetMap URL."""
parsed = urlparse(url)
query = parse_qs(parsed.query)
mlat = query.get("mlat")
mlon = query.get("mlon")
if mlat and mlon:
return float(mlat[0]), float(mlon[0])
if parsed.fragment.startswith("map="):
parts = parsed.fragment.split("/")
if len(parts) >= 3:
return float(parts[-2]), float(parts[-1])
return None
def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
"""Extract latitude/longitude from a Google Maps URL."""
for pattern in coordinate_patterns:
match = pattern.search(url)
if match:
return float(match.group(1)), float(match.group(2))
return None
def latlon_from_google_maps_url(
url: str, timeout: int = 10
) -> tuple[float, float] | None:
"""Resolve a Google Maps URL and extract latitude/longitude."""
response = requests.get(
url,
allow_redirects=True,
timeout=timeout,
headers={"User-Agent": "lookup.py/1.0"},
)
response.raise_for_status()
coordinates = extract_google_maps_latlon(response.url)
if coordinates is not None:
return coordinates
return extract_google_maps_latlon(response.text)
def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
"""Extract latitude/longitude from a supported map URL."""
lower_url = url.lower()
if "openstreetmap.org" in lower_url:
return parse_osm_url(url)
if "google." in lower_url or "maps.app.goo.gl" in lower_url:
coordinates = extract_google_maps_latlon(url)
if coordinates is not None:
return coordinates
try:
return latlon_from_google_maps_url(url)
except requests.RequestException:
return None
return None
def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
"""Detect venue coordinates from Google Maps or OSM links."""
for link in soup.find_all("a", href=True):
href = str(link["href"]).strip()
if not href:
continue
coordinates = parse_coordinates_from_url(href)
if coordinates is not None:
return coordinates
return None
def parse_date(date_str: str) -> datetime:
"""Parse ISO date or datetime into a naive datetime (UTC if tz-aware)."""
try:
dt = datetime.fromisoformat(date_str)
except ValueError:
# fallback: just take the YYYY-MM-DD part
dt = datetime.fromisoformat(date_str.split("T")[0])
if dt.tzinfo is not None:
# normalise tz-aware datetimes to UTC, then strip tzinfo
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
return dt
def url_has_year_component(url: str) -> bool:
"""Return True if the URL contains any digit (assume year-specific)."""
return any(ch.isdigit() for ch in url)
def insert_sorted(
conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
"""Insert new_conf into conferences sorted by start date, skip if duplicate URL (with year awareness)."""
new_url = new_conf.get("url")
new_start = parse_date(str(new_conf["start"]))
new_year = new_start.year
if new_url:
for conf in conferences:
if conf.get("url") == new_url:
existing_start = parse_date(str(conf["start"]))
existing_year = existing_start.year
if url_has_year_component(new_url):
# If URL has a year in it, treat exact URL as unique
print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
return conferences
elif existing_year == new_year:
# Same URL, same year → definitely duplicate
print(
f"⚠️ Conference already exists in YAML "
f"(url={new_url}, year={existing_year}), skipping."
)
return conferences
else:
# Same URL reused for different year → allow new entry
continue
# Insert sorted by start date
for idx, conf in enumerate(conferences):
existing_start = parse_date(str(conf["start"]))
if new_start < existing_start:
conferences.insert(idx, new_conf)
return conferences
conferences.append(new_conf)
return conferences
def validate_country(conf: dict[str, typing.Any]) -> None:
"""Ensure country is a valid ISO 3166-1 alpha-2 code, normalise if possible."""
country = conf.get("country")
if not country:
return
country = country.strip()
# Already a 2-letter code
if len(country) == 2:
if pycountry.countries.get(alpha_2=country.upper()):
conf["country"] = country.lower()
return
else:
raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")
# Try lookup by name
match = pycountry.countries.get(name=country)
if not match:
# fuzzy lookup (handles “United States” vs “United States of America”)
try:
match = pycountry.countries.search_fuzzy(country)[0]
except LookupError:
raise ValueError(f"❌ Country '{country}' not recognised as ISO 3166-1")
conf["country"] = match.alpha_2.lower()
def parse_yaml_datetime(value: typing.Any) -> datetime | None:
"""Convert YAML date/datetime values to a datetime."""
if isinstance(value, datetime):
return value
if isinstance(value, date):
return datetime.combine(value, time())
if isinstance(value, str):
try:
return datetime.fromisoformat(value)
except ValueError:
return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
return None
def same_type_as_start(
start_value: typing.Any,
new_dt: datetime,
keep_timezone: bool = True,
prefer_datetime: bool = False,
) -> typing.Any:
"""Return end value shaped like the start value when possible."""
if isinstance(start_value, datetime):
if keep_timezone:
return new_dt
return new_dt.replace(tzinfo=None)
if isinstance(start_value, date):
if prefer_datetime:
return new_dt
return new_dt.date()
if isinstance(start_value, str):
if prefer_datetime or " " in start_value or "T" in start_value:
return new_dt.isoformat(sep=" ")
return new_dt.date().isoformat()
return new_dt
def maybe_extract_explicit_end_time(source_text: str) -> int | None:
"""Extract an explicit 12-hour clock end time for Geomob-style pages."""
lowered = source_text.lower()
if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
return 22
if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
return 23
return None
def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
"""Ensure an end value exists, with a Geomob-specific fallback."""
start_value = new_conf.get("start")
if start_value is None:
return
start_dt = parse_yaml_datetime(start_value)
if start_dt is None:
return
name = str(new_conf.get("name", ""))
url = str(new_conf.get("url", ""))
is_geomob = "geomob" in name.lower() or "thegeomob.com" in url.lower()
if is_geomob:
end_hour = maybe_extract_explicit_end_time(source_text)
if end_hour is None:
end_hour = 22
geomob_end = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
new_conf["end"] = same_type_as_start(
start_value, geomob_end, prefer_datetime=True
)
return
if "end" not in new_conf:
new_conf["end"] = same_type_as_start(start_value, start_dt)
def main() -> None:
"""Fetch page, generate YAML via LLM, update conferences.yaml."""
url = sys.argv[1]
yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")
# Load conferences first
with open(yaml_path) as f:
conferences = yaml.safe_load(f)
# Early exit: if URL contains a year and already exists, skip
if url_has_year_component(url):
for conf in conferences:
if conf.get("url") == url:
print(
"⚠️ Conference already exists in YAML "
+ f"(url={url}), skipping before API call."
)
return
# Otherwise proceed with full workflow
soup = fetch_webpage(url)
source_text = webpage_to_text(soup)
detected_coordinates = detect_page_coordinates(soup)
prompt = build_prompt(url, source_text, detected_coordinates)
new_yaml_text = get_from_open_ai(prompt)["yaml"]
new_conf = yaml.safe_load(new_yaml_text)
if isinstance(new_conf, list):
new_conf = new_conf[0]
validate_country(new_conf)
normalise_end_field(new_conf, source_text)
if detected_coordinates is not None:
new_conf["latitude"] = detected_coordinates[0]
new_conf["longitude"] = detected_coordinates[1]
updated = insert_sorted(conferences, new_conf)
with open(yaml_path, "w") as f:
text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
text = text.replace("\n- name:", "\n\n- name:") # keep blank lines
f.write(text.lstrip())
# Thin wrapper: the implementation now lives in agenda.add_new_conference.
# Make the repository root importable when run as a standalone script.
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
REPO_ROOT = os.path.dirname(SCRIPT_DIR)
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

from agenda.add_new_conference import main

if __name__ == "__main__":
    # Single invocation; as written the guard called main() twice — once
    # bare and once inside SystemExit — running the whole import flow two
    # times.  Exit with main's return code.
    raise SystemExit(main())

View file

@ -0,0 +1,160 @@
"""Tests for agenda.add_new_conference."""
from datetime import date, datetime
import typing
import lxml.html # type: ignore[import-untyped]
import pytest
import yaml
from agenda import add_new_conference
def test_parse_osm_url_mlat_mlon() -> None:
    """mlat/mlon query parameters on an OSM URL should be extracted."""
    url = "https://www.openstreetmap.org/?mlat=51.5&mlon=-0.12"
    assert add_new_conference.parse_osm_url(url) == (51.5, -0.12)
def test_extract_google_maps_latlon_at_pattern() -> None:
    """The @lat,lon form of a Google Maps URL should be extracted."""
    url = "https://www.google.com/maps/place/Venue/@51.5242464,-0.0997024,17z/"
    assert add_new_conference.extract_google_maps_latlon(url) == (
        51.5242464,
        -0.0997024,
    )
def test_insert_sorted_allows_same_url_different_year_without_year_component() -> None:
    """A non-year-specific URL may be reused for a different year's event."""
    existing: list[dict[str, typing.Any]] = [
        {
            "name": "OldConf",
            "start": date(2025, 6, 1),
            "url": "https://example.com/conf",
        }
    ]
    newer: dict[str, typing.Any] = {
        "name": "NewConf",
        "start": date(2026, 6, 1),
        "url": "https://example.com/conf",
    }
    result = add_new_conference.insert_sorted(existing, newer)
    assert [conf["name"] for conf in result] == ["OldConf", "NewConf"]
def test_validate_country_normalises_name() -> None:
    """A full country name should be normalised to its alpha-2 code."""
    conf: dict[str, typing.Any] = {"country": "United Kingdom"}
    add_new_conference.validate_country(conf)
    assert conf == {"country": "gb"}
def test_normalise_end_field_defaults_single_day_date() -> None:
    """A non-Geomob event without an end should end on its start date."""
    conf: dict[str, typing.Any] = {"name": "PyCon", "start": date(2026, 4, 10)}
    add_new_conference.normalise_end_field(conf, "plain text")
    assert conf["end"] == date(2026, 4, 10)
def test_normalise_end_field_sets_geomob_end_time() -> None:
    """A Geomob event should be given the default 22:00 end datetime."""
    conf: dict[str, typing.Any] = {
        "name": "Geomob London",
        "start": date(2026, 1, 28),
        "url": "https://thegeomob.com/post/jan-28th-2026-geomoblon-details",
    }
    add_new_conference.normalise_end_field(conf, "see you there")
    assert conf["end"] == datetime(2026, 1, 28, 22, 0)
def test_detect_page_coordinates_uses_first_supported_link() -> None:
    """Detection should skip unsupported anchors and use the first map link."""
    html = (
        "<html><body>"
        '<a href="https://example.com">Example</a>'
        '<a href="https://www.openstreetmap.org/?mlat=51.5&mlon=-0.12">Map</a>'
        "</body></html>"
    )
    root = lxml.html.fromstring(html)
    assert add_new_conference.detect_page_coordinates(root) == (51.5, -0.12)
def test_add_new_conference_updates_yaml(
    tmp_path: typing.Any, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The end-to-end import flow should append a generated conference."""
    # Seed the YAML file with one existing conference.
    yaml_path = tmp_path / "conferences.yaml"
    yaml_path.write_text(
        yaml.dump(
            [
                {
                    "name": "ExistingConf",
                    "start": date(2026, 4, 1),
                    "end": date(2026, 4, 2),
                    "url": "https://example.com/existing",
                }
            ],
            sort_keys=False,
        )
    )
    # Fake fetched page: a single OSM link supplies the venue coordinates.
    root = lxml.html.fromstring(
        (
            "<html><body>"
            '<a href="https://www.openstreetmap.org/?mlat=40.0&mlon=-74.0">Map</a>'
            "</body></html>"
        )
    )
    # Stub the network and LLM boundaries so no real requests are made.
    monkeypatch.setattr(add_new_conference, "fetch_webpage", lambda url: root)
    monkeypatch.setattr(
        add_new_conference,
        "webpage_to_text",
        lambda parsed: "Conference details",
    )
    # The stubbed model reply uses a full country name ("United States")
    # so the flow also exercises validate_country's normalisation, and
    # omits `end` to exercise the single-day default.
    monkeypatch.setattr(
        add_new_conference,
        "get_from_open_ai",
        lambda prompt: {
            "yaml": yaml.dump(
                {
                    "name": "NewConf",
                    "topic": "Tech",
                    "location": "New York",
                    "country": "United States",
                    "start": date(2026, 5, 3),
                    "url": "https://example.com/newconf",
                },
                sort_keys=False,
            )
        },
    )
    added = add_new_conference.add_new_conference(
        "https://example.com/newconf", str(yaml_path)
    )
    assert added is True
    written = yaml.safe_load(yaml_path.read_text())
    assert len(written) == 2
    assert written[1]["name"] == "NewConf"
    assert written[1]["country"] == "us"
    # Single-day default end; page-derived coordinates override the model.
    assert written[1]["end"] == date(2026, 5, 3)
    assert written[1]["latitude"] == 40.0
    assert written[1]["longitude"] == -74.0