Refactor add-new-conference into agenda module

This commit is contained in:
Edward Betts 2026-03-20 10:36:08 +00:00
parent 2cc25cd203
commit 76360c25f3
3 changed files with 634 additions and 455 deletions

View file

@@ -1,463 +1,15 @@
#!/usr/bin/python3
import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse
import html2text
import openai
import pycountry
import requests
import yaml
from bs4 import BeautifulSoup
# User-Agent header sent on all outgoing HTTP requests made by this script.
user_agent = "add-new-conference/0.1"

# Regexes that extract a "latitude,longitude" pair from Google Maps URLs.
# Group 1 is latitude, group 2 is longitude in every pattern:
#   @lat,lng          — path coordinate after "@"
#   ?q=lat,lng        — search query parameter
#   ?ll= / ?center=   — map-centre parameters
#   !3d{lat}!4d{lng}  — embedded data-block markers
#   ?destination=     — directions target
coordinate_patterns = (
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)
def read_api_key() -> str:
    """Load the OpenAI API key from ~/.config/openai/config.

    Expects an INI file with an ``[openai]`` section containing ``api_key``.
    """
    parser = configparser.ConfigParser()
    parser.read(os.path.expanduser("~/.config/openai/config"))
    return parser["openai"]["api_key"]
def build_prompt(
url: str,
source_text: str,
detected_coordinates: tuple[float, float] | None,
) -> str:
"""Build prompt with embedded YAML examples."""
examples = """
- name: Geomob London
topic: Maps
location: London
country: gb
start: 2026-01-28 18:00:00+00:00
end: 2026-01-28 23:00:00+00:00
url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
venue: Geovation Hub
address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
latitude: 51.5242464
longitude: -0.0997024
free: true
going: true
hashtag: '#geomobLON'
- name: DebConf 25
topic: Debian
location: Plouzané (Breast)
country: fr
start: 2025-07-07
end: 2025-07-20
url: https://wiki.debian.org/DebConf/25
going: true
cfp_url: https://debconf25.debconf.org/talks/new/
venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
campus de Brest
latitude: 48.35934
longitude: -4.569889
- name: Wikimedia Hackathon
topic: Wikimedia
location: Istanbul
country: tr
start: 2025-05-02
end: 2025-05-04
venue: Renaissance Polat Istanbul Hotel
address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
latitude: 40.959946
longitude: 28.838763
url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
going: true
free: true
hackathon: true
registered: true
"""
coordinate_note = ""
if detected_coordinates is not None:
coordinate_note = (
"\nDetected venue coordinates from a map link on the page:\n"
f"latitude: {detected_coordinates[0]}\n"
f"longitude: {detected_coordinates[1]}\n"
)
prompt = f"""
I keep a record of interesting conferences in a YAML file.
Here are some examples of the format I use:
{examples}
Now here is a new conference of interest:
Conference URL: {url}
Return the YAML representation for this conference following the
same style and keys as the examples. Only include keys if the
information is available. Do not invent details.
Important: the `country` field must always be a valid ISO 3166-1 alpha-2
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
Do not output full country names.
Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
{coordinate_note}
Wrap your answer in a JSON object with a single key "yaml".
===
{source_text}
"""
return prompt
def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Send *prompt* to the OpenAI chat API and return the parsed JSON reply.

    :param prompt: full prompt text (must ask for a JSON object reply, since
        ``response_format`` forces JSON output).
    :param model: OpenAI model name.
    :return: the reply parsed as a JSON object.
    :raises ValueError: if the API returns no message content.
    """
    client = openai.OpenAI(api_key=read_api_key())
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        response_format={"type": "json_object"},
    )
    reply = response.choices[0].message.content
    if not isinstance(reply, str):
        # An assert here would be stripped under `python -O`; raise instead.
        raise ValueError("OpenAI response contained no message content")
    return typing.cast(dict[str, str], json.loads(reply))
def fetch_webpage(url: str, timeout: int = 30) -> BeautifulSoup:
    """Fetch webpage HTML and parse it.

    :param url: page to download.
    :param timeout: request timeout in seconds — requests has no default
        timeout, so without one a stalled server would hang forever.
    :return: the parsed document.
    :raises requests.HTTPError: on a non-2xx response.
    """
    response = requests.get(
        url, headers={"User-Agent": user_agent}, timeout=timeout
    )
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")
def webpage_to_text(soup: BeautifulSoup) -> str:
    """Convert parsed HTML into readable text content."""
    # Re-parse a serialised copy so the caller's tree is never mutated.
    working_copy = BeautifulSoup(str(soup), "lxml")
    for tag in working_copy(["script", "style"]):
        tag.decompose()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(str(working_copy))
def parse_osm_url(url: str) -> tuple[float, float] | None:
"""Extract latitude/longitude from an OpenStreetMap URL."""
parsed = urlparse(url)
query = parse_qs(parsed.query)
mlat = query.get("mlat")
mlon = query.get("mlon")
if mlat and mlon:
return float(mlat[0]), float(mlon[0])
if parsed.fragment.startswith("map="):
parts = parsed.fragment.split("/")
if len(parts) >= 3:
return float(parts[-2]), float(parts[-1])
return None
def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL.

    Tries each module-level coordinate pattern in turn; the first match wins.
    """
    for pattern in coordinate_patterns:
        found = pattern.search(url)
        if found is None:
            continue
        latitude, longitude = found.group(1), found.group(2)
        return float(latitude), float(longitude)
    return None
def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a Google Maps URL and extract latitude/longitude.

    Follows redirects (short links like maps.app.goo.gl expand to full URLs),
    then checks first the final URL and then the page body for coordinates.

    :param url: Google Maps link, possibly a short link.
    :param timeout: request timeout in seconds.
    :raises requests.HTTPError: on a non-2xx response.
    """
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        # Use the script-wide User-Agent rather than a stale hard-coded one.
        headers={"User-Agent": user_agent},
    )
    response.raise_for_status()
    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates
    # Fall back to scanning the response body for embedded coordinates.
    return extract_google_maps_latlon(response.text)
def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a supported map URL.

    Supports OpenStreetMap and Google Maps links; anything else yields None.
    """
    url_lower = url.lower()
    if "openstreetmap.org" in url_lower:
        return parse_osm_url(url)
    if "google." not in url_lower and "maps.app.goo.gl" not in url_lower:
        return None
    # Google link: try the URL text itself before resolving over the network.
    found = extract_google_maps_latlon(url)
    if found is None:
        try:
            found = latlon_from_google_maps_url(url)
        except requests.RequestException:
            # Best-effort: a network failure just means no coordinates.
            found = None
    return found
def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
    """Detect venue coordinates from Google Maps or OSM links.

    Scans every anchor on the page and returns the first pair found.
    """
    for anchor in soup.find_all("a", href=True):
        target = str(anchor["href"]).strip()
        if not target:
            continue
        found = parse_coordinates_from_url(target)
        if found is not None:
            return found
    return None
def parse_date(date_str: str) -> datetime:
    """Parse ISO date or datetime into a naive datetime (UTC if tz-aware)."""
    try:
        parsed = datetime.fromisoformat(date_str)
    except ValueError:
        # Fallback for unparseable time suffixes: keep only the YYYY-MM-DD part.
        parsed = datetime.fromisoformat(date_str.split("T")[0])
    if parsed.tzinfo is None:
        return parsed
    # Normalise tz-aware datetimes to UTC, then strip tzinfo so all
    # returned values compare cleanly with naive ones.
    return parsed.astimezone(timezone.utc).replace(tzinfo=None)
def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (assume year-specific)."""
    return bool(re.search(r"\d", url))
def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert new_conf into conferences sorted by start date, skip if duplicate URL (with year awareness).

    Duplicate rules when new_conf has a url matching an existing entry:
    - URL contains a digit (assumed year-specific): always treated as a
      duplicate, entry skipped.
    - Year-less URL with the same start year: duplicate, entry skipped.
    - Year-less URL reused for a different year: allowed as a new entry.

    Mutates *conferences* in place and also returns it.
    """
    new_url = new_conf.get("url")
    new_start = parse_date(str(new_conf["start"]))
    new_year = new_start.year
    if new_url:
        for conf in conferences:
            if conf.get("url") == new_url:
                existing_start = parse_date(str(conf["start"]))
                existing_year = existing_start.year
                if url_has_year_component(new_url):
                    # If URL has a year in it, treat exact URL as unique
                    print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
                    return conferences
                elif existing_year == new_year:
                    # Same URL, same year → definitely duplicate
                    print(
                        f"⚠️ Conference already exists in YAML "
                        f"(url={new_url}, year={existing_year}), skipping."
                    )
                    return conferences
                else:
                    # Same URL reused for different year → allow new entry
                    continue
    # Insert sorted by start date: place before the first later-starting entry.
    for idx, conf in enumerate(conferences):
        existing_start = parse_date(str(conf["start"]))
        if new_start < existing_start:
            conferences.insert(idx, new_conf)
            return conferences
    # Starts after every existing conference → append at the end.
    conferences.append(new_conf)
    return conferences
def validate_country(conf: dict[str, typing.Any]) -> None:
    """Ensure country is a valid ISO 3166-1 alpha-2 code, normalise if possible.

    Mutates ``conf["country"]`` in place to the lowercase alpha-2 code.
    A missing or empty country is left untouched.

    :raises ValueError: if the value is neither a valid alpha-2 code nor a
        recognisable country name.
    """
    country = conf.get("country")
    if not country:
        return
    country = country.strip()
    if len(country) == 2:
        # Already looks like an alpha-2 code: validate and lowercase it.
        if pycountry.countries.get(alpha_2=country.upper()):
            conf["country"] = country.lower()
            return
        raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")
    # Full country name: try an exact lookup first.
    match = pycountry.countries.get(name=country)
    if not match:
        # Fuzzy lookup handles variants such as
        # "United States" vs "United States of America".
        try:
            match = pycountry.countries.search_fuzzy(country)[0]
        except LookupError as err:
            # Chain the cause so the original lookup failure is visible.
            raise ValueError(
                f"❌ Country '{country}' not recognised as ISO 3166-1"
            ) from err
    conf["country"] = match.alpha_2.lower()
def parse_yaml_datetime(value: typing.Any) -> datetime | None:
"""Convert YAML date/datetime values to a datetime."""
if isinstance(value, datetime):
return value
if isinstance(value, date):
return datetime.combine(value, time())
if isinstance(value, str):
try:
return datetime.fromisoformat(value)
except ValueError:
return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
return None
def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Return end value shaped like the start value when possible.

    The returned object mirrors the type of *start_value* (datetime, date or
    string) so that YAML round-trips keep a consistent style per entry.
    """
    # datetime before date: datetime is a date subclass.
    if isinstance(start_value, datetime):
        return new_dt if keep_timezone else new_dt.replace(tzinfo=None)
    if isinstance(start_value, date):
        return new_dt if prefer_datetime else new_dt.date()
    if isinstance(start_value, str):
        looks_like_datetime = " " in start_value or "T" in start_value
        if prefer_datetime or looks_like_datetime:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()
    # Unknown start type: hand back the datetime unchanged.
    return new_dt
def maybe_extract_explicit_end_time(source_text: str) -> int | None:
"""Extract an explicit 12-hour clock end time for Geomob-style pages."""
lowered = source_text.lower()
if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
return 22
if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
return 23
return None
def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback.

    Mutates *new_conf* in place. Geomob events always get an end datetime
    (22:00 unless the page states another time); other events keep their
    existing end, or fall back to the start value.
    """
    start_value = new_conf.get("start")
    if start_value is None:
        return
    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return
    event_name = str(new_conf.get("name", "")).lower()
    event_url = str(new_conf.get("url", "")).lower()
    if "geomob" in event_name or "thegeomob.com" in event_url:
        # Geomob events run until 22:00 local unless the page says otherwise.
        end_hour = maybe_extract_explicit_end_time(source_text) or 22
        end_dt = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
        new_conf["end"] = same_type_as_start(
            start_value, end_dt, prefer_datetime=True
        )
    elif "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)
def main() -> None:
    """Fetch page, generate YAML via LLM, update conferences.yaml.

    Usage: pass the conference page URL as the first command-line argument.
    """
    url = sys.argv[1]
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")
    # Load conferences first
    with open(yaml_path) as f:
        conferences = yaml.safe_load(f)
    # Early exit: if URL contains a year and already exists, skip
    # (avoids paying for an OpenAI call on an obvious duplicate).
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return
    # Otherwise proceed with full workflow
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]
    new_conf = yaml.safe_load(new_yaml_text)
    # The model sometimes returns a one-element list instead of a mapping.
    if isinstance(new_conf, list):
        new_conf = new_conf[0]
    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)
    # Coordinates scraped from the page override whatever the model produced.
    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]
    updated = insert_sorted(conferences, new_conf)
    with open(yaml_path, "w") as f:
        text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
        text = text.replace("\n- name:", "\n\n- name:")  # keep blank lines
        f.write(text.lstrip())
# Compatibility shim: the implementation now lives in agenda.add_new_conference.
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
REPO_ROOT = os.path.dirname(SCRIPT_DIR)
# Make the repository root importable when run as a standalone script.
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

from agenda.add_new_conference import main

if __name__ == "__main__":
    # Run main() exactly once and propagate its return value as the exit
    # status (previously it was called twice: once bare, once in SystemExit).
    raise SystemExit(main())