#!/usr/bin/python3
"""Add a new conference to conferences.yaml.

Given a conference URL on the command line: fetch the page, extract venue
coordinates from any Google Maps / OpenStreetMap links, ask OpenAI to emit a
YAML entry in the house style, validate/normalise it, and insert it into
~/src/personal-data/conferences.yaml sorted by start date.
"""
import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse

import html2text
import openai
import pycountry
import requests
import yaml
from bs4 import BeautifulSoup

user_agent = "add-new-conference/0.1"

# URL patterns that carry an explicit latitude/longitude pair, in decreasing
# order of reliability (map centre, q=, ll=/center=, !3d/!4d data blob,
# directions destination).
coordinate_patterns = (
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)


def read_api_key() -> str:
    """Read the OpenAI API key from ~/.config/openai/config.

    Expects an INI file with an ``[openai]`` section containing ``api_key``.
    Raises KeyError if the section or key is missing.
    """
    config_path = os.path.expanduser("~/.config/openai/config")
    parser = configparser.ConfigParser()
    parser.read(config_path)
    return parser["openai"]["api_key"]


def build_prompt(
    url: str,
    source_text: str,
    detected_coordinates: tuple[float, float] | None,
) -> str:
    """Build the LLM prompt with embedded YAML examples.

    :param url: the conference page URL (included verbatim in the prompt)
    :param source_text: plain-text rendering of the conference page
    :param detected_coordinates: (latitude, longitude) scraped from a map
        link on the page, or None if none was found
    :return: complete prompt asking for a JSON object with a "yaml" key
    """
    examples = """
- name: Geomob London
  topic: Maps
  location: London
  country: gb
  start: 2026-01-28 18:00:00+00:00
  end: 2026-01-28 23:00:00+00:00
  url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
  venue: Geovation Hub
  address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
  latitude: 51.5242464
  longitude: -0.0997024
  free: true
  going: true
  hashtag: '#geomobLON'

- name: DebConf 25
  topic: Debian
  location: Plouzané (Brest)
  country: fr
  start: 2025-07-07
  end: 2025-07-20
  url: https://wiki.debian.org/DebConf/25
  going: true
  cfp_url: https://debconf25.debconf.org/talks/new/
  venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire campus de Brest
  latitude: 48.35934
  longitude: -4.569889

- name: Wikimedia Hackathon
  topic: Wikimedia
  location: Istanbul
  country: tr
  start: 2025-05-02
  end: 2025-05-04
  venue: Renaissance Polat Istanbul Hotel
  address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
  latitude: 40.959946
  longitude: 28.838763
  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
  going: true
  free: true
  hackathon: true
  registered: true
"""
    coordinate_note = ""
    if detected_coordinates is not None:
        coordinate_note = (
            "\nDetected venue coordinates from a map link on the page:\n"
            f"latitude: {detected_coordinates[0]}\n"
            f"longitude: {detected_coordinates[1]}\n"
        )

    prompt = f"""
I keep a record of interesting conferences in a YAML file. Here are some examples of the format I use:

{examples}

Now here is a new conference of interest:

Conference URL: {url}

Return the YAML representation for this conference following the same style and keys as the examples. Only include keys if the information is available. Do not invent details.

Important: the `country` field must always be a valid ISO 3166-1 alpha-2 country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom). Do not output full country names.

Important: always include an `end` field. If the event is a single-day event, the `end` can be the same date as `start`, or a same-day datetime if the page provides an end time.

Important: if this is a Geomob event, use an `end` datetime of 22:00 local time on the event date unless the page explicitly provides a different end time.
{coordinate_note}
Wrap your answer in a JSON object with a single key "yaml".

===

{source_text}
"""
    return prompt


def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Send the prompt to OpenAI and return the parsed JSON reply.

    Uses JSON response mode, so the reply is expected to be a JSON object
    (here: ``{"yaml": "..."}``).
    """
    client = openai.OpenAI(api_key=read_api_key())
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        response_format={"type": "json_object"},
    )
    reply = response.choices[0].message.content
    assert isinstance(reply, str)
    return typing.cast(dict[str, str], json.loads(reply))


def fetch_webpage(url: str, timeout: int = 30) -> BeautifulSoup:
    """Fetch webpage HTML and parse it.

    :param url: page to download
    :param timeout: request timeout in seconds (fix: without a timeout a
        stalled server would hang this script forever)
    :raises requests.HTTPError: on non-2xx responses
    """
    response = requests.get(
        url, headers={"User-Agent": user_agent}, timeout=timeout
    )
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")


def webpage_to_text(soup: BeautifulSoup) -> str:
    """Convert parsed HTML into readable text content.

    Works on a copy so the caller's soup (still needed for link scanning)
    is not mutated by the script/style removal.
    """
    soup_copy = BeautifulSoup(str(soup), "lxml")
    for script_or_style in soup_copy(["script", "style"]):
        script_or_style.decompose()
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    return text_maker.handle(str(soup_copy))


def parse_osm_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from an OpenStreetMap URL.

    Handles both ``?mlat=..&mlon=..`` marker parameters and the
    ``#map=zoom/lat/lon`` fragment form. Returns None if neither is present.
    """
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    mlat = query.get("mlat")
    mlon = query.get("mlon")
    if mlat and mlon:
        return float(mlat[0]), float(mlon[0])
    if parsed.fragment.startswith("map="):
        parts = parsed.fragment.split("/")
        if len(parts) >= 3:
            # fragment is map=<zoom>/<lat>/<lon>
            return float(parts[-2]), float(parts[-1])
    return None


def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL or page text."""
    for pattern in coordinate_patterns:
        match = pattern.search(url)
        if match:
            return float(match.group(1)), float(match.group(2))
    return None


def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a (possibly shortened) Google Maps URL and extract lat/lon.

    Follows redirects, then tries the final URL first and falls back to
    scanning the response body for coordinate patterns.
    """
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        # fix: use the shared user_agent constant instead of a stray
        # "lookup.py/1.0" string left over from another script
        headers={"User-Agent": user_agent},
    )
    response.raise_for_status()
    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates
    return extract_google_maps_latlon(response.text)


def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a supported map URL.

    Supports openstreetmap.org and Google Maps (including maps.app.goo.gl
    short links, which require a network round-trip to resolve). Network
    failures during resolution are treated as "no coordinates found".
    """
    lower_url = url.lower()
    if "openstreetmap.org" in lower_url:
        return parse_osm_url(url)
    if "google." in lower_url or "maps.app.goo.gl" in lower_url:
        coordinates = extract_google_maps_latlon(url)
        if coordinates is not None:
            return coordinates
        try:
            return latlon_from_google_maps_url(url)
        except requests.RequestException:
            return None
    return None


def detect_page_coordinates(soup: BeautifulSoup) -> tuple[float, float] | None:
    """Detect venue coordinates from the first usable map link on the page."""
    for link in soup.find_all("a", href=True):
        href = str(link["href"]).strip()
        if not href:
            continue
        coordinates = parse_coordinates_from_url(href)
        if coordinates is not None:
            return coordinates
    return None


def parse_date(date_str: str) -> datetime:
    """Parse an ISO date or datetime into a naive datetime.

    Timezone-aware values are normalised to UTC and then stripped of tzinfo
    so all comparisons in insert_sorted() are between naive datetimes.
    """
    try:
        dt = datetime.fromisoformat(date_str)
    except ValueError:
        # fallback: just take the YYYY-MM-DD part
        dt = datetime.fromisoformat(date_str.split("T")[0])
    if dt.tzinfo is not None:
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
    return dt


def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (assume year-specific).

    Deliberately loose: a URL like .../debconf25 or .../2026-jan is assumed
    to identify one specific edition of the event.
    """
    return any(ch.isdigit() for ch in url)


def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert new_conf into conferences sorted by start date.

    Duplicate handling: if an entry with the same URL exists, skip when the
    URL looks edition-specific (contains a digit) or when the existing entry
    is for the same year; a year-less URL reused for a different year is
    allowed (annual events with a stable landing page).
    """
    new_url = new_conf.get("url")
    new_start = parse_date(str(new_conf["start"]))
    new_year = new_start.year

    if new_url:
        for conf in conferences:
            if conf.get("url") == new_url:
                existing_start = parse_date(str(conf["start"]))
                existing_year = existing_start.year
                if url_has_year_component(new_url):
                    # If URL has a year in it, treat exact URL as unique
                    print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
                    return conferences
                elif existing_year == new_year:
                    # Same URL, same year → definitely duplicate
                    print(
                        f"⚠️ Conference already exists in YAML "
                        f"(url={new_url}, year={existing_year}), skipping."
                    )
                    return conferences
                else:
                    # Same URL reused for different year → allow new entry
                    continue

    # Insert sorted by start date
    for idx, conf in enumerate(conferences):
        existing_start = parse_date(str(conf["start"]))
        if new_start < existing_start:
            conferences.insert(idx, new_conf)
            return conferences
    conferences.append(new_conf)
    return conferences


def validate_country(conf: dict[str, typing.Any]) -> None:
    """Ensure country is a valid ISO 3166-1 alpha-2 code, normalise if possible.

    Mutates conf["country"] in place to a lowercase alpha-2 code.
    Raises ValueError if the value cannot be resolved to a country.
    """
    country = conf.get("country")
    if not country:
        return
    country = country.strip()

    # Already a 2-letter code
    if len(country) == 2:
        if pycountry.countries.get(alpha_2=country.upper()):
            conf["country"] = country.lower()
            return
        else:
            raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")

    # Try lookup by name
    match = pycountry.countries.get(name=country)
    if not match:
        # fuzzy lookup (handles “United States” vs “United States of America”)
        try:
            match = pycountry.countries.search_fuzzy(country)[0]
        except LookupError as exc:
            # fix: chain the original exception (B904) instead of hiding it
            raise ValueError(
                f"❌ Country '{country}' not recognised as ISO 3166-1"
            ) from exc

    conf["country"] = match.alpha_2.lower()


def parse_yaml_datetime(value: typing.Any) -> datetime | None:
    """Convert YAML date/datetime values to a datetime.

    PyYAML may hand back datetime, date, or str depending on the document;
    returns None for anything else.
    """
    if isinstance(value, datetime):
        return value
    if isinstance(value, date):
        return datetime.combine(value, time())
    if isinstance(value, str):
        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return datetime.combine(date.fromisoformat(value.split("T")[0]), time())
    return None


def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Return an end value shaped like the start value when possible.

    Keeps the YAML round-trip tidy: a date start yields a date end, a string
    start yields a string end, etc. ``prefer_datetime`` forces a full
    datetime even when start is date-only (used for Geomob end times).
    """
    if isinstance(start_value, datetime):
        if keep_timezone:
            return new_dt
        return new_dt.replace(tzinfo=None)
    if isinstance(start_value, date):
        if prefer_datetime:
            return new_dt
        return new_dt.date()
    if isinstance(start_value, str):
        if prefer_datetime or " " in start_value or "T" in start_value:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()
    return new_dt


def maybe_extract_explicit_end_time(source_text: str) -> int | None:
    """Extract an explicit 12-hour clock end time for Geomob-style pages.

    Returns the hour (22 or 23) if the page text mentions a known end time,
    otherwise None.
    """
    lowered = source_text.lower()
    if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
        return 22
    if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
        return 23
    return None


def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback.

    Geomob events default to a 22:00 end (overridden by an explicit time on
    the page); anything else without an end gets end == start.
    Mutates new_conf in place.
    """
    start_value = new_conf.get("start")
    if start_value is None:
        return
    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return

    name = str(new_conf.get("name", ""))
    url = str(new_conf.get("url", ""))
    is_geomob = "geomob" in name.lower() or "thegeomob.com" in url.lower()

    if is_geomob:
        end_hour = maybe_extract_explicit_end_time(source_text)
        if end_hour is None:
            end_hour = 22
        geomob_end = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
        new_conf["end"] = same_type_as_start(
            start_value, geomob_end, prefer_datetime=True
        )
        return

    if "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)


def main() -> None:
    """Fetch page, generate YAML via LLM, update conferences.yaml."""
    # fix: exit with a usage message instead of an IndexError traceback
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} CONFERENCE_URL")
    url = sys.argv[1]
    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")

    # Load conferences first (fix: an empty file yields None, not a list)
    with open(yaml_path) as f:
        conferences = yaml.safe_load(f) or []

    # Early exit: if URL contains a year and already exists, skip before
    # spending an API call
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return

    # Otherwise proceed with full workflow
    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]
    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        new_conf = new_conf[0]
    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)
    # Scraped map-link coordinates are more trustworthy than LLM output
    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]
    updated = insert_sorted(conferences, new_conf)
    with open(yaml_path, "w") as f:
        text = yaml.dump(updated, sort_keys=False, allow_unicode=True)
        text = text.replace("\n- name:", "\n\n- name:")  # keep blank lines
        f.write(text.lstrip())


if __name__ == "__main__":
    main()