"""Helpers for adding conferences to the YAML data file."""

import configparser
import json
import os
import re
import sys
import typing
from datetime import date, datetime, time, timezone
from urllib.parse import parse_qs, urlparse

import html2text
import lxml.html  # type: ignore[import-untyped]
import openai
import pycountry
import requests
import yaml

USER_AGENT = "add-new-conference/0.1"

# Seconds before an outbound HTTP request is abandoned.  `requests` has no
# default timeout, so omitting one can hang the whole script forever.
HTTP_TIMEOUT = 30

# URL shapes Google Maps uses to embed coordinates: /@lat,lon, ?q=lat,lon,
# ?ll= / ?center=, the !3d…!4d… data blob, and ?destination= (directions).
COORDINATE_PATTERNS = (
    re.compile(r"@(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&](?:ll|center)=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
    re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)"),
    re.compile(r"[?&]destination=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)"),
)


def read_api_key() -> str:
    """Read the OpenAI API key from ~/.config/openai/config.

    Raises KeyError if the [openai] section or api_key option is missing.
    """
    config_path = os.path.expanduser("~/.config/openai/config")
    parser = configparser.ConfigParser()
    parser.read(config_path)
    return parser["openai"]["api_key"]


def build_prompt(
    url: str,
    source_text: str,
    detected_coordinates: tuple[float, float] | None,
) -> str:
    """Build the LLM prompt with embedded YAML examples.

    Args:
        url: The conference page URL being imported.
        source_text: Readable text extracted from the conference page.
        detected_coordinates: Optional (latitude, longitude) found on the
            page, passed to the model as a hint.

    Returns:
        The complete prompt string asking for a JSON-wrapped YAML answer.
    """
    examples = """
- name: Geomob London
  topic: Maps
  location: London
  country: gb
  start: 2026-01-28 18:00:00+00:00
  end: 2026-01-28 23:00:00+00:00
  url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
  venue: Geovation Hub
  address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
  latitude: 51.5242464
  longitude: -0.0997024
  free: true
  going: true
  hashtag: '#geomobLON'

- name: DebConf 25
  topic: Debian
  location: Plouzané (Brest)
  country: fr
  start: 2025-07-07
  end: 2025-07-20
  url: https://wiki.debian.org/DebConf/25
  going: true
  cfp_url: https://debconf25.debconf.org/talks/new/
  venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire - campus de Brest
  latitude: 48.35934
  longitude: -4.569889

- name: Wikimedia Hackathon
  topic: Wikimedia
  location: Istanbul
  country: tr
  start: 2025-05-02
  end: 2025-05-04
  venue: Renaissance Polat Istanbul Hotel
  address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
  latitude: 40.959946
  longitude: 28.838763
  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
  going: true
  free: true
  hackathon: true
  registered: true
"""
    coordinate_note = ""
    if detected_coordinates is not None:
        coordinate_note = (
            "\nDetected venue coordinates from a map link on the page:\n"
            f"latitude: {detected_coordinates[0]}\n"
            f"longitude: {detected_coordinates[1]}\n"
        )

    prompt = f"""
I keep a record of interesting conferences in a YAML file.

Here are some examples of the format I use:

{examples}

Now here is a new conference of interest:

Conference URL: {url}

Return the YAML representation for this conference following the
same style and keys as the examples. Only include keys if the
information is available. Do not invent details.

Important: the `country` field must always be a valid ISO 3166-1 alpha-2
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
Do not output full country names.

Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.

Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
{coordinate_note}

Wrap your answer in a JSON object with a single key "yaml".

===
{source_text}
"""
    return prompt


def get_from_open_ai(prompt: str, model: str = "gpt-5.4") -> dict[str, str]:
    """Pass prompt to OpenAI and return the parsed JSON reply."""
    client = openai.OpenAI(api_key=read_api_key())

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        # JSON mode: the model is constrained to emit a valid JSON object.
        response_format={"type": "json_object"},
    )

    reply = response.choices[0].message.content
    assert isinstance(reply, str)
    return typing.cast(dict[str, str], json.loads(reply))


def fetch_webpage(url: str, timeout: int = HTTP_TIMEOUT) -> lxml.html.HtmlElement:
    """Fetch webpage HTML and parse it.

    A timeout is always supplied: `requests` has no default and would
    otherwise block indefinitely on an unresponsive host.
    """
    response = requests.get(
        url, headers={"User-Agent": USER_AGENT}, timeout=timeout
    )
    response.raise_for_status()
    return lxml.html.fromstring(response.content)


def webpage_to_text(root: lxml.html.HtmlElement) -> str:
    """Convert parsed HTML into readable text content."""
    # Work on a serialised copy so the caller's tree keeps its nodes.
    root_copy = lxml.html.fromstring(lxml.html.tostring(root))

    # Scripts and stylesheets carry no readable content.
    for script_or_style in root_copy.xpath("//script|//style"):
        script_or_style.drop_tree()

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    return text_maker.handle(lxml.html.tostring(root_copy, encoding="unicode"))


def parse_osm_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from an OpenStreetMap URL.

    Supports ?mlat=…&mlon=… marker links and #map=zoom/lat/lon fragments.
    Returns None for unrecognised or malformed URLs.
    """
    parsed = urlparse(url)
    query = parse_qs(parsed.query)

    mlat = query.get("mlat")
    mlon = query.get("mlon")
    try:
        if mlat and mlon:
            return float(mlat[0]), float(mlon[0])

        if parsed.fragment.startswith("map="):
            parts = parsed.fragment.split("/")
            if len(parts) >= 3:
                return float(parts[-2]), float(parts[-1])
    except ValueError:
        # Malformed coordinates (e.g. "#map=foo/bar/baz"): treat as absent
        # rather than crashing the import.
        return None

    return None


def extract_google_maps_latlon(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a Google Maps URL, or None."""
    for pattern in COORDINATE_PATTERNS:
        match = pattern.search(url)
        if match:
            return float(match.group(1)), float(match.group(2))

    return None


def latlon_from_google_maps_url(
    url: str, timeout: int = 10
) -> tuple[float, float] | None:
    """Resolve a (possibly shortened) Google Maps URL and extract lat/lon.

    Follows redirects so maps.app.goo.gl short links expand to a URL that
    embeds coordinates; falls back to scanning the response body.
    """
    response = requests.get(
        url,
        allow_redirects=True,
        timeout=timeout,
        # Use the module-wide User-Agent for consistency with fetch_webpage.
        headers={"User-Agent": USER_AGENT},
    )
    response.raise_for_status()

    coordinates = extract_google_maps_latlon(response.url)
    if coordinates is not None:
        return coordinates

    return extract_google_maps_latlon(response.text)


def parse_coordinates_from_url(url: str) -> tuple[float, float] | None:
    """Extract latitude/longitude from a supported map URL."""
    lower_url = url.lower()

    if "openstreetmap.org" in lower_url:
        return parse_osm_url(url)

    if "google." in lower_url or "maps.app.goo.gl" in lower_url:
        # Try the cheap no-network extraction first.
        coordinates = extract_google_maps_latlon(url)
        if coordinates is not None:
            return coordinates

        try:
            return latlon_from_google_maps_url(url)
        except requests.RequestException:
            # Network failure while resolving: coordinates simply unknown.
            return None

    return None


def detect_page_coordinates(root: lxml.html.HtmlElement) -> tuple[float, float] | None:
    """Detect venue coordinates from the first Google Maps or OSM link."""
    for link in root.xpath("//a[@href]"):
        href = str(link.get("href", "")).strip()
        if not href:
            continue

        coordinates = parse_coordinates_from_url(href)
        if coordinates is not None:
            return coordinates

    return None


def parse_date(date_str: str) -> datetime:
    """Parse ISO date or datetime into a naive datetime (UTC if tz-aware)."""
    try:
        dt = datetime.fromisoformat(date_str)
    except ValueError:
        # Fall back to just the YYYY-MM-DD part.
        dt = datetime.fromisoformat(date_str.split("T")[0])

    if dt.tzinfo is not None:
        # Normalise tz-aware values to UTC then strip tzinfo so naive and
        # aware values can be compared when sorting.
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)

    return dt


def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains any digit (heuristic: year-specific)."""
    return any(ch.isdigit() for ch in url)


def insert_sorted(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> list[dict[str, typing.Any]]:
    """Insert a conference sorted by start date and skip duplicate URLs.

    A URL containing digits is treated as year-specific, so an exact match
    is a duplicate.  A digit-free URL may be reused across years; only a
    same-year match is a duplicate.  Mutates and returns `conferences`.
    """
    new_url = new_conf.get("url")
    new_start = parse_date(str(new_conf["start"]))
    new_year = new_start.year

    if new_url:
        for conf in conferences:
            if conf.get("url") == new_url:
                existing_start = parse_date(str(conf["start"]))
                existing_year = existing_start.year

                if url_has_year_component(new_url):
                    print(f"⚠️ Conference with URL {new_url} already exists, skipping.")
                    return conferences
                if existing_year == new_year:
                    print(
                        f"⚠️ Conference already exists in YAML "
                        f"(url={new_url}, year={existing_year}), skipping."
                    )
                    return conferences
                # Same URL reused for a different year: allow the new entry.

    for idx, conf in enumerate(conferences):
        existing_start = parse_date(str(conf["start"]))
        if new_start < existing_start:
            conferences.insert(idx, new_conf)
            return conferences
    conferences.append(new_conf)
    return conferences


def validate_country(conf: dict[str, typing.Any]) -> None:
    """Ensure country is a valid ISO 3166-1 alpha-2 code, normalise if possible.

    Mutates `conf` in place.  Raises ValueError when the value is neither a
    valid alpha-2 code nor a recognisable country name.
    """
    country = conf.get("country")
    if not country:
        return

    country = country.strip()
    if len(country) == 2:
        if pycountry.countries.get(alpha_2=country.upper()):
            conf["country"] = country.lower()
            return
        raise ValueError(f"❌ Invalid ISO 3166-1 code '{country}'")

    # Full name: exact lookup first, then fuzzy (handles e.g. "United
    # States" vs "United States of America").
    match = pycountry.countries.get(name=country)
    if not match:
        try:
            match = pycountry.countries.search_fuzzy(country)[0]
        except LookupError as exc:
            raise ValueError(
                f"❌ Country '{country}' not recognised as ISO 3166-1"
            ) from exc

    conf["country"] = match.alpha_2.lower()


def parse_yaml_datetime(value: typing.Any) -> datetime | None:
    """Convert YAML date/datetime values to a datetime, or None if unparseable."""
    if isinstance(value, datetime):
        return value

    if isinstance(value, date):
        # Promote a bare date to midnight.
        return datetime.combine(value, time())

    if isinstance(value, str):
        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return datetime.combine(date.fromisoformat(value.split("T")[0]), time())

    return None
def same_type_as_start(
    start_value: typing.Any,
    new_dt: datetime,
    keep_timezone: bool = True,
    prefer_datetime: bool = False,
) -> typing.Any:
    """Return an end value shaped like the start value when possible.

    Keeps `end` in the same representation (datetime, date, or string) as
    `start` so the YAML round-trips consistently.
    """
    if isinstance(start_value, datetime):
        if keep_timezone:
            return new_dt
        return new_dt.replace(tzinfo=None)

    if isinstance(start_value, date):
        if prefer_datetime:
            return new_dt
        return new_dt.date()

    if isinstance(start_value, str):
        # A string start containing a time component gets a datetime string.
        if prefer_datetime or " " in start_value or "T" in start_value:
            return new_dt.isoformat(sep=" ")
        return new_dt.date().isoformat()

    return new_dt


def maybe_extract_explicit_end_time(source_text: str) -> int | None:
    """Extract an explicit 12-hour clock end time for Geomob-style pages.

    Returns the 24-hour end hour (22 or 23) when the page text mentions one,
    otherwise None.
    """
    lowered = source_text.lower()

    if "10pm" in lowered or "10 pm" in lowered or "22:00" in lowered:
        return 22

    if "11pm" in lowered or "11 pm" in lowered or "23:00" in lowered:
        return 23

    return None


def normalise_end_field(new_conf: dict[str, typing.Any], source_text: str) -> None:
    """Ensure an end value exists, with a Geomob-specific fallback.

    Geomob events default to a 22:00 end (or an explicit time found in the
    page text); other events default `end` to `start`.  Mutates `new_conf`.
    """
    start_value = new_conf.get("start")
    if start_value is None:
        return

    start_dt = parse_yaml_datetime(start_value)
    if start_dt is None:
        return

    name = str(new_conf.get("name", ""))
    url = str(new_conf.get("url", ""))
    is_geomob = "geomob" in name.lower() or "thegeomob.com" in url.lower()

    if is_geomob:
        end_hour = maybe_extract_explicit_end_time(source_text)
        if end_hour is None:
            end_hour = 22

        geomob_end = start_dt.replace(hour=end_hour, minute=0, second=0, microsecond=0)
        new_conf["end"] = same_type_as_start(
            start_value, geomob_end, prefer_datetime=True
        )
        return

    if "end" not in new_conf:
        new_conf["end"] = same_type_as_start(start_value, start_dt)


def load_conferences(yaml_path: str) -> list[dict[str, typing.Any]]:
    """Load conference YAML.

    An empty file parses to None; treat that as an empty list rather than
    failing the type check.
    """
    with open(yaml_path) as file:
        loaded = yaml.safe_load(file)
    if loaded is None:
        return []
    assert isinstance(loaded, list)
    return typing.cast(list[dict[str, typing.Any]], loaded)


def dump_conferences(yaml_path: str, conferences: list[dict[str, typing.Any]]) -> None:
    """Write conference YAML with a blank line between entries."""
    with open(yaml_path, "w") as file:
        text = yaml.dump(conferences, sort_keys=False, allow_unicode=True)
        # Keep entries visually separated for hand editing.
        text = text.replace("\n- name:", "\n\n- name:")
        file.write(text.lstrip())


def add_new_conference(url: str, yaml_path: str) -> bool:
    """Fetch, generate and insert a conference into the YAML file.

    Returns True when a conference was added, False when it was skipped as
    a known duplicate before calling the API.
    """
    conferences = load_conferences(yaml_path)

    # Early exit: a year-specific URL that already exists needs no API call.
    if url_has_year_component(url):
        for conf in conferences:
            if conf.get("url") == url:
                print(
                    "⚠️ Conference already exists in YAML "
                    + f"(url={url}), skipping before API call."
                )
                return False

    soup = fetch_webpage(url)
    source_text = webpage_to_text(soup)
    detected_coordinates = detect_page_coordinates(soup)
    prompt = build_prompt(url, source_text, detected_coordinates)
    new_yaml_text = get_from_open_ai(prompt)["yaml"]

    new_conf = yaml.safe_load(new_yaml_text)
    if isinstance(new_conf, list):
        # The model sometimes wraps the single entry in a list.
        new_conf = new_conf[0]
    assert isinstance(new_conf, dict)

    validate_country(new_conf)
    normalise_end_field(new_conf, source_text)

    # Coordinates scraped from the page are more reliable than the model's.
    if detected_coordinates is not None:
        new_conf["latitude"] = detected_coordinates[0]
        new_conf["longitude"] = detected_coordinates[1]

    updated = insert_sorted(conferences, new_conf)
    dump_conferences(yaml_path, updated)
    return True


def main(argv: list[str] | None = None) -> int:
    """CLI entrypoint: add-new-conference URL."""
    args = argv if argv is not None else sys.argv[1:]
    if not args:
        raise SystemExit("Usage: add-new-conference URL")

    yaml_path = os.path.expanduser("~/src/personal-data/conferences.yaml")
    add_new_conference(args[0], yaml_path)
    return 0


# ---------------------------------------------------------------------------
# NOTE(review): the patch also rewrites scripts/add-new-conference into a
# thin executable shim (a SEPARATE file, reproduced here for reference):
#
#   #!/usr/bin/python3
#   import os
#   import sys
#
#   SCRIPT_PATH = os.path.realpath(__file__)
#   SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
#   REPO_ROOT = os.path.dirname(SCRIPT_DIR)
#   if REPO_ROOT not in sys.path:
#       sys.path.insert(0, REPO_ROOT)
#   from agenda.add_new_conference import main
#
#   if __name__ == "__main__":
#       raise SystemExit(main())
#
# The deleted lines of the old monolithic script are duplicates of the
# module above and are intentionally dropped.
# ---------------------------------------------------------------------------
"""Tests for agenda.add_new_conference."""

import typing
from datetime import date, datetime
from pathlib import Path

import lxml.html  # type: ignore[import-untyped]
import pytest
import yaml

from agenda import add_new_conference


def test_parse_osm_url_mlat_mlon() -> None:
    """OpenStreetMap URLs with mlat/mlon should parse."""
    result = add_new_conference.parse_osm_url(
        "https://www.openstreetmap.org/?mlat=51.5&mlon=-0.12"
    )
    assert result == (51.5, -0.12)


def test_extract_google_maps_latlon_at_pattern() -> None:
    """Google Maps @lat,lon URLs should parse."""
    result = add_new_conference.extract_google_maps_latlon(
        "https://www.google.com/maps/place/Venue/@51.5242464,-0.0997024,17z/"
    )
    assert result == (51.5242464, -0.0997024)


def test_insert_sorted_allows_same_url_different_year_without_year_component() -> None:
    """The same non-year-specific URL can be reused for a different year."""
    conferences: list[dict[str, typing.Any]] = [
        {
            "name": "OldConf",
            "start": date(2025, 6, 1),
            "url": "https://example.com/conf",
        }
    ]
    new_conf: dict[str, typing.Any] = {
        "name": "NewConf",
        "start": date(2026, 6, 1),
        "url": "https://example.com/conf",
    }

    updated = add_new_conference.insert_sorted(conferences, new_conf)

    assert len(updated) == 2
    assert updated[1]["name"] == "NewConf"


def test_validate_country_normalises_name() -> None:
    """Country names should be normalised to alpha-2 codes."""
    conf: dict[str, typing.Any] = {"country": "United Kingdom"}

    add_new_conference.validate_country(conf)

    assert conf["country"] == "gb"


def test_normalise_end_field_defaults_single_day_date() -> None:
    """Non-Geomob conferences should default end to the start date."""
    conf: dict[str, typing.Any] = {
        "name": "PyCon",
        "start": date(2026, 4, 10),
    }

    add_new_conference.normalise_end_field(conf, "plain text")

    assert conf["end"] == date(2026, 4, 10)


def test_normalise_end_field_sets_geomob_end_time() -> None:
    """Geomob conferences should default to a 22:00 end time."""
    conf: dict[str, typing.Any] = {
        "name": "Geomob London",
        "start": date(2026, 1, 28),
        "url": "https://thegeomob.com/post/jan-28th-2026-geomoblon-details",
    }

    add_new_conference.normalise_end_field(conf, "see you there")

    assert conf["end"] == datetime(2026, 1, 28, 22, 0)


def test_detect_page_coordinates_uses_first_supported_link() -> None:
    """Page coordinate detection should inspect anchor hrefs."""
    # NOTE(review): fixture markup reconstructed from the asserted result —
    # a non-map link followed by an OSM marker link. TODO confirm against
    # the original fixtures.
    root = lxml.html.fromstring(
        "<html><body>"
        '<a href="https://example.com/about">Example</a>'
        '<a href="https://www.openstreetmap.org/?mlat=51.5&mlon=-0.12">Map</a>'
        "</body></html>"
    )

    assert add_new_conference.detect_page_coordinates(root) == (51.5, -0.12)


def test_add_new_conference_updates_yaml(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The end-to-end import flow should append a generated conference."""
    yaml_path = tmp_path / "conferences.yaml"
    yaml_path.write_text(
        yaml.dump(
            [
                {
                    "name": "ExistingConf",
                    "start": date(2026, 4, 1),
                    "end": date(2026, 4, 2),
                    "url": "https://example.com/existing",
                }
            ],
            sort_keys=False,
        )
    )

    # NOTE(review): Google Maps link reconstructed to match the asserted
    # coordinates (40.0, -74.0). TODO confirm against the original fixture.
    root = lxml.html.fromstring(
        "<html><body>"
        '<a href="https://www.google.com/maps/place/Venue/@40.0,-74.0,17z/">Map</a>'
        "</body></html>"
    )

    monkeypatch.setattr(add_new_conference, "fetch_webpage", lambda url: root)
    monkeypatch.setattr(
        add_new_conference,
        "webpage_to_text",
        lambda parsed: "Conference details",
    )
    monkeypatch.setattr(
        add_new_conference,
        "get_from_open_ai",
        lambda prompt: {
            "yaml": yaml.dump(
                {
                    "name": "NewConf",
                    "topic": "Tech",
                    "location": "New York",
                    "country": "United States",
                    "start": date(2026, 5, 3),
                    "url": "https://example.com/newconf",
                },
                sort_keys=False,
            )
        },
    )

    added = add_new_conference.add_new_conference(
        "https://example.com/newconf", str(yaml_path)
    )

    assert added is True
    written = yaml.safe_load(yaml_path.read_text())
    assert len(written) == 2
    assert written[1]["name"] == "NewConf"
    assert written[1]["country"] == "us"
    assert written[1]["end"] == date(2026, 5, 3)
    assert written[1]["latitude"] == 40.0
    assert written[1]["longitude"] == -74.0