diff --git a/agenda/add_new_conference.py b/agenda/add_new_conference.py
index 71c12ce..1a6820f 100644
--- a/agenda/add_new_conference.py
+++ b/agenda/add_new_conference.py
@@ -16,7 +16,7 @@ import pycountry
 import requests
 import yaml
 
-from agenda.conference import conference_date_fields
+from agenda.conference import ConferenceSeries, conference_date_fields, load_series
 
 USER_AGENT = "add-new-conference/0.1"
 COORDINATE_PATTERNS = (
@@ -36,58 +36,139 @@ def read_api_key() -> str:
     return parser["openai"]["api_key"]
 
 
-def build_prompt(
-    url: str,
-    source_text: str,
-    detected_coordinates: tuple[float, float] | None,
-) -> str:
-    """Build prompt with embedded YAML examples."""
-    examples = """
+def conference_yaml_format_description() -> str:
+    """Return the conference YAML format description for LLM prompts."""
+    return """
+Use this YAML format for one conference entry.
+
+Required fields:
+- `name`: event name.
+- `topic`: topic/category.
+- `location`: city or location label. Use `TBC` if the page confirms a future
+  event but not a city.
+- Date information in nested `dates`.
+
+Preferred date shape:
+- `dates.status`: one of `exact`, `tentative`, or `approximate`.
+- For `exact`: use when the page confirms specific dates/times. Include
+  `dates.start` and `dates.end` as YAML dates or timezone-aware datetimes.
+- For `tentative`: use when specific dates are guessed or explicitly
+  unconfirmed. Include `dates.start`, `dates.end`, and preferably `dates.label`
+  and `dates.basis`.
+- For `approximate`: use when only a broad date phrase is known. Include
+  `dates.label`, `dates.earliest`, and `dates.latest`. Examples: `March 2027`
+  should become earliest `2027-03-01`, latest `2027-03-31`; `mid-April 2027`
+  should become a sensible bounded range such as `2027-04-11` to `2027-04-20`.
+
+Important date rule:
+- If the source page contains exact dates, output `dates.status: exact` even if
+  the existing agenda entry or conference announcement previously had only
+  approximate dates.
+- Always include an end date for `exact` and `tentative`. For a single-day
+  event, `dates.end` can be the same as `dates.start`.
+- Do not output legacy top-level `start`, `end`, or `date_status`.
+
+Common optional fields:
+- `series`: a key from the known conference series list, when this event belongs
+  to a listed series.
+- `country`: valid ISO 3166-1 alpha-2 country code in lowercase, for example
+  `ca`, `gb`, `us`. Do not output country names.
+- `venue`, `address`, `latitude`, `longitude`, `url`, `cfp_url`, `cfp_end`,
+  `hashtag`, `description`.
+- `free`, `price`, `currency`, `hackathon`, `online`, `attendees`.
+- Do not include `going`, `registered`, `accommodation_booked`,
+  `transport_booked`, or `trip` unless the source explicitly says they apply to
+  my attendance.
+"""
+
+
+def yaml_example_text() -> str:
+    """Return examples of the conference YAML format."""
+    return """
 - name: Geomob London
+  series: geomob-london
   topic: Maps
   location: London
   country: gb
-  start: 2026-01-28 18:00:00+00:00
-  end: 2026-01-28 23:00:00+00:00
+  dates:
+    status: exact
+    start: 2026-01-28 18:00:00+00:00
+    end: 2026-01-28 22:00:00+00:00
   url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
   venue: Geovation Hub
   address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
   latitude: 51.5242464
   longitude: -0.0997024
   free: true
-  going: true
   hashtag: '#geomobLON'
 
 - name: DebConf 25
+  series: debconf
   topic: Debian
-  location: Plouzané (Breast)
+  location: Plouzane
   country: fr
-  start: 2025-07-07
-  end: 2025-07-20
+  dates:
+    status: exact
+    start: 2025-07-07
+    end: 2025-07-20
   url: https://wiki.debian.org/DebConf/25
-  going: true
   cfp_url: https://debconf25.debconf.org/talks/new/
-  venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire
+  venue: Ecole nationale superieure Mines-Telecom Atlantique Bretagne Pays de la Loire
     campus de Brest
   latitude: 48.35934
   longitude: -4.569889
 
 - name: Wikimedia Hackathon
+  series: wikimedia-hackathon
   topic: Wikimedia
-  location: Istanbul
-  country: tr
-  start: 2025-05-02
-  end: 2025-05-04
-  venue: Renaissance Polat Istanbul Hotel
-  address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul
-  latitude: 40.959946
-  longitude: 28.838763
-  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
-  going: true
-  free: true
+  location: Albania
+  country: al
+  dates:
+    status: approximate
+    label: mid-April 2027
+    earliest: 2027-04-11
+    latest: 2027-04-20
+  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2027
   hackathon: true
-  registered: true
+
+- name: PyCascades
+  series: pycascades
+  topic: Python
+  location: Seattle, Washington
+  country: us
+  dates:
+    status: approximate
+    label: March 2027
+    earliest: 2027-03-01
+    latest: 2027-03-31
 """
+
+
+def series_prompt_text(series: dict[str, ConferenceSeries]) -> str:
+    """Return compact known series text for the LLM prompt."""
+    if not series:
+        return "No known conference series loaded."
+
+    lines = ["Known conference series IDs:"]
+    for series_id, item in sorted(series.items()):
+        details = [item["name"]]
+        if topic := item.get("topic"):
+            details.append(f"topic: {topic}")
+        if location := item.get("usual_location"):
+            details.append(f"usual location: {location}")
+        if country := item.get("country"):
+            details.append(f"country: {country}")
+        lines.append(f"- {series_id}: " + "; ".join(details))
+    return "\n".join(lines)
+
+
+def build_prompt(
+    url: str,
+    source_text: str,
+    detected_coordinates: tuple[float, float] | None,
+    series: dict[str, ConferenceSeries] | None = None,
+) -> str:
+    """Build prompt with embedded YAML format details and examples."""
     coordinate_note = ""
     if detected_coordinates is not None:
         coordinate_note = (
@@ -99,28 +180,26 @@ def build_prompt(
     prompt = f"""
 I keep a record of interesting conferences in a YAML file.
 
+Format rules:
+
+{conference_yaml_format_description()}
+
+{series_prompt_text(series or {})}
+
 Here are some examples of the format I use:
 
-{examples}
+{yaml_example_text()}
 
 Now here is a new conference of interest:
 
 Conference URL: {url}
 
-Return the YAML representation for this conference following the
-same style and keys as the examples. Only include keys if the
-information is available. Do not invent details.
+Return the YAML representation for this conference following the same style and
+keys as the examples. Only include keys if the information is available. Do not
+invent details.
 
-Important: the `country` field must always be a valid ISO 3166-1 alpha-2
-country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
-Do not output full country names.
-
-Important: always include an `end` field. If the event is a single-day event,
-the `end` can be the same date as `start`, or a same-day datetime if the page
-provides an end time.
-
-Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
-on the event date unless the page explicitly provides a different end time.
+Important: if this is a Geomob event, use a `dates.end` datetime of 22:00 local
+time on the event date unless the page explicitly provides a different end time.
 {coordinate_note}
 
 Wrap your answer in a JSON object with a single key "yaml".
@@ -261,9 +340,26 @@ def parse_date(date_str: str) -> datetime:
     return dt
 
 
+def data_dir_from_conferences_path(yaml_path: str) -> str:
+    """Return personal-data directory from a conferences.yaml path."""
+    return os.path.dirname(os.path.abspath(yaml_path))
+
+
 def url_has_year_component(url: str) -> bool:
-    """Return True if the URL contains any digit."""
-    return any(ch.isdigit() for ch in url)
+    """Return True if the URL has a year or edition component in its host or path."""
+    parsed = urlparse(url)
+    components = [part for part in parsed.path.split("/") if part]
+    if parsed.netloc:
+        components.extend(part for part in parsed.netloc.split(".") if part)
+
+    for component in components:
+        if re.fullmatch(r"20\d{2}", component):
+            return True
+        if re.search(r"(?:^|[-_/])20\d{2}(?:$|[-_/])", component):
+            return True
+        if re.fullmatch(r"\d{1,2}x", component, flags=re.IGNORECASE):
+            return True
+    return False
 
 
 def insert_sorted(
@@ -273,6 +369,17 @@ def insert_sorted(
     new_url = new_conf.get("url")
     new_start = conference_sort_datetime(new_conf)
     new_year = new_start.year
+    update_idx = find_inexact_existing_conference(conferences, new_conf)
+    if update_idx is not None:
+        existing = conferences.pop(update_idx)
+        merged = {**existing, **new_conf}
+        # Date fields the exact entry did not supply are stale leftovers (legacy
+        # or nested) that would contradict the dates just merged in.
+        for date_key in ("dates", "start", "end", "date_status"):
+            if date_key not in new_conf:
+                merged.pop(date_key, None)
+        print(f"Updating inexact conference entry: {existing.get('name')}")
+        return insert_sorted(conferences, merged)
 
     if new_url:
         for conf in conferences:
@@ -299,6 +406,69 @@ def insert_sorted(
     return conferences
 
 
+def date_ranges_overlap(
+    first: dict[str, typing.Any], second: dict[str, typing.Any]
+) -> bool:
+    """Return True if two conference date ranges overlap."""
+    first_fields = conference_date_fields(first)
+    second_fields = conference_date_fields(second)
+    return typing.cast(date, first_fields["start_date"]) <= typing.cast(
+        date, second_fields["end_date"]
+    ) and typing.cast(date, second_fields["start_date"]) <= typing.cast(
+        date, first_fields["end_date"]
+    )
+
+
+def same_conference_identity(
+    existing: dict[str, typing.Any], new_conf: dict[str, typing.Any]
+) -> bool:
+    """Return True if two entries appear to represent the same conference.
+
+    Entries match on a shared URL (a generic URL must also fall in the same
+    year), or on a shared series or name when their date ranges overlap.
+    """
+    existing_url = existing.get("url")
+    new_url = new_conf.get("url")
+    if existing_url and new_url and existing_url == new_url:
+        if url_has_year_component(new_url):
+            return True
+        # A generic URL is reused for every edition, so it only identifies
+        # this one when both entries sort into the same year.
+        existing_year = conference_sort_datetime(existing).year
+        if existing_year == conference_sort_datetime(new_conf).year:
+            return True
+
+    existing_series = existing.get("series")
+    new_series = new_conf.get("series")
+    if existing_series and new_series and existing_series == new_series:
+        return date_ranges_overlap(existing, new_conf)
+
+    # Entries without a name must not match each other on an empty string.
+    existing_name = str(existing.get("name") or "").casefold()
+    new_name = str(new_conf.get("name") or "").casefold()
+    if not existing_name or existing_name != new_name:
+        return False
+    return date_ranges_overlap(existing, new_conf)
+
+
+def find_inexact_existing_conference(
+    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
+) -> int | None:
+    """Return index of an inexact existing entry that exact new data can update."""
+    new_fields = conference_date_fields(new_conf)
+    if new_fields["date_status"] != "exact":
+        return None
+
+    for idx, existing in enumerate(conferences):
+        existing_fields = conference_date_fields(existing)
+        if existing_fields["date_status"] == "exact":
+            continue
+        if same_conference_identity(existing, new_conf):
+            return idx
+
+    return None
+
+
 def conference_sort_datetime(conf: dict[str, typing.Any]) -> datetime:
     """Return conference sort date as a datetime."""
     sort_date = conference_date_fields(conf)["sort_date"]
@@ -374,6 +544,31 @@ def same_type_as_start(
     return new_dt
 
 
+def normalize_dates_field(conf: dict[str, typing.Any]) -> None:
+    """Move legacy top-level date fields into the nested dates mapping."""
+    raw_dates = conf.get("dates")
+    dates = raw_dates if isinstance(raw_dates, dict) else None
+
+    if dates is None and ("start" in conf or "end" in conf):
+        start = conf.pop("start", None)
+        end = conf.pop("end", start)
+        status = str(conf.pop("date_status", "exact"))
+        conf["dates"] = {"status": status, "start": start, "end": end}
+        return
+
+    if dates is not None:
+        if "start" in conf and "start" not in dates:
+            dates["start"] = conf["start"]
+        if "end" in conf and "end" not in dates:
+            dates["end"] = conf["end"]
+        if "date_status" in conf and "status" not in dates:
+            dates["status"] = conf["date_status"]
+
+    conf.pop("start", None)
+    conf.pop("end", None)
+    conf.pop("date_status", None)
+
+
 def maybe_extract_explicit_end_time(source_text: str) -> int | None:
     """Extract an explicit 12-hour clock end time for Geomob-style pages."""
     lowered = source_text.lower()
@@ -435,6 +630,14 @@ def load_conferences(yaml_path: str) -> list[dict[str, typing.Any]]:
     return typing.cast(list[dict[str, typing.Any]], loaded)
 
 
+def load_conference_series_for_path(yaml_path: str) -> dict[str, ConferenceSeries]:
+    """Load conference series next to the target conferences YAML file."""
+    return typing.cast(
+        dict[str, ConferenceSeries],
+        load_series(data_dir_from_conferences_path(yaml_path)),
+    )
+
+
 def dump_conferences(yaml_path: str, conferences: list[dict[str, typing.Any]]) -> None:
     """Write conference YAML."""
     with open(yaml_path, "w") as file:
@@ -450,6 +653,9 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
     if url_has_year_component(url):
         for conf in conferences:
             if conf.get("url") == url:
+                fields = conference_date_fields(conf)
+                if fields["date_status"] != "exact":
+                    continue
                 print(
                     "⚠️ Conference already exists in YAML "
                     + f"(url={url}), skipping before API call."
@@ -459,7 +665,8 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
     soup = fetch_webpage(url)
     source_text = webpage_to_text(soup)
     detected_coordinates = detect_page_coordinates(soup)
-    prompt = build_prompt(url, source_text, detected_coordinates)
+    series = load_conference_series_for_path(yaml_path)
+    prompt = build_prompt(url, source_text, detected_coordinates, series)
     new_yaml_text = get_from_open_ai(prompt)["yaml"]
 
     new_conf = yaml.safe_load(new_yaml_text)
@@ -468,7 +675,9 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
     assert isinstance(new_conf, dict)
 
     validate_country(new_conf)
+    normalize_dates_field(new_conf)
     normalise_end_field(new_conf, source_text)
+    normalize_dates_field(new_conf)
 
     if detected_coordinates is not None:
         new_conf["latitude"] = detected_coordinates[0]
diff --git a/tests/test_add_new_conference.py b/tests/test_add_new_conference.py
index 21338d9..34f80d0 100644
--- a/tests/test_add_new_conference.py
+++ b/tests/test_add_new_conference.py
@@ -26,6 +26,19 @@ def test_extract_google_maps_latlon_at_pattern() -> None:
     assert result == (51.5242464, -0.0997024)
 
 
+def test_url_has_year_component() -> None:
+    """Only actual year or edition components should count as year-specific."""
+    cases = [
+        ("https://www.foss4gna.org/", False),
+        ("https://foss4g.asia/2026/", True),
+        ("https://2027.fossy.ca/", True),
+        ("https://www.socallinuxexpo.org/scale/24x/", True),
+        ("https://2026.stateofthebrowser.com/", True),
+    ]
+    for url, expected in cases:
+        assert add_new_conference.url_has_year_component(url) is expected
+
+
 def test_insert_sorted_allows_same_url_different_year_without_year_component() -> None:
     """The same non-year-specific URL can be reused for a different year."""
     conferences: list[dict[str, typing.Any]] = [
@@ -74,6 +87,111 @@ def test_insert_sorted_supports_nested_dates() -> None:
     assert [conf["name"] for conf in updated] == ["FOSDEM", "PyCascades"]
 
 
+def test_insert_sorted_updates_inexact_existing_entry() -> None:
+    """Exact dates should replace an existing inexact series entry."""
+    conferences: list[dict[str, typing.Any]] = [
+        {
+            "name": "PyCascades",
+            "series": "pycascades",
+            "topic": "Python",
+            "location": "Seattle, Washington",
+            "dates": {
+                "status": "approximate",
+                "label": "March 2027",
+                "earliest": date(2027, 3, 1),
+                "latest": date(2027, 3, 31),
+            },
+            "url": "https://2027.pycascades.com/",
+        }
+    ]
+    new_conf: dict[str, typing.Any] = {
+        "name": "PyCascades",
+        "series": "pycascades",
+        "topic": "Python",
+        "location": "Seattle, Washington",
+        "dates": {
+            "status": "exact",
+            "start": date(2027, 3, 12),
+            "end": date(2027, 3, 14),
+        },
+        "url": "https://2027.pycascades.com/",
+        "venue": "Example Hall",
+    }
+
+    updated = add_new_conference.insert_sorted(conferences, new_conf)
+
+    assert len(updated) == 1
+    assert updated[0]["dates"]["status"] == "exact"
+    assert updated[0]["dates"]["start"] == date(2027, 3, 12)
+    assert updated[0]["venue"] == "Example Hall"
+
+
+def test_same_conference_identity_generic_url_needs_same_year() -> None:
+    """A reused generic URL must not match an entry for a different year."""
+    existing: dict[str, typing.Any] = {
+        "name": "FOSS4G North America",
+        "dates": {
+            "status": "approximate",
+            "label": "October 2027",
+            "earliest": date(2027, 10, 1),
+            "latest": date(2027, 10, 31),
+        },
+        "url": "https://www.foss4gna.org/",
+    }
+    new_conf: dict[str, typing.Any] = {
+        "name": "FOSS4G North America",
+        "dates": {
+            "status": "exact",
+            "start": date(2026, 10, 26),
+            "end": date(2026, 10, 29),
+        },
+        "url": "https://www.foss4gna.org/",
+    }
+
+    assert not add_new_conference.same_conference_identity(existing, new_conf)
+
+
+def test_normalize_dates_field_moves_legacy_dates() -> None:
+    """Legacy start/end model output should be converted before writing YAML."""
+    conf: dict[str, typing.Any] = {
+        "name": "PyCon",
+        "start": date(2026, 4, 10),
+        "end": date(2026, 4, 12),
+    }
+
+    add_new_conference.normalize_dates_field(conf)
+
+    assert "start" not in conf
+    assert "end" not in conf
+    assert conf["dates"] == {
+        "status": "exact",
+        "start": date(2026, 4, 10),
+        "end": date(2026, 4, 12),
+    }
+
+
+def test_build_prompt_includes_nested_dates_and_series() -> None:
+    """The prompt should describe nested dates and known series IDs."""
+    prompt = add_new_conference.build_prompt(
+        "https://example.com",
+        "Conference details",
+        None,
+        {
+            "pycascades": {
+                "name": "PyCascades",
+                "topic": "Python",
+                "usual_location": "Seattle, Washington",
+                "country": "us",
+            }
+        },
+    )
+
+    assert "Do not output legacy top-level `start`, `end`, or `date_status`" in prompt
+    assert "dates.status" in prompt
+    assert "- pycascades: PyCascades" in prompt
+    assert "March 2027" in prompt
+
+
 def test_validate_country_normalises_name() -> None:
     """Country names should be normalised to alpha-2 codes."""
     conf: dict[str, typing.Any] = {"country": "United Kingdom"}
@@ -197,6 +315,76 @@ def test_add_new_conference_updates_yaml(
     assert len(written) == 2
     assert written[1]["name"] == "NewConf"
     assert written[1]["country"] == "us"
-    assert written[1]["end"] == date(2026, 5, 3)
+    assert written[1]["dates"] == {
+        "status": "exact",
+        "start": date(2026, 5, 3),
+        "end": date(2026, 5, 3),
+    }
     assert written[1]["latitude"] == 40.0
     assert written[1]["longitude"] == -74.0
+
+
+def test_add_new_conference_reuses_generic_url_for_new_year(
+    tmp_path: typing.Any, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Generic URLs with digits in the domain should not be skipped early."""
+    yaml_path = tmp_path / "conferences.yaml"
+    yaml_path.write_text(
+        yaml.dump(
+            [
+                {
+                    "name": "FOSS4G North America",
+                    "series": "foss4g-north-america",
+                    "dates": {
+                        "status": "exact",
+                        "start": date(2025, 11, 3),
+                        "end": date(2025, 11, 5),
+                    },
+                    "url": "https://www.foss4gna.org/",
+                }
+            ],
+            sort_keys=False,
+        )
+    )
+
+    root = lxml.html.fromstring("Conference details")
+    monkeypatch.setattr(add_new_conference, "fetch_webpage", lambda url: root)
+    monkeypatch.setattr(
+        add_new_conference,
+        "webpage_to_text",
+        lambda parsed: "FOSS4G North America 2026",
+    )
+    monkeypatch.setattr(
+        add_new_conference, "detect_page_coordinates", lambda parsed: None
+    )
+    monkeypatch.setattr(
+        add_new_conference,
+        "get_from_open_ai",
+        lambda prompt: {
+            "yaml": yaml.dump(
+                {
+                    "name": "FOSS4G North America",
+                    "series": "foss4g-north-america",
+                    "topic": "Geospatial",
+                    "location": "St. Louis, Missouri",
+                    "country": "us",
+                    "dates": {
+                        "status": "exact",
+                        "start": date(2026, 10, 26),
+                        "end": date(2026, 10, 29),
+                    },
+                    "url": "https://www.foss4gna.org/",
+                },
+                sort_keys=False,
+            )
+        },
+    )
+
+    added = add_new_conference.add_new_conference(
+        "https://www.foss4gna.org/", str(yaml_path)
+    )
+
+    assert added is True
+    written = yaml.safe_load(yaml_path.read_text())
+    assert len(written) == 2
+    assert [conf["dates"]["start"].year for conf in written] == [2025, 2026]