Improve conference importer date handling

This commit is contained in:
Edward Betts 2026-06-22 12:41:33 +01:00
parent 56eea3f7a3
commit dbce9e5358
2 changed files with 401 additions and 46 deletions

View file

@ -16,7 +16,7 @@ import pycountry
import requests import requests
import yaml import yaml
from agenda.conference import conference_date_fields from agenda.conference import ConferenceSeries, conference_date_fields, load_series
USER_AGENT = "add-new-conference/0.1" USER_AGENT = "add-new-conference/0.1"
COORDINATE_PATTERNS = ( COORDINATE_PATTERNS = (
@ -36,58 +36,139 @@ def read_api_key() -> str:
return parser["openai"]["api_key"] return parser["openai"]["api_key"]
def build_prompt( def conference_yaml_format_description() -> str:
url: str, """Return the conference YAML format description for LLM prompts."""
source_text: str, return """
detected_coordinates: tuple[float, float] | None, Use this YAML format for one conference entry.
) -> str:
"""Build prompt with embedded YAML examples.""" Required fields:
examples = """ - `name`: event name.
- `topic`: topic/category.
- `location`: city or location label. Use `TBC` if the page confirms a future
event but not a city.
- Date information in nested `dates`.
Preferred date shape:
- `dates.status`: one of `exact`, `tentative`, or `approximate`.
- For `exact`: use when the page confirms specific dates/times. Include
`dates.start` and `dates.end` as YAML dates or timezone-aware datetimes.
- For `tentative`: use when specific dates are guessed or explicitly
unconfirmed. Include `dates.start`, `dates.end`, and preferably `dates.label`
and `dates.basis`.
- For `approximate`: use when only a broad date phrase is known. Include
`dates.label`, `dates.earliest`, and `dates.latest`. Examples: `March 2027`
should become earliest `2027-03-01`, latest `2027-03-31`; `mid-April 2027`
should become a sensible bounded range such as `2027-04-11` to `2027-04-20`.
Important date rule:
- If the source page contains exact dates, output `dates.status: exact` even if
the existing agenda entry or conference announcement previously had only
approximate dates.
- Always include an end date for `exact` and `tentative`. For a single-day
event, `dates.end` can be the same as `dates.start`.
- Do not output legacy top-level `start`, `end`, or `date_status`.
Common optional fields:
- `series`: a key from the known conference series list, when this event belongs
to a listed series.
- `country`: valid ISO 3166-1 alpha-2 country code in lowercase, for example
`ca`, `gb`, `us`. Do not output country names.
- `venue`, `address`, `latitude`, `longitude`, `url`, `cfp_url`, `cfp_end`,
`hashtag`, `description`.
- `free`, `price`, `currency`, `hackathon`, `online`, `attendees`.
- Do not include `going`, `registered`, `accommodation_booked`,
`transport_booked`, or `trip` unless the source explicitly says they apply to
my attendance.
"""
def yaml_example_text() -> str:
"""Return examples of the conference YAML format."""
return """
- name: Geomob London - name: Geomob London
series: geomob-london
topic: Maps topic: Maps
location: London location: London
country: gb country: gb
start: 2026-01-28 18:00:00+00:00 dates:
end: 2026-01-28 23:00:00+00:00 status: exact
start: 2026-01-28 18:00:00+00:00
end: 2026-01-28 22:00:00+00:00
url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
venue: Geovation Hub venue: Geovation Hub
address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
latitude: 51.5242464 latitude: 51.5242464
longitude: -0.0997024 longitude: -0.0997024
free: true free: true
going: true
hashtag: '#geomobLON' hashtag: '#geomobLON'
- name: DebConf 25 - name: DebConf 25
series: debconf
topic: Debian topic: Debian
location: Plouzané (Breast) location: Plouzane
country: fr country: fr
start: 2025-07-07 dates:
end: 2025-07-20 status: exact
start: 2025-07-07
end: 2025-07-20
url: https://wiki.debian.org/DebConf/25 url: https://wiki.debian.org/DebConf/25
going: true
cfp_url: https://debconf25.debconf.org/talks/new/ cfp_url: https://debconf25.debconf.org/talks/new/
venue: École nationale supérieure Mines-Télécom Atlantique Bretagne Pays de la Loire venue: Ecole nationale superieure Mines-Telecom Atlantique Bretagne Pays de la Loire
campus de Brest campus de Brest
latitude: 48.35934 latitude: 48.35934
longitude: -4.569889 longitude: -4.569889
- name: Wikimedia Hackathon - name: Wikimedia Hackathon
series: wikimedia-hackathon
topic: Wikimedia topic: Wikimedia
location: Istanbul location: Albania
country: tr country: al
start: 2025-05-02 dates:
end: 2025-05-04 status: approximate
venue: Renaissance Polat Istanbul Hotel label: mid-April 2027
address: Yeşilyurt, Sahil Yolu Cd. No:2, 34149 Bakırköy/İstanbul earliest: 2027-04-11
latitude: 40.959946 latest: 2027-04-20
longitude: 28.838763 url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2027
url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2025
going: true
free: true
hackathon: true hackathon: true
registered: true
- name: PyCascades
series: pycascades
topic: Python
location: Seattle, Washington
country: us
dates:
status: approximate
label: March 2027
earliest: 2027-03-01
latest: 2027-03-31
""" """
def series_prompt_text(series: dict[str, ConferenceSeries]) -> str:
    """Render the known conference series as a compact list for the LLM prompt.

    Each series becomes one ``- <id>: <name>; ...`` line, ordered by series
    ID. The topic, usual location and country are appended only when they are
    set to a truthy value. A fixed placeholder sentence is returned when no
    series are supplied.
    """
    if not series:
        return "No known conference series loaded."

    output = ["Known conference series IDs:"]
    for key in sorted(series):
        entry = series[key]
        optional = (
            ("topic", entry.get("topic")),
            ("usual location", entry.get("usual_location")),
            ("country", entry.get("country")),
        )
        parts = [
            entry["name"],
            *(f"{label}: {value}" for label, value in optional if value),
        ]
        output.append(f"- {key}: " + "; ".join(parts))
    return "\n".join(output)
def build_prompt(
url: str,
source_text: str,
detected_coordinates: tuple[float, float] | None,
series: dict[str, ConferenceSeries] | None = None,
) -> str:
"""Build prompt with embedded YAML format details and examples."""
coordinate_note = "" coordinate_note = ""
if detected_coordinates is not None: if detected_coordinates is not None:
coordinate_note = ( coordinate_note = (
@ -99,28 +180,26 @@ def build_prompt(
prompt = f""" prompt = f"""
I keep a record of interesting conferences in a YAML file. I keep a record of interesting conferences in a YAML file.
Format rules:
{conference_yaml_format_description()}
{series_prompt_text(series or {})}
Here are some examples of the format I use: Here are some examples of the format I use:
{examples} {yaml_example_text()}
Now here is a new conference of interest: Now here is a new conference of interest:
Conference URL: {url} Conference URL: {url}
Return the YAML representation for this conference following the Return the YAML representation for this conference following the same style and
same style and keys as the examples. Only include keys if the keys as the examples. Only include keys if the information is available. Do not
information is available. Do not invent details. invent details.
Important: the `country` field must always be a valid ISO 3166-1 alpha-2 Important: if this is a Geomob event, use a `dates.end` datetime of 22:00 local
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom). time on the event date unless the page explicitly provides a different end time.
Do not output full country names.
Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
{coordinate_note} {coordinate_note}
Wrap your answer in a JSON object with a single key "yaml". Wrap your answer in a JSON object with a single key "yaml".
@ -261,9 +340,26 @@ def parse_date(date_str: str) -> datetime:
return dt return dt
def data_dir_from_conferences_path(yaml_path: str) -> str:
    """Return the absolute directory that contains the given conferences.yaml.

    The path is made absolute first, so a relative ``yaml_path`` is resolved
    against the current working directory.
    """
    absolute_path = os.path.abspath(yaml_path)
    parent_dir, _filename = os.path.split(absolute_path)
    return parent_dir
def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains a year or edition component.

    Every non-empty path segment and every dot-separated host label is
    checked. A component counts as year-specific when it holds a ``20xx``
    year, either on its own or delimited by ``-``, ``_`` or ``/``, or when it
    is an edition marker of one or two digits followed by ``x`` (for example
    ``24x``).
    """
    parsed = urlparse(url)
    components = [part for part in parsed.path.split("/") if part]
    # hostname (unlike netloc) excludes credentials and port, so a year label
    # such as "2026" is not hidden behind a "user@" prefix.
    if parsed.hostname:
        components.extend(part for part in parsed.hostname.split(".") if part)
    for component in components:
        # The anchored alternatives also cover a component that is exactly a
        # year, so no separate full-match check is needed.
        if re.search(r"(?:^|[-_/])20\d{2}(?:$|[-_/])", component):
            return True
        if re.fullmatch(r"\d{1,2}x", component, flags=re.IGNORECASE):
            return True
    return False
def insert_sorted( def insert_sorted(
@ -273,6 +369,13 @@ def insert_sorted(
new_url = new_conf.get("url") new_url = new_conf.get("url")
new_start = conference_sort_datetime(new_conf) new_start = conference_sort_datetime(new_conf)
new_year = new_start.year new_year = new_start.year
update_idx = find_inexact_existing_conference(conferences, new_conf)
if update_idx is not None:
existing = conferences.pop(update_idx)
merged = dict(existing)
merged.update(new_conf)
print(f"Updating inexact conference entry: {existing.get('name')}")
return insert_sorted(conferences, merged)
if new_url: if new_url:
for conf in conferences: for conf in conferences:
@ -299,6 +402,56 @@ def insert_sorted(
return conferences return conferences
def date_ranges_overlap(
    first: dict[str, typing.Any], second: dict[str, typing.Any]
) -> bool:
    """Report whether the date spans of two conference entries intersect.

    The spans come from ``conference_date_fields``. They overlap when each
    entry starts on or before the day the other one ends.
    """
    first_span = conference_date_fields(first)
    second_span = conference_date_fields(second)

    first_start = typing.cast(date, first_span["start_date"])
    second_end = typing.cast(date, second_span["end_date"])
    if not first_start <= second_end:
        return False

    second_start = typing.cast(date, second_span["start_date"])
    first_end = typing.cast(date, first_span["end_date"])
    return second_start <= first_end
def same_conference_identity(
    existing: dict[str, typing.Any], new_conf: dict[str, typing.Any]
) -> bool:
    """Return True if two entries appear to represent the same conference.

    The checks run in this order:

    1. Both entries have the same URL, and that URL is either year-specific
       or the two date ranges overlap.
    2. Both entries name the same series and their date ranges overlap.
    3. The names are equal ignoring case and the date ranges overlap.
    """
    existing_url = existing.get("url")
    new_url = new_conf.get("url")
    if existing_url and new_url and existing_url == new_url:
        # A generic URL (no year or edition component) is reused for every
        # edition, so on its own it cannot tell two years apart. Without the
        # overlap check, exact data for one year could replace an inexact
        # entry that belongs to a different year.
        return url_has_year_component(new_url) or date_ranges_overlap(
            existing, new_conf
        )

    existing_series = existing.get("series")
    new_series = new_conf.get("series")
    if existing_series and new_series and existing_series == new_series:
        return date_ranges_overlap(existing, new_conf)

    return str(existing.get("name", "")).casefold() == str(
        new_conf.get("name", "")
    ).casefold() and date_ranges_overlap(existing, new_conf)
def find_inexact_existing_conference(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> int | None:
    """Locate the first inexact entry that the new exact entry should replace.

    Returns None when the new entry's date status is not ``exact``, or when no
    existing non-exact entry is judged to be the same conference.
    """
    if conference_date_fields(new_conf)["date_status"] != "exact":
        return None

    return next(
        (
            position
            for position, candidate in enumerate(conferences)
            if conference_date_fields(candidate)["date_status"] != "exact"
            and same_conference_identity(candidate, new_conf)
        ),
        None,
    )
def conference_sort_datetime(conf: dict[str, typing.Any]) -> datetime: def conference_sort_datetime(conf: dict[str, typing.Any]) -> datetime:
"""Return conference sort date as a datetime.""" """Return conference sort date as a datetime."""
sort_date = conference_date_fields(conf)["sort_date"] sort_date = conference_date_fields(conf)["sort_date"]
@ -374,6 +527,31 @@ def same_type_as_start(
return new_dt return new_dt
def normalize_dates_field(conf: dict[str, typing.Any]) -> None:
    """Fold legacy top-level ``start``/``end``/``date_status`` into ``dates``.

    The entry is modified in place. When ``dates`` is already a mapping, any
    legacy value it lacks is copied in and the legacy keys are removed. When
    there is no ``dates`` mapping but ``start`` or ``end`` is present, a new
    mapping is built: ``end`` defaults to ``start`` and the status defaults to
    ``exact``. Otherwise the entry is left untouched.
    """
    nested = conf.get("dates")

    if isinstance(nested, dict):
        # Values already inside the nested mapping take precedence.
        legacy_to_nested = (
            ("start", "start"),
            ("end", "end"),
            ("date_status", "status"),
        )
        for legacy_key, nested_key in legacy_to_nested:
            if legacy_key in conf and nested_key not in nested:
                nested[nested_key] = conf[legacy_key]
        for legacy_key, _ in legacy_to_nested:
            conf.pop(legacy_key, None)
        return

    if "start" not in conf and "end" not in conf:
        return

    legacy_start = conf.pop("start", None)
    legacy_end = conf.pop("end", legacy_start)
    legacy_status = str(conf.pop("date_status", "exact"))
    conf["dates"] = {
        "status": legacy_status,
        "start": legacy_start,
        "end": legacy_end,
    }
def maybe_extract_explicit_end_time(source_text: str) -> int | None: def maybe_extract_explicit_end_time(source_text: str) -> int | None:
"""Extract an explicit 12-hour clock end time for Geomob-style pages.""" """Extract an explicit 12-hour clock end time for Geomob-style pages."""
lowered = source_text.lower() lowered = source_text.lower()
@ -435,6 +613,14 @@ def load_conferences(yaml_path: str) -> list[dict[str, typing.Any]]:
return typing.cast(list[dict[str, typing.Any]], loaded) return typing.cast(list[dict[str, typing.Any]], loaded)
def load_conference_series_for_path(yaml_path: str) -> dict[str, ConferenceSeries]:
    """Load the conference series from the directory holding ``yaml_path``."""
    data_dir = data_dir_from_conferences_path(yaml_path)
    loaded_series = load_series(data_dir)
    return typing.cast(dict[str, ConferenceSeries], loaded_series)
def dump_conferences(yaml_path: str, conferences: list[dict[str, typing.Any]]) -> None: def dump_conferences(yaml_path: str, conferences: list[dict[str, typing.Any]]) -> None:
"""Write conference YAML.""" """Write conference YAML."""
with open(yaml_path, "w") as file: with open(yaml_path, "w") as file:
@ -450,6 +636,9 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
if url_has_year_component(url): if url_has_year_component(url):
for conf in conferences: for conf in conferences:
if conf.get("url") == url: if conf.get("url") == url:
fields = conference_date_fields(conf)
if fields["date_status"] != "exact":
continue
print( print(
"⚠️ Conference already exists in YAML " "⚠️ Conference already exists in YAML "
+ f"(url={url}), skipping before API call." + f"(url={url}), skipping before API call."
@ -459,7 +648,8 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
soup = fetch_webpage(url) soup = fetch_webpage(url)
source_text = webpage_to_text(soup) source_text = webpage_to_text(soup)
detected_coordinates = detect_page_coordinates(soup) detected_coordinates = detect_page_coordinates(soup)
prompt = build_prompt(url, source_text, detected_coordinates) series = load_conference_series_for_path(yaml_path)
prompt = build_prompt(url, source_text, detected_coordinates, series)
new_yaml_text = get_from_open_ai(prompt)["yaml"] new_yaml_text = get_from_open_ai(prompt)["yaml"]
new_conf = yaml.safe_load(new_yaml_text) new_conf = yaml.safe_load(new_yaml_text)
@ -468,7 +658,9 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
assert isinstance(new_conf, dict) assert isinstance(new_conf, dict)
validate_country(new_conf) validate_country(new_conf)
normalize_dates_field(new_conf)
normalise_end_field(new_conf, source_text) normalise_end_field(new_conf, source_text)
normalize_dates_field(new_conf)
if detected_coordinates is not None: if detected_coordinates is not None:
new_conf["latitude"] = detected_coordinates[0] new_conf["latitude"] = detected_coordinates[0]

View file

@ -26,6 +26,19 @@ def test_extract_google_maps_latlon_at_pattern() -> None:
assert result == (51.5242464, -0.0997024) assert result == (51.5242464, -0.0997024)
def test_url_has_year_component() -> None:
    """Check that only genuine year or edition components mark a URL as year-specific."""
    expectations = {
        "https://www.foss4gna.org/": False,
        "https://foss4g.asia/2026/": True,
        "https://2027.fossy.ca/": True,
        "https://www.socallinuxexpo.org/scale/24x/": True,
        "https://2026.stateofthebrowser.com/": True,
    }
    for candidate_url, is_year_specific in expectations.items():
        outcome = add_new_conference.url_has_year_component(candidate_url)
        assert outcome is is_year_specific
def test_insert_sorted_allows_same_url_different_year_without_year_component() -> None: def test_insert_sorted_allows_same_url_different_year_without_year_component() -> None:
"""The same non-year-specific URL can be reused for a different year.""" """The same non-year-specific URL can be reused for a different year."""
conferences: list[dict[str, typing.Any]] = [ conferences: list[dict[str, typing.Any]] = [
@ -74,6 +87,86 @@ def test_insert_sorted_supports_nested_dates() -> None:
assert [conf["name"] for conf in updated] == ["FOSDEM", "PyCascades"] assert [conf["name"] for conf in updated] == ["FOSDEM", "PyCascades"]
def test_insert_sorted_updates_inexact_existing_entry() -> None:
    """An approximate series entry is replaced in place by exact data."""
    shared: dict[str, typing.Any] = {
        "name": "PyCascades",
        "series": "pycascades",
        "topic": "Python",
        "location": "Seattle, Washington",
    }
    event_url = "https://2027.pycascades.com/"
    approximate_entry: dict[str, typing.Any] = {
        **shared,
        "dates": {
            "status": "approximate",
            "label": "March 2027",
            "earliest": date(2027, 3, 1),
            "latest": date(2027, 3, 31),
        },
        "url": event_url,
    }
    exact_entry: dict[str, typing.Any] = {
        **shared,
        "dates": {
            "status": "exact",
            "start": date(2027, 3, 12),
            "end": date(2027, 3, 14),
        },
        "url": event_url,
        "venue": "Example Hall",
    }

    result = add_new_conference.insert_sorted([approximate_entry], exact_entry)

    assert len(result) == 1
    merged = result[0]
    assert merged["dates"]["status"] == "exact"
    assert merged["dates"]["start"] == date(2027, 3, 12)
    assert merged["venue"] == "Example Hall"
def test_normalize_dates_field_moves_legacy_dates() -> None:
    """Top-level start/end from the model end up inside the nested dates mapping."""
    first_day = date(2026, 4, 10)
    last_day = date(2026, 4, 12)
    entry: dict[str, typing.Any] = {
        "name": "PyCon",
        "start": first_day,
        "end": last_day,
    }

    add_new_conference.normalize_dates_field(entry)

    for legacy_key in ("start", "end"):
        assert legacy_key not in entry
    assert entry["dates"] == {
        "status": "exact",
        "start": first_day,
        "end": last_day,
    }
def test_build_prompt_includes_nested_dates_and_series() -> None:
    """The generated prompt mentions the nested date format and the known series."""
    known_series = {
        "pycascades": {
            "name": "PyCascades",
            "topic": "Python",
            "usual_location": "Seattle, Washington",
            "country": "us",
        }
    }

    prompt = add_new_conference.build_prompt(
        "https://example.com", "Conference details", None, known_series
    )

    expected_fragments = (
        "Do not output legacy top-level `start`, `end`, or `date_status`",
        "dates.status",
        "- pycascades: PyCascades",
        "March 2027",
    )
    for fragment in expected_fragments:
        assert fragment in prompt
def test_validate_country_normalises_name() -> None: def test_validate_country_normalises_name() -> None:
"""Country names should be normalised to alpha-2 codes.""" """Country names should be normalised to alpha-2 codes."""
conf: dict[str, typing.Any] = {"country": "United Kingdom"} conf: dict[str, typing.Any] = {"country": "United Kingdom"}
@ -197,6 +290,76 @@ def test_add_new_conference_updates_yaml(
assert len(written) == 2 assert len(written) == 2
assert written[1]["name"] == "NewConf" assert written[1]["name"] == "NewConf"
assert written[1]["country"] == "us" assert written[1]["country"] == "us"
assert written[1]["end"] == date(2026, 5, 3) assert written[1]["dates"] == {
"status": "exact",
"start": date(2026, 5, 3),
"end": date(2026, 5, 3),
}
assert written[1]["latitude"] == 40.0 assert written[1]["latitude"] == 40.0
assert written[1]["longitude"] == -74.0 assert written[1]["longitude"] == -74.0
def test_add_new_conference_reuses_generic_url_for_new_year(
    tmp_path: typing.Any, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A generic URL whose domain merely contains digits is not skipped early."""
    generic_url = "https://www.foss4gna.org/"
    previous_edition = {
        "name": "FOSS4G North America",
        "series": "foss4g-north-america",
        "dates": {
            "status": "exact",
            "start": date(2025, 11, 3),
            "end": date(2025, 11, 5),
        },
        "url": generic_url,
    }
    upcoming_edition = {
        "name": "FOSS4G North America",
        "series": "foss4g-north-america",
        "topic": "Geospatial",
        "location": "St. Louis, Missouri",
        "country": "us",
        "dates": {
            "status": "exact",
            "start": date(2026, 10, 26),
            "end": date(2026, 10, 29),
        },
        "url": generic_url,
    }

    yaml_path = tmp_path / "conferences.yaml"
    yaml_path.write_text(yaml.dump([previous_edition], sort_keys=False))

    root = lxml.html.fromstring("<html><body>Conference details</body></html>")
    llm_reply = {"yaml": yaml.dump(upcoming_edition, sort_keys=False)}
    stubs: dict[str, typing.Any] = {
        "fetch_webpage": lambda url: root,
        "webpage_to_text": lambda parsed: "FOSS4G North America 2026",
        "detect_page_coordinates": lambda parsed: None,
        "get_from_open_ai": lambda prompt: llm_reply,
    }
    for attribute, stub in stubs.items():
        monkeypatch.setattr(add_new_conference, attribute, stub)

    added = add_new_conference.add_new_conference(generic_url, str(yaml_path))

    assert added is True
    written = yaml.safe_load(yaml_path.read_text())
    assert len(written) == 2
    start_years = [conf["dates"]["start"].year for conf in written]
    assert start_years == [2025, 2026]