Improve conference importer date handling

This commit is contained in:
Edward Betts 2026-06-22 12:41:33 +01:00
parent 56eea3f7a3
commit dbce9e5358
2 changed files with 401 additions and 46 deletions

View file

@ -16,7 +16,7 @@ import pycountry
import requests
import yaml
from agenda.conference import conference_date_fields
from agenda.conference import ConferenceSeries, conference_date_fields, load_series
USER_AGENT = "add-new-conference/0.1"
COORDINATE_PATTERNS = (
@ -36,58 +36,139 @@ def read_api_key() -> str:
return parser["openai"]["api_key"]
def build_prompt(
url: str,
source_text: str,
detected_coordinates: tuple[float, float] | None,
) -> str:
"""Build prompt with embedded YAML examples."""
examples = """
def conference_yaml_format_description() -> str:
    """Return the conference YAML format description for LLM prompts.

    The returned text is a fixed block of instructions. It lists the required
    fields (`name`, `topic`, `location` and a nested `dates` mapping), explains
    the three `dates.status` shapes (`exact`, `tentative`, `approximate`) and
    the sub-keys each one needs, forbids the legacy top-level `start`, `end`
    and `date_status` keys, and enumerates the common optional fields.
    """
    # The literal below is prompt text sent to the model: edit it as prose, and
    # keep it in step with the YAML examples that accompany it in the prompt.
    return """
Use this YAML format for one conference entry.
Required fields:
- `name`: event name.
- `topic`: topic/category.
- `location`: city or location label. Use `TBC` if the page confirms a future
event but not a city.
- Date information in nested `dates`.
Preferred date shape:
- `dates.status`: one of `exact`, `tentative`, or `approximate`.
- For `exact`: use when the page confirms specific dates/times. Include
`dates.start` and `dates.end` as YAML dates or timezone-aware datetimes.
- For `tentative`: use when specific dates are guessed or explicitly
unconfirmed. Include `dates.start`, `dates.end`, and preferably `dates.label`
and `dates.basis`.
- For `approximate`: use when only a broad date phrase is known. Include
`dates.label`, `dates.earliest`, and `dates.latest`. Examples: `March 2027`
should become earliest `2027-03-01`, latest `2027-03-31`; `mid-April 2027`
should become a sensible bounded range such as `2027-04-11` to `2027-04-20`.
Important date rule:
- If the source page contains exact dates, output `dates.status: exact` even if
the existing agenda entry or conference announcement previously had only
approximate dates.
- Always include an end date for `exact` and `tentative`. For a single-day
event, `dates.end` can be the same as `dates.start`.
- Do not output legacy top-level `start`, `end`, or `date_status`.
Common optional fields:
- `series`: a key from the known conference series list, when this event belongs
to a listed series.
- `country`: valid ISO 3166-1 alpha-2 country code in lowercase, for example
`ca`, `gb`, `us`. Do not output country names.
- `venue`, `address`, `latitude`, `longitude`, `url`, `cfp_url`, `cfp_end`,
`hashtag`, `description`.
- `free`, `price`, `currency`, `hackathon`, `online`, `attendees`.
- Do not include `going`, `registered`, `accommodation_booked`,
`transport_booked`, or `trip` unless the source explicitly says they apply to
my attendance.
"""
def yaml_example_text() -> str:
    """Return examples of the conference YAML format.

    The text is a YAML sequence of four sample entries that is embedded in the
    LLM prompt. Two entries show `dates.status: exact` (one with timezone-aware
    datetimes, one with plain dates) and two show `dates.status: approximate`
    with a `label` and an `earliest`/`latest` range.

    Every example uses only the nested `dates` mapping, so the examples agree
    with the format rule that forbids legacy top-level `start`, `end` and
    `date_status` keys. Each key appears at most once per entry; duplicate
    mapping keys are invalid YAML and would give the model conflicting values.
    """
    # Keys are indented two spaces under their "- name:" item and the nested
    # date keys four spaces, so the text parses as a list of mappings.
    # The Geomob example ends at 22:00, the default end time for Geomob events.
    return """
- name: Geomob London
  series: geomob-london
  topic: Maps
  location: London
  country: gb
  dates:
    status: exact
    start: 2026-01-28 18:00:00+00:00
    end: 2026-01-28 22:00:00+00:00
  url: https://thegeomob.com/post/jan-28th-2026-geomoblon-details
  venue: Geovation Hub
  address: Sutton Yard, 65 Goswell Rd, London EC1V 7EN
  latitude: 51.5242464
  longitude: -0.0997024
  free: true
  going: true
  hashtag: '#geomobLON'

- name: DebConf 25
  series: debconf
  topic: Debian
  location: Plouzane
  country: fr
  dates:
    status: exact
    start: 2025-07-07
    end: 2025-07-20
  url: https://wiki.debian.org/DebConf/25
  going: true
  cfp_url: https://debconf25.debconf.org/talks/new/
  venue: Ecole nationale superieure Mines-Telecom Atlantique Bretagne Pays de la Loire
    campus de Brest
  latitude: 48.35934
  longitude: -4.569889

- name: Wikimedia Hackathon
  series: wikimedia-hackathon
  topic: Wikimedia
  location: Albania
  country: al
  dates:
    status: approximate
    label: mid-April 2027
    earliest: 2027-04-11
    latest: 2027-04-20
  url: https://www.mediawiki.org/wiki/Wikimedia_Hackathon_2027
  hackathon: true
  registered: true

- name: PyCascades
  series: pycascades
  topic: Python
  location: Seattle, Washington
  country: us
  dates:
    status: approximate
    label: March 2027
    earliest: 2027-03-01
    latest: 2027-03-31
"""
def series_prompt_text(series: dict[str, ConferenceSeries]) -> str:
    """Return compact known series text for the LLM prompt.

    An empty mapping yields a single sentence saying no series are loaded.
    Otherwise the result is a heading followed by one `- <id>: <details>` line
    per series, ordered by series ID. The details start with the series name
    and then add the topic, usual location and country when those are set,
    joined with `; `.
    """
    if not series:
        return "No known conference series loaded."

    output = ["Known conference series IDs:"]
    for series_id in sorted(series):
        entry = series[series_id]
        parts = [entry["name"]]

        # Optional attributes are listed only when they hold a truthy value.
        topic = entry.get("topic")
        if topic:
            parts.append(f"topic: {topic}")
        usual_location = entry.get("usual_location")
        if usual_location:
            parts.append(f"usual location: {usual_location}")
        country = entry.get("country")
        if country:
            parts.append(f"country: {country}")

        output.append(f"- {series_id}: " + "; ".join(parts))

    return "\n".join(output)
def build_prompt(
url: str,
source_text: str,
detected_coordinates: tuple[float, float] | None,
series: dict[str, ConferenceSeries] | None = None,
) -> str:
"""Build prompt with embedded YAML format details and examples."""
coordinate_note = ""
if detected_coordinates is not None:
coordinate_note = (
@ -99,28 +180,26 @@ def build_prompt(
prompt = f"""
I keep a record of interesting conferences in a YAML file.
Format rules:
{conference_yaml_format_description()}
{series_prompt_text(series or {})}
Here are some examples of the format I use:
{examples}
{yaml_example_text()}
Now here is a new conference of interest:
Conference URL: {url}
Return the YAML representation for this conference following the
same style and keys as the examples. Only include keys if the
information is available. Do not invent details.
Return the YAML representation for this conference following the same style and
keys as the examples. Only include keys if the information is available. Do not
invent details.
Important: the `country` field must always be a valid ISO 3166-1 alpha-2
country code (two lowercase letters, e.g. `ca` for Canada, `gb` for United Kingdom).
Do not output full country names.
Important: always include an `end` field. If the event is a single-day event,
the `end` can be the same date as `start`, or a same-day datetime if the page
provides an end time.
Important: if this is a Geomob event, use an `end` datetime of 22:00 local time
on the event date unless the page explicitly provides a different end time.
Important: if this is a Geomob event, use a `dates.end` datetime of 22:00 local
time on the event date unless the page explicitly provides a different end time.
{coordinate_note}
Wrap your answer in a JSON object with a single key "yaml".
@ -261,9 +340,26 @@ def parse_date(date_str: str) -> datetime:
return dt
def data_dir_from_conferences_path(yaml_path: str) -> str:
    """Return the absolute directory that contains the given conferences.yaml.

    The path is made absolute (and normalised) first, so a bare file name
    resolves to the current working directory.
    """
    absolute_path = os.path.abspath(yaml_path)
    parent_dir, _file_name = os.path.split(absolute_path)
    return parent_dir
def url_has_year_component(url: str) -> bool:
    """Return True if the URL contains a year or edition component.

    The URL is split into path segments (on `/`) and host labels (on `.`).
    A component counts as year-specific when either:

    - it contains a four-digit `20xx` year that is the whole component or is
      delimited by `-`, `_` or `/` (for example `2026` or `conf-2026-berlin`); or
    - it is an edition marker made of one or two digits followed by `x`, in
      any letter case (for example `24x`).

    Digits that merely appear inside a name, such as the `4` in `foss4gna`,
    do not count, so generic URLs reused every year are not treated as
    year-specific.
    """
    parsed = urlparse(url)
    components = [part for part in parsed.path.split("/") if part]
    # An empty netloc contributes no labels, so no separate guard is needed.
    components.extend(part for part in parsed.netloc.split(".") if part)

    return any(
        # The anchored search also covers a component that is exactly a year.
        re.search(r"(?:^|[-_/])20\d{2}(?:$|[-_/])", component)
        or re.fullmatch(r"\d{1,2}x", component, flags=re.IGNORECASE)
        for component in components
    )
def insert_sorted(
@ -273,6 +369,13 @@ def insert_sorted(
new_url = new_conf.get("url")
new_start = conference_sort_datetime(new_conf)
new_year = new_start.year
update_idx = find_inexact_existing_conference(conferences, new_conf)
if update_idx is not None:
existing = conferences.pop(update_idx)
merged = dict(existing)
merged.update(new_conf)
print(f"Updating inexact conference entry: {existing.get('name')}")
return insert_sorted(conferences, merged)
if new_url:
for conf in conferences:
@ -299,6 +402,56 @@ def insert_sorted(
return conferences
def date_ranges_overlap(
    first: dict[str, typing.Any], second: dict[str, typing.Any]
) -> bool:
    """Return True if two conference date ranges overlap.

    Both entries are reduced to their `start_date`/`end_date` via
    `conference_date_fields`. The comparison is inclusive, so ranges that only
    share a single boundary day still count as overlapping.
    """
    fields_a = conference_date_fields(first)
    fields_b = conference_date_fields(second)

    # The first range must not begin after the second one has ended ...
    if not (
        typing.cast(date, fields_a["start_date"])
        <= typing.cast(date, fields_b["end_date"])
    ):
        return False

    # ... and the second must not begin after the first one has ended.
    return typing.cast(date, fields_b["start_date"]) <= typing.cast(
        date, fields_a["end_date"]
    )
def same_conference_identity(
    existing: dict[str, typing.Any], new_conf: dict[str, typing.Any]
) -> bool:
    """Return True if two entries appear to represent the same conference.

    The checks run in this order:

    1. Both entries have the same non-empty `url`: same conference.
    2. Both entries have the same non-empty `series`: same conference only if
       their date ranges overlap.
    3. Otherwise the names are compared case-insensitively and the date
       ranges must overlap.

    A missing, `None` or empty name never matches anything, so two nameless
    entries are not treated as the same conference just because their dates
    overlap.
    """
    existing_url = existing.get("url")
    if existing_url and existing_url == new_conf.get("url"):
        return True

    existing_series = existing.get("series")
    if existing_series and existing_series == new_conf.get("series"):
        return date_ranges_overlap(existing, new_conf)

    # `or ""` keeps a None name from being stringified to "None".
    existing_name = str(existing.get("name") or "").casefold()
    new_name = str(new_conf.get("name") or "").casefold()
    if not existing_name or existing_name != new_name:
        return False
    return date_ranges_overlap(existing, new_conf)
def find_inexact_existing_conference(
    conferences: list[dict[str, typing.Any]], new_conf: dict[str, typing.Any]
) -> int | None:
    """Return index of an inexact existing entry that exact new data can update.

    Only a new entry whose date status is `exact` can update anything;
    otherwise the result is None. The first existing entry that is not
    `exact` and has the same identity as the new entry wins. None is returned
    when no such entry exists.
    """
    if conference_date_fields(new_conf)["date_status"] != "exact":
        return None

    for position, candidate in enumerate(conferences):
        candidate_is_inexact = (
            conference_date_fields(candidate)["date_status"] != "exact"
        )
        if candidate_is_inexact and same_conference_identity(candidate, new_conf):
            return position

    return None
def conference_sort_datetime(conf: dict[str, typing.Any]) -> datetime:
"""Return conference sort date as a datetime."""
sort_date = conference_date_fields(conf)["sort_date"]
@ -374,6 +527,31 @@ def same_type_as_start(
return new_dt
def normalize_dates_field(conf: dict[str, typing.Any]) -> None:
    """Move legacy top-level date fields into the nested dates mapping.

    The entry is modified in place.

    - If `dates` is missing or is not a mapping, and a top-level `start` or
      `end` exists, a new `dates` mapping is built from the legacy keys. A
      missing `end` defaults to `start`. A missing, null or empty
      `date_status` defaults to `exact`.
    - If `dates` is already a mapping, legacy values only fill in keys that
      `dates` does not define yet; the nested values take precedence. A null
      `date_status` is ignored rather than copied into `dates.status`.

    In both cases the legacy top-level `start`, `end` and `date_status` keys
    are removed. An entry without `dates`, `start` and `end` is left
    untouched.
    """
    dates = conf.get("dates")

    if not isinstance(dates, dict):
        if "start" not in conf and "end" not in conf:
            return
        start = conf.pop("start", None)
        end = conf.pop("end", start)
        # `or "exact"` stops a null status from being stringified to "None".
        status = conf.pop("date_status", None) or "exact"
        conf["dates"] = {"status": str(status), "start": start, "end": end}
        return

    if "start" in conf and "start" not in dates:
        dates["start"] = conf["start"]
    if "end" in conf and "end" not in dates:
        dates["end"] = conf["end"]
    if conf.get("date_status") is not None and "status" not in dates:
        dates["status"] = conf["date_status"]

    for legacy_key in ("start", "end", "date_status"):
        conf.pop(legacy_key, None)
def maybe_extract_explicit_end_time(source_text: str) -> int | None:
"""Extract an explicit 12-hour clock end time for Geomob-style pages."""
lowered = source_text.lower()
@ -435,6 +613,14 @@ def load_conferences(yaml_path: str) -> list[dict[str, typing.Any]]:
return typing.cast(list[dict[str, typing.Any]], loaded)
def load_conference_series_for_path(yaml_path: str) -> dict[str, ConferenceSeries]:
    """Load conference series next to the target conferences YAML file.

    The directory that holds `yaml_path` is passed to `load_series` as the
    data directory.
    """
    data_dir = data_dir_from_conferences_path(yaml_path)
    loaded_series = load_series(data_dir)
    return typing.cast(dict[str, ConferenceSeries], loaded_series)
def dump_conferences(yaml_path: str, conferences: list[dict[str, typing.Any]]) -> None:
"""Write conference YAML."""
with open(yaml_path, "w") as file:
@ -450,6 +636,9 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
if url_has_year_component(url):
for conf in conferences:
if conf.get("url") == url:
fields = conference_date_fields(conf)
if fields["date_status"] != "exact":
continue
print(
"⚠️ Conference already exists in YAML "
+ f"(url={url}), skipping before API call."
@ -459,7 +648,8 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
soup = fetch_webpage(url)
source_text = webpage_to_text(soup)
detected_coordinates = detect_page_coordinates(soup)
prompt = build_prompt(url, source_text, detected_coordinates)
series = load_conference_series_for_path(yaml_path)
prompt = build_prompt(url, source_text, detected_coordinates, series)
new_yaml_text = get_from_open_ai(prompt)["yaml"]
new_conf = yaml.safe_load(new_yaml_text)
@ -468,7 +658,9 @@ def add_new_conference(url: str, yaml_path: str) -> bool:
assert isinstance(new_conf, dict)
validate_country(new_conf)
normalize_dates_field(new_conf)
normalise_end_field(new_conf, source_text)
normalize_dates_field(new_conf)
if detected_coordinates is not None:
new_conf["latitude"] = detected_coordinates[0]

View file

@ -26,6 +26,19 @@ def test_extract_google_maps_latlon_at_pattern() -> None:
assert result == (51.5242464, -0.0997024)
def test_url_has_year_component() -> None:
    """Only actual year or edition components should count as year-specific."""
    expectations = {
        # A digit inside the host name is not a year.
        "https://www.foss4gna.org/": False,
        # Year as a path segment.
        "https://foss4g.asia/2026/": True,
        # Year as a host label.
        "https://2027.fossy.ca/": True,
        # Edition marker such as "24x".
        "https://www.socallinuxexpo.org/scale/24x/": True,
        "https://2026.stateofthebrowser.com/": True,
    }
    for candidate_url, is_year_specific in expectations.items():
        outcome = add_new_conference.url_has_year_component(candidate_url)
        assert outcome is is_year_specific
def test_insert_sorted_allows_same_url_different_year_without_year_component() -> None:
"""The same non-year-specific URL can be reused for a different year."""
conferences: list[dict[str, typing.Any]] = [
@ -74,6 +87,86 @@ def test_insert_sorted_supports_nested_dates() -> None:
assert [conf["name"] for conf in updated] == ["FOSDEM", "PyCascades"]
def test_insert_sorted_updates_inexact_existing_entry() -> None:
    """Exact dates should replace an existing inexact series entry."""
    # Fields that the stored entry and the incoming entry have in common.
    shared: dict[str, typing.Any] = {
        "name": "PyCascades",
        "series": "pycascades",
        "topic": "Python",
        "location": "Seattle, Washington",
        "url": "https://2027.pycascades.com/",
    }
    approximate_entry: dict[str, typing.Any] = {
        **shared,
        "dates": {
            "status": "approximate",
            "label": "March 2027",
            "earliest": date(2027, 3, 1),
            "latest": date(2027, 3, 31),
        },
    }
    exact_entry: dict[str, typing.Any] = {
        **shared,
        "dates": {
            "status": "exact",
            "start": date(2027, 3, 12),
            "end": date(2027, 3, 14),
        },
        "venue": "Example Hall",
    }
    conferences: list[dict[str, typing.Any]] = [approximate_entry]

    result = add_new_conference.insert_sorted(conferences, exact_entry)

    # The approximate entry is replaced in place rather than duplicated.
    assert len(result) == 1
    merged = result[0]
    assert merged["dates"]["status"] == "exact"
    assert merged["dates"]["start"] == date(2027, 3, 12)
    assert merged["venue"] == "Example Hall"
def test_normalize_dates_field_moves_legacy_dates() -> None:
    """Legacy start/end model output should be converted before writing YAML."""
    first_day = date(2026, 4, 10)
    last_day = date(2026, 4, 12)
    conf: dict[str, typing.Any] = {
        "name": "PyCon",
        "start": first_day,
        "end": last_day,
    }

    add_new_conference.normalize_dates_field(conf)

    # The legacy keys are gone and their values live under `dates`.
    assert "start" not in conf
    assert "end" not in conf
    expected_dates = {"status": "exact", "start": first_day, "end": last_day}
    assert conf["dates"] == expected_dates
def test_build_prompt_includes_nested_dates_and_series() -> None:
    """The prompt should describe nested dates and known series IDs."""
    prompt = add_new_conference.build_prompt(
        "https://example.com",
        "Conference details",
        None,
        {
            "pycascades": {
                "name": "PyCascades",
                "topic": "Python",
                "usual_location": "Seattle, Washington",
                "country": "us",
            }
        },
    )

    expected_fragments = (
        "Do not output legacy top-level `start`, `end`, or `date_status`",
        "dates.status",
        "- pycascades: PyCascades",
        "March 2027",
    )
    for fragment in expected_fragments:
        assert fragment in prompt
def test_validate_country_normalises_name() -> None:
"""Country names should be normalised to alpha-2 codes."""
conf: dict[str, typing.Any] = {"country": "United Kingdom"}
@ -197,6 +290,76 @@ def test_add_new_conference_updates_yaml(
assert len(written) == 2
assert written[1]["name"] == "NewConf"
assert written[1]["country"] == "us"
assert written[1]["end"] == date(2026, 5, 3)
assert written[1]["dates"] == {
"status": "exact",
"start": date(2026, 5, 3),
"end": date(2026, 5, 3),
}
assert written[1]["latitude"] == 40.0
assert written[1]["longitude"] == -74.0
def test_add_new_conference_reuses_generic_url_for_new_year(
    tmp_path: typing.Any, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Generic URLs with digits in the domain should not be skipped early."""
    generic_url = "https://www.foss4gna.org/"

    # Seed the YAML file with last year's edition under the generic URL.
    previous_edition = {
        "name": "FOSS4G North America",
        "series": "foss4g-north-america",
        "dates": {
            "status": "exact",
            "start": date(2025, 11, 3),
            "end": date(2025, 11, 5),
        },
        "url": generic_url,
    }
    yaml_path = tmp_path / "conferences.yaml"
    yaml_path.write_text(yaml.dump([previous_edition], sort_keys=False))

    # Canned model answer describing the next edition at the same URL.
    next_edition = {
        "name": "FOSS4G North America",
        "series": "foss4g-north-america",
        "topic": "Geospatial",
        "location": "St. Louis, Missouri",
        "country": "us",
        "dates": {
            "status": "exact",
            "start": date(2026, 10, 26),
            "end": date(2026, 10, 29),
        },
        "url": generic_url,
    }
    canned_answer = {"yaml": yaml.dump(next_edition, sort_keys=False)}

    # Replace every network-facing helper so the test stays offline.
    root = lxml.html.fromstring("<html><body>Conference details</body></html>")
    monkeypatch.setattr(add_new_conference, "fetch_webpage", lambda url: root)
    monkeypatch.setattr(
        add_new_conference,
        "webpage_to_text",
        lambda parsed: "FOSS4G North America 2026",
    )
    monkeypatch.setattr(
        add_new_conference, "detect_page_coordinates", lambda parsed: None
    )
    monkeypatch.setattr(
        add_new_conference, "get_from_open_ai", lambda prompt: canned_answer
    )

    added = add_new_conference.add_new_conference(generic_url, str(yaml_path))

    assert added is True
    written = yaml.safe_load(yaml_path.read_text())
    assert len(written) == 2
    start_years = [conf["dates"]["start"].year for conf in written]
    assert start_years == [2025, 2026]