agenda/validate_yaml.py

426 lines
14 KiB
Python
Executable file

#!/usr/bin/python3
"""Load YAML data to ensure validity."""
import os
import sys
import typing
from datetime import date, datetime, timedelta
from typing import Tuple, TypeVar, cast
import yaml
from geopy.distance import distance # type: ignore[import-untyped]
from rich.pretty import pprint
import agenda
import agenda.conference
import agenda.data
import agenda.travel
import agenda.trip
import agenda.types
# Load the default configuration module. The non-empty fromlist makes
# __import__ return the "config.default" submodule itself rather than the
# top-level "config" package.
config = __import__("config.default", fromlist=[""])
# Directory holding the personal YAML data files validated by this script.
data_dir = config.PERSONAL_DATA
# Currency codes accepted by check_currency: the configured list plus "GBP".
currencies = set(config.CURRENCIES + ["GBP"])
# A (latitude, longitude) pair of floats.
LatLon = Tuple[float, float]
def check_currency(item: agenda.types.StrDict) -> None:
    """Exit the program when *item* names a currency missing from the config.

    Items without a "currency" value (or with an empty one) are accepted.
    Otherwise the item and the accepted currencies are printed and the
    process exits via ``sys.exit(-1)``.
    """
    code = item.get("currency")
    is_unknown = bool(code) and code not in currencies
    if is_unknown:
        pprint(item)
        print(f"currency {code!r} not in {currencies!r}")
        sys.exit(-1)
def get_coords(item: agenda.types.StrDict) -> LatLon | None:
    """Extract the coordinates of *item* as a (latitude, longitude) float pair.

    Returns None unless both the "latitude" and "longitude" keys are present.
    Asserts that both values are int or float before converting them.
    """
    if "latitude" not in item or "longitude" not in item:
        return None
    lat, lon = item["latitude"], item["longitude"]
    assert isinstance(lat, (int, float))
    assert isinstance(lon, (int, float))
    return (float(lat), float(lon))
T = TypeVar("T")
def remove_nones(items: list[T | None]) -> list[T]:
"""Return a new list with None values removed."""
return [item for item in items if item is not None]
def distance_km(a: LatLon, b: LatLon) -> float:
    """Return the distance in km between two (lat, lon) points.

    The value is the ``km`` attribute of geopy's ``distance(a, b)``.
    """
    separation = distance(a, b)
    return cast(float, separation.km)
def parse_datetime_value(value: typing.Any) -> datetime | None:
"""Return naive datetime for supported input types."""
if value is None:
return None
if isinstance(value, str):
try:
parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
except ValueError as exc:
raise ValueError(f"Invalid ISO datetime string: {value}") from exc
return parsed.replace(tzinfo=None)
if isinstance(value, datetime):
return value.replace(tzinfo=None)
if isinstance(value, date):
return datetime.combine(value, datetime.min.time())
raise TypeError(f"Unsupported datetime value type: {type(value)}")
def ranges_overlap(
    start_a: datetime, end_a: datetime, start_b: datetime, end_b: datetime
) -> bool:
    """Tell whether range A and range B share any time.

    The comparison is strict: ranges that merely touch at an endpoint do not
    count as overlapping.
    """
    if start_a >= end_b:
        # A begins at or after the moment B finishes.
        return False
    return start_b < end_a
def check_trips() -> None:
    """Validate trips.yaml ordering and conference/accommodation proximity.

    Three steps run in sequence:

    1. The raw entries of trips.yaml must be sorted by their "trip" date.
    2. For every trip that has both accommodation and conferences, each
       conference that is not online and has coordinates is compared with the
       stays that have coordinates and both dates, overlap the conference in
       time and (when both name a country) are in the same country. The
       nearest such stay must be closer than
       ``config.ACCOMODATION_MAX_DISTANCE_KM``. Conferences with no such stay
       are skipped.
    3. Coordinates and routes are built for all trips and their counts printed.

    Raises:
        AssertionError: if trips are out of order or the nearest overlapping
            accommodation is too far from a conference.
    """
    filepath = os.path.join(data_dir, "trips.yaml")
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(filepath, "r") as trips_file:
        trips_data = yaml.safe_load(trips_file)

    prev_trip = None
    prev_trip_data = None
    for trip_data in trips_data:
        current_trip = normalize_datetime(trip_data["trip"])
        if prev_trip and current_trip < prev_trip:
            assert prev_trip_data is not None
            print("Out of order trip found:")
            print(
                f" Previous: {prev_trip_data.get('trip')} - "
                + f"{prev_trip_data.get('name', 'No name')}"
            )
            print(
                f" Current: {trip_data.get('trip')} - "
                + f"{trip_data.get('name', 'No name')}"
            )
            assert False, "Trips are not in chronological order by trip date."
        prev_trip = current_trip
        prev_trip_data = trip_data

    trip_list = agenda.trip.build_trip_list(data_dir)
    print(len(trip_list), "trips")

    for trip in trip_list:
        if not trip.accommodation or not trip.conferences:
            continue

        # Stays usable for the distance check: coordinates and both dates known.
        accommodation_entries: list[
            tuple[agenda.types.StrDict, LatLon, datetime, datetime]
        ] = []
        for accommodation in trip.accommodation:
            accommodation_coords = get_coords(accommodation)
            if accommodation_coords is None:
                continue
            start_dt = parse_datetime_value(accommodation.get("from"))
            end_dt = parse_datetime_value(accommodation.get("to"))
            if start_dt is None or end_dt is None:
                continue
            accommodation_entries.append(
                (accommodation, accommodation_coords, start_dt, end_dt)
            )

        if not accommodation_entries:
            continue

        for conference in trip.conferences:
            if conference.get("online"):
                continue
            conference_coords = get_coords(conference)
            if conference_coords is None:
                continue

            start_dt = parse_datetime_value(conference.get("start"))
            # A conference without an "end" is treated as ending at its start.
            end_value = conference.get("end") or conference.get("start")
            end_dt = parse_datetime_value(end_value)
            if start_dt is None or end_dt is None:
                continue

            conference_country = (
                str(conference.get("country")).lower()
                if conference.get("country")
                else None
            )

            overlapping_distances = []
            for (
                accommodation_item,
                accommodation_coords,
                accommodation_start,
                accommodation_end,
            ) in accommodation_entries:
                accommodation_country = (
                    str(accommodation_item.get("country")).lower()
                    if accommodation_item.get("country")
                    else None
                )
                # Only a known country mismatch rules a stay out; a missing
                # country on either side does not.
                if (
                    conference_country
                    and accommodation_country
                    and accommodation_country != conference_country
                ):
                    continue
                if not ranges_overlap(
                    accommodation_start, accommodation_end, start_dt, end_dt
                ):
                    continue
                overlapping_distances.append(
                    distance_km(conference_coords, accommodation_coords)
                )

            if not overlapping_distances:
                continue

            nearest_km = min(overlapping_distances)
            # Carry a message so a failure identifies the offending conference.
            assert nearest_km < config.ACCOMODATION_MAX_DISTANCE_KM, (
                f"Nearest overlapping accommodation is {nearest_km:.1f} km from "
                f"conference {conference.get('name', 'No name')!r}"
            )

    coords, routes = agenda.trip.get_coordinates_and_routes(trip_list, data_dir)
    print(len(coords), "coords")
    print(len(routes), "routes")
def check_flights(airlines: set[str]) -> None:
    """Validate flight bookings and require ascending first-departure order.

    Every booking must have a "trip" key, each of its flights must have an
    "airline" value contained in *airlines*, and the booking must pass the
    currency check. A booking is pretty-printed once for each of its flights
    that lacks a "co2_kg" value. A summary line with the totals is printed at
    the end.

    Raises:
        AssertionError: on a missing trip, an unknown airline, or a booking
            whose first departure is not later than the previous booking's.
    """
    bookings = agenda.travel.parse_yaml("flights", data_dir)

    total_flights = 0
    total_with_co2 = 0
    last_first_depart = None

    for booking in bookings:
        if "trip" not in booking:
            pprint(booking)
        assert "trip" in booking

        flights = booking["flights"]
        assert all(flight["airline"] in airlines for flight in flights)

        total_flights += len(flights)
        for flight in flights:
            if "co2_kg" in flight:
                total_with_co2 += 1
            else:
                pprint(booking)

        check_currency(booking)

        first_depart = flights[0]["depart"]
        if last_first_depart:
            assert (
                first_depart > last_first_depart
            ), "Bookings are not in chronological order by first flight's departure."
        last_first_depart = first_depart

    print(
        f"{len(bookings)} flight bookings, {total_flights} flights, "
        f"{total_with_co2} with CO2 numbers"
    )
def normalize_datetime(dt_value: date | datetime) -> datetime:
"""Convert date or datetime to datetime for comparison, removing timezone info."""
if isinstance(dt_value, datetime):
return dt_value.replace(tzinfo=None)
if isinstance(dt_value, date):
return datetime.combine(dt_value, datetime.min.time())
raise TypeError(f"Unsupported datetime value type: {type(dt_value)}")
def check_trains() -> None:
    """Validate train journeys and require ascending departure order.

    At the first train that departs before its predecessor, both entries are
    printed and an AssertionError is raised. Otherwise the number of trains
    is printed.
    """
    trains = agenda.travel.parse_yaml("trains", data_dir)

    # (normalised departure, train) of the entry seen just before the current one.
    last_seen = None
    for train in trains:
        departs_at = normalize_datetime(train["depart"])
        if last_seen is not None:
            earlier_depart, earlier = last_seen
            if departs_at < earlier_depart:
                print("Out of order train found:")
                print(
                    f" Previous: {earlier.get('depart')} {earlier.get('from', '')} -> {earlier.get('to', '')}"
                )
                print(
                    f" Current: {train.get('depart')} {train.get('from', '')} -> {train.get('to', '')}"
                )
                assert False, "Trains are not in chronological order by departure time."
        last_seen = (departs_at, train)

    print(len(trains), "trains")
def check_conferences() -> None:
    """Validate conferences.yaml: construction, currency and start order.

    Each entry is passed as keyword arguments to
    ``agenda.conference.Conference``. A conference whose currency is set but
    not in the accepted set is printed and the process exits via
    ``sys.exit(-1)``. Entries must be sorted by their "start" value.

    Raises:
        AssertionError: if a conference starts before the one preceding it.
    """
    filepath = os.path.join(data_dir, "conferences.yaml")
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(filepath, "r") as conferences_file:
        conferences_data = yaml.safe_load(conferences_file)
    conferences = [agenda.conference.Conference(**conf) for conf in conferences_data]

    prev_start = None
    prev_conf_data = None
    # Walk the raw dict and the built object side by side: the object supplies
    # the currency, the raw dict supplies the start value and the name.
    for conf_data, conf in zip(conferences_data, conferences):
        if conf.currency and conf.currency not in currencies:
            pprint(conf)
            print(f"currency {conf.currency!r} not in {currencies!r}")
            sys.exit(-1)

        current_start = normalize_datetime(conf_data["start"])
        if prev_start and current_start < prev_start:
            assert prev_conf_data is not None
            print("Out of order conference found:")
            print(
                f" Previous: {prev_conf_data.get('start')} - {prev_conf_data.get('name', 'No name')}"
            )
            print(
                f" Current: {conf_data.get('start')} - {conf_data.get('name', 'No name')}"
            )
            assert False, "Conferences are not in chronological order by start time."
        prev_start = current_start
        prev_conf_data = conf_data

    print(len(conferences), "conferences")
def check_events() -> None:
    """Read events for a window around today and print how many were found.

    The window runs from 365 days before today to 730 days after it.
    """
    today = date.today()
    window_start = today - timedelta(days=365)
    window_end = today + timedelta(days=2 * 365)
    # NOTE(review): agenda.events_yaml is not imported explicitly in the
    # visible imports; presumably another agenda import loads it - confirm.
    events = agenda.events_yaml.read(data_dir, window_start, window_end)
    print(len(events), "events")
def check_coordinates(item: agenda.types.StrDict) -> None:
    """Check that the coordinates of *item*, if any, are valid.

    An item with neither "latitude" nor "longitude" is accepted. Otherwise
    both keys must be present and both values must be int or float.

    Raises:
        AssertionError: if only one key is present or a value is not numeric.
    """
    coordinate_keys = ("latitude", "longitude")
    present = [key for key in coordinate_keys if key in item]
    if not present:
        return
    assert len(present) == len(coordinate_keys)
    for key in coordinate_keys:
        assert isinstance(item[key], (int, float))
def check_accommodation() -> None:
    """Validate accommodation.yaml: required fields, coordinates, currency, order.

    Every stay must contain all required fields, have valid coordinates (when
    any are given) and pass the currency check. Stays must be sorted by their
    "from" value. The number of stays is printed at the end.

    Raises:
        AssertionError: if a stay lacks a required field, has invalid
            coordinates, or starts before the stay preceding it. The
            offending stay is pretty-printed first in the first two cases.
    """
    filepath = os.path.join(data_dir, "accommodation.yaml")
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(filepath) as accommodation_file:
        accommodation_list = yaml.safe_load(accommodation_file)

    required_fields = ["type", "name", "country", "location", "trip", "from", "to"]

    prev_from = None
    prev_stay = None
    for stay in accommodation_list:
        try:
            # Name the missing fields so the failure is actionable.
            missing = [field for field in required_fields if field not in stay]
            assert not missing, f"Missing required fields: {missing}"
            check_coordinates(stay)
        except AssertionError:
            pprint(stay)
            raise

        check_currency(stay)

        current_from = normalize_datetime(stay["from"])
        if prev_from and current_from < prev_from:
            assert prev_stay is not None
            print("Out of order accommodation found:")
            print(
                f" Previous: {prev_stay.get('from')} - {prev_stay.get('name', 'No name')} ({prev_stay.get('location', '')})"
            )
            print(
                f" Current: {stay.get('from')} - {stay.get('name', 'No name')} ({stay.get('location', '')})"
            )
            assert (
                False
            ), "Accommodation is not in chronological order by check-in time."
        prev_from = current_from
        prev_stay = stay

    print(len(accommodation_list), "stays")
def check_airports() -> None:
    """Validate that every airport has a country that agenda can resolve.

    Prints the number of airports, then asserts for each one that a "country"
    key exists and that ``agenda.get_country`` returns a truthy value for it.
    """
    parsed = agenda.travel.parse_yaml("airports", data_dir)
    airports = typing.cast(dict[str, agenda.types.StrDict], parsed)
    print(len(airports), "airports")

    for details in airports.values():
        assert "country" in details
        country = agenda.get_country(details["country"])
        assert country
def check_stations() -> None:
    """Validate that every station has a country that agenda can resolve.

    Prints the number of stations, then asserts for each one that a "country"
    key exists and that ``agenda.get_country`` returns a truthy value for it.
    """
    stations = agenda.travel.parse_yaml("stations", data_dir)
    print(len(stations), "stations")

    for entry in stations:
        assert "country" in entry
        country = agenda.get_country(entry["country"])
        assert country
def check_ferries() -> None:
    """Validate ferry crossings: ascending departure order and known currency.

    At the first ferry that departs before its predecessor, both entries are
    printed and an AssertionError is raised. Each ferry that passes the
    ordering check is then run through the currency check. The number of
    ferries is printed at the end.
    """
    ferries = agenda.travel.parse_yaml("ferries", data_dir)

    # (normalised departure, ferry) of the entry seen just before the current one.
    last_seen = None
    for ferry in ferries:
        departs_at = normalize_datetime(ferry["depart"])
        if last_seen is not None:
            earlier_depart, earlier = last_seen
            if departs_at < earlier_depart:
                print("Out of order ferry found:")
                print(
                    f" Previous: {earlier.get('depart')} {earlier.get('from', '')} -> {earlier.get('to', '')}"
                )
                print(
                    f" Current: {ferry.get('depart')} {ferry.get('from', '')} -> {ferry.get('to', '')}"
                )
                assert False, "Ferries are not in chronological order by departure time."
        last_seen = (departs_at, ferry)

        check_currency(ferry)

    print(len(ferries), "ferries")
def check_airlines() -> list[agenda.types.StrDict]:
    """Validate the airlines data and return it.

    Each airline must have exactly the keys "icao", "iata" and "name", plus
    optionally a boolean "flight_number_prefer_icao". The IATA code must be
    two characters: an upper-case letter followed by an upper-case letter or
    a digit. The ICAO code must be three characters and upper-case.

    Returns:
        The parsed list of airlines.

    Raises:
        AssertionError: if an airline fails validation; the offending entry
            is dumped as YAML first.
    """
    airlines = agenda.travel.parse_yaml("airlines", data_dir)
    print(len(airlines), "airlines")
    for airline in airlines:
        try:
            keys = set(airline.keys())
            keys.discard("flight_number_prefer_icao")
            assert keys == {"icao", "iata", "name"}
            iata, icao = airline["iata"], airline["icao"]
            # Check lengths before indexing: a short code then fails as an
            # AssertionError (and is dumped below) instead of an IndexError.
            assert len(iata) == 2 and len(icao) == 3
            # Parentheses matter: without them `and` binds tighter than `or`,
            # so any code whose second character is a digit would pass
            # regardless of its first character.
            assert iata[0].isupper() and (iata[1].isupper() or iata[1].isdigit())
            assert icao.isupper()
            if "flight_number_prefer_icao" in airline:
                assert isinstance(airline["flight_number_prefer_icao"], bool)
        except AssertionError:
            print(yaml.dump([airline]))
            raise
    return airlines
def check() -> None:
    """Run every validator over the personal data YAML files.

    Airlines are checked first because their IATA codes are needed to
    validate flights; the remaining validators take no arguments.
    """
    airlines = check_airlines()
    check_trips()
    iata_codes = {entry["iata"] for entry in airlines}
    check_flights(iata_codes)

    remaining_checks = (
        check_trains,
        check_ferries,
        check_conferences,
        check_events,
        check_accommodation,
        check_airports,
        check_stations,
    )
    for run_check in remaining_checks:
        run_check()


if __name__ == "__main__":
    check()