paddington-eurostar/scraper/gwr_fares.py
Edward Betts 13c4341f3a Add full type annotations and black formatting across all modules
Annotated all functions with mypy --strict-compatible types (-> None, dict[str,
Any], Generator types, etc.), added # type: ignore for untyped third-party libs
(lxml), and reformatted with black. All 18 source files now pass mypy --strict
with zero errors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 21:48:53 +01:00

336 lines
12 KiB
Python

"""
Fetch GWR single fares between any station and London Paddington, in either direction.
Uses the GWR journey search API (same API as www.gwr.com ticket search).
Returns the cheapest fare per departure (standard-class walk-on, or standard/first-class
advance), with restrictions already applied by GWR.
Cache results for 30 days — fares rarely change.
"""
from typing import Any, Generator
import httpx
# Endpoint that every journey-search request is POSTed to.
_API_URL = "https://api.gwr.com/api/shopping/journeysearch"
# API key is embedded in the GWR web app (appvalues.prod.json)
_API_KEY = "OgovGqAlLp4gWAhL7DQLo7pMCt8GHi2U4SPFiZgG"
_PAD_CODE = "GBQQP" # London Paddington cluster code as used by GWR website
# Ticket type codes this module classes as walk-on fares: fetch() keeps only
# these, while the standard-class advance lookups skip them.
_WALKON_CODES = {"SSS", "SVS", "SDS", "CDS"}
# Upper bound on the number of result pages requested for a single search.
_MAX_PAGES = 20
def _headers() -> dict[str, str]:
    """
    Build the HTTP headers sent with every journey-search request.
    The request presents itself as desktop Chrome calling from www.gwr.com and
    carries the module's API key in the ``apikey`` header.
    """
    browser_ua = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
    headers: dict[str, str] = {"user-agent": browser_ua}
    headers.update(
        {
            "accept": "application/json, text/plain, */*",
            "channel": "WEB",
            "content-type": "application/json",
            "apikey": _API_KEY,
        }
    )
    # Origin/referer mirror what the GWR website itself sends.
    headers["origin"] = "https://www.gwr.com"
    headers["referer"] = "https://www.gwr.com/"
    return headers
def _request_body(
from_code: str,
to_code: str,
travel_date: str,
conversation_token: str | None,
later: bool,
) -> dict[str, Any]:
return {
"IsNextOutward": False,
"IsPreviousOutward": False,
"IsNextReturn": False,
"IsPreviousReturn": False,
"campaignCode": "",
"validationCode": "",
"locfrom": from_code,
"locto": to_code,
"datetimedepart": f"{travel_date}T00:00:00",
"outwarddepartafter": True,
"datetimereturn": None,
"returndepartafter": False,
"directServicesOnly": False,
"firstclass": False,
"standardclass": True,
"adults": 1,
"children": 0,
"openreturn": False,
"via": None,
"avoid": None,
"isEarlierSearch": False,
"isLaterSearch": later,
"isEarlierSearchReturn": False,
"isLaterSearchReturn": False,
"railcards": [],
"conversationToken": conversation_token,
}
def _station_code(station_crs: str) -> str:
return f"GB{station_crs}"
def _od_codes(station_crs: str, direction: str) -> tuple[str, str]:
    """
    Return the (origin, destination) location codes for a search.
    ``direction == "from_paddington"`` puts Paddington first; any other value
    is treated as a journey towards Paddington.
    """
    station = _station_code(station_crs)
    leaving_paddington = direction == "from_paddington"
    return (_PAD_CODE, station) if leaving_paddington else (station, _PAD_CODE)
def _run_pages(
    station_crs: str,
    travel_date: str,
    first_class: bool = False,
    direction: str = "to_paddington",
) -> Generator[tuple[str, list[Any]], None, None]:
    """
    Iterate all pages of GWR journey search results.
    Yields (dep_time, fares_list) for each unique departure time seen, where
    dep_time is "HH:MM" and fares_list is that journey's fare details.
    first_class=True switches the request to first class fares.
    direction selects which way the journey runs relative to Paddington.
    Raises whatever _run_pages_batched raises (e.g. httpx.HTTPStatusError on a
    non-2xx response).
    """
    # The paging, de-duplication and stop conditions live in _run_pages_batched;
    # this generator only flattens its per-page batches, so the two entry points
    # cannot drift apart.
    for batch in _run_pages_batched(
        station_crs, travel_date, first_class=first_class, direction=direction
    ):
        yield from batch
def _run_pages_batched(
    station_crs: str,
    travel_date: str,
    first_class: bool = False,
    direction: str = "to_paddington",
) -> Generator[list[tuple[str, list[Any]]], None, None]:
    """
    Like _run_pages but yields one list of (dep_time, fares_list) per API page call,
    allowing callers to stream results a page at a time.
    dep_time is "HH:MM"; a departure time already yielded on an earlier page is
    skipped, and pages that contribute nothing new are not yielded.
    At most _MAX_PAGES requests are made; paging stops earlier once the response
    no longer sets "showLaterOutward".
    Raises httpx.HTTPStatusError if any page returns an error status.
    """
    seen: set[str] = set()
    with httpx.Client(headers=_headers(), timeout=30) as client:
        conversation_token: str | None = None
        later = False
        from_code, to_code = _od_codes(station_crs, direction)
        for _ in range(_MAX_PAGES):
            body = _request_body(
                from_code, to_code, travel_date, conversation_token, later
            )
            if first_class:
                body["firstclass"] = True
                body["standardclass"] = False
            resp = client.post(_API_URL, json=body)
            resp.raise_for_status()
            data = resp.json().get("data") or {}
            # The token from each response is sent back with the next request.
            conversation_token = data.get("conversationToken")
            batch: list[tuple[str, list[Any]]] = []
            # `or` rather than a .get() default: a JSON null would otherwise
            # come back as None and break iteration / slicing below.
            for journey in data.get("outwardOpenPureReturnFare") or []:
                dep_iso = journey.get("departureTime") or ""
                dep_time = dep_iso[11:16]  # "HH:MM" from "2026-04-10T09:08:00"
                # NOTE(review): only the time of day is compared, so a later
                # page that ran past midnight would be merged into the same
                # day — confirm the API never does that.
                if not dep_time or dep_time in seen:
                    continue
                seen.add(dep_time)
                batch.append((dep_time, journey.get("journeyFareDetails") or []))
            if batch:
                yield batch
            if not data.get("showLaterOutward", False):
                break
            later = True
def fetch(
    station_crs: str, travel_date: str, direction: str = "to_paddington"
) -> dict[str, dict[str, Any]]:
    """
    Fetch GWR walk-on single fares for the selected Paddington direction.
    Returns {departure_time: {'ticket': name, 'price': float, 'code': code}}
    where price is in £ and only the cheapest available standard-class walk-on
    ticket per departure (with restrictions already applied by GWR) is kept.
    Departures with no qualifying fare are omitted. When several fares share
    the lowest price, the first one listed wins.
    """
    result: dict[str, dict[str, Any]] = {}
    for dep_time, fares in _run_pages(station_crs, travel_date, direction=direction):
        walk_on = [
            fare
            for fare in fares or []
            if fare.get("ticketTypeCode") in _WALKON_CODES
            and fare.get("isStandardClass")
            # A fare without a price cannot be compared, and must not be
            # reported as a £0.00 ticket.
            and fare.get("fare") is not None
        ]
        if not walk_on:
            continue
        best = min(walk_on, key=lambda fare: fare["fare"])
        result[dep_time] = {
            "ticket": best.get("ticketType", ""),
            "price": best["fare"] / 100,  # API price is in pence
            "code": best["ticketTypeCode"],
        }
    return result
def fetch_advance(
    station_crs: str, travel_date: str, direction: str = "to_paddington"
) -> dict[str, dict[str, Any]]:
    """
    Fetch advance fares: cheapest standard advance and first-class advance per departure.
    Makes two sets of paginated API calls (standard class, then first class).
    Returns {departure_time: {'advance_std': dict or None, 'advance_1st': dict or None}}
    where each sub-dict has keys 'ticket', 'price', 'code'.
    A departure appears if either pass found a fare for it; the side with no
    fare is None.
    """
    # fetch_advance_streaming already performs both passes and the per-departure
    # selection; collecting its pages here keeps a single copy of that logic.
    merged: dict[str, dict[str, Any]] = {}
    for page in fetch_advance_streaming(
        station_crs, travel_date, direction=direction
    ):
        for dep_time, partial in page.items():
            entry = merged.setdefault(
                dep_time, {"advance_std": None, "advance_1st": None}
            )
            # Each streamed entry fills only one side; never overwrite a fare
            # found in the other pass with that entry's None placeholder.
            for key, fare in partial.items():
                if fare is not None:
                    entry[key] = fare
    return merged
def _cheapest_advance_fare(
    fares: list[Any] | None, *, standard_only: bool
) -> dict[str, Any] | None:
    """
    Pick the lowest-priced fare from one departure's fare list.
    With standard_only=True, walk-on ticket codes and fares not flagged
    'isStandardClass' are excluded; otherwise every priced fare is a candidate.
    Returns {'ticket', 'price' (in £), 'code'} or None if nothing qualifies.
    On a price tie the first fare listed wins.
    """
    candidates = [
        fare
        for fare in fares or []
        # Unpriced fares cannot be compared and must not surface as £0.00.
        if fare.get("fare") is not None
        and not (
            standard_only
            and (
                fare.get("ticketTypeCode") in _WALKON_CODES
                or not fare.get("isStandardClass")
            )
        )
    ]
    if not candidates:
        return None
    best = min(candidates, key=lambda fare: fare["fare"])
    return {
        "ticket": best.get("ticketType", ""),
        "price": best["fare"] / 100,  # API price is in pence
        "code": best.get("ticketTypeCode"),
    }


def fetch_advance_streaming(
    station_crs: str, travel_date: str, direction: str = "to_paddington"
) -> Generator[dict[str, dict[str, Any]], None, None]:
    """
    Generator yielding partial advance fare dicts one GWR API page at a time.
    Each yield is {dep_time: {'advance_std': dict|None, 'advance_1st': dict|None}}.
    Two passes are made (standard class then first class); each page of results is
    yielded immediately so callers can stream prices to clients as they arrive.
    Entries from the first pass carry only 'advance_std' and entries from the
    second only 'advance_1st'; the other key is None, so callers must merge
    entries for the same departure themselves.
    """
    # (result key, first-class search?) for each pass, in request order.
    # NOTE(review): the first-class pass takes the cheapest of *any* fare the
    # first-class search returns — no ticket-type filter is applied there.
    passes = (("advance_std", False), ("advance_1st", True))
    for result_key, first_class in passes:
        for batch in _run_pages_batched(
            station_crs, travel_date, first_class=first_class, direction=direction
        ):
            page: dict[str, dict[str, Any]] = {}
            for dep_time, fares in batch:
                best = _cheapest_advance_fare(fares, standard_only=not first_class)
                if best is None:
                    continue
                entry: dict[str, Any] = {"advance_std": None, "advance_1st": None}
                entry[result_key] = best
                page[dep_time] = entry
            if page:
                yield page