paddington-eurostar/scraper/eurostar.py
Edward Betts 13c4341f3a Add full type annotations and black formatting across all modules
Annotated all functions with mypy --strict-compatible types (-> None, dict[str,
Any], Generator types, etc.), added # type: ignore for untyped third-party libs
(lxml), and reformatted with black. All 18 source files now pass mypy --strict
with zero errors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 21:48:53 +01:00

272 lines
9 KiB
Python

"""
Fetch Eurostar timetable, prices, and seat availability via the GraphQL API.
A single POST to https://site-api.eurostar.com/gateway (operationName
NewBookingSearch) returns departure time, arrival time, train number,
Eurostar Standard fare price, and seats remaining at that price for every
service on the requested date.
"""
import random
import string
from typing import Any
import requests
# User-Agent header value sent with every gateway request.
DEFAULT_UA: str = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)

# Station identifiers as used for the origin/destination values of both the
# public search URL and the gateway request variables.
ST_PANCRAS_STATION_ID: str = "7015400"
ORIGIN_STATION_ID: str = ST_PANCRAS_STATION_ID

# Supported destinations, keyed by display name.
DESTINATION_STATION_IDS: dict[str, str] = {
    "Paris Gare du Nord": "8727100",
    "Brussels Midi": "8814001",
    "Lille Europe": "8722326",
    "Amsterdam Centraal": "8400058",
    "Rotterdam Centraal": "8400530",
    "Cologne Hbf": "8015458",
}

# Endpoint that receives the NewBookingSearch POST.
_GATEWAY_URL: str = "https://site-api.eurostar.com/gateway"
# Query requesting timing, train identity, and fare price + seats for the
# classes of service passed in $filteredClassesOfService.
# Variable names and argument names match the site's own query so the
# server-side query planner sees a familiar shape.
#
# The query is built from adjacent string literals that Python concatenates
# with no separator, so the leading/trailing spaces inside the literals are
# the only whitespace between GraphQL arguments and fields -- keep them.
_GQL_QUERY = (
    # Operation signature: the variables supplied by the request payload.
    "query NewBookingSearch("
    "$origin:String!,$destination:String!,$outbound:String!,"
    "$inbound:String,$currency:Currency!,$adult:Int,"
    "$filteredClassesOfService:[ClassOfServiceEnum]"
    "){"
    # journeySearch arguments: the variables above plus fixed literal values.
    "journeySearch("
    "outboundDate:$outbound inboundDate:$inbound origin:$origin destination:$destination"
    " adults:$adult currency:$currency"
    ' productFamilies:["PUB"] contractCode:"EIL_ALL"'
    " adults16Plus:0 children:0 youths:0 children4Only:0 children5To11:0"
    " infants:0 adultsWheelchair:0 childrenWheelchair:0 guideDogs:0"
    " wheelchairCompanions:0 nonWheelchairCompanions:0"
    " isAftersales:false multipleFlexibility:true showAllSummatedFares:false"
    " seniorsAges:[] prioritiseShortHaulODTrains:true"
    "){"
    # Outbound leg selection.
    "outbound{"
    "journeys("
    "hideIndirectTrainsWhenDisruptedAndCancelled:false"
    " hideDepartedTrains:true"
    " hideExternalCarrierTrains:true"
    " hideDirectExternalCarrierTrains:true"
    "){"
    # Aliases rename departs/arrives to departureTime/arrivalTime in the
    # response, which is what the parser reads.
    "timing{departureTime:departs arrivalTime:arrives}"
    "fares(filteredClassesOfService:$filteredClassesOfService){"
    "classOfService{code}"
    "prices{displayPrice}"
    "seats "
    "legs{serviceName serviceType{code}}"
    "}"
    "}"
    "}"
    # Inbound leg selection: same arguments and fields as the outbound leg.
    "inbound{"
    "journeys("
    "hideIndirectTrainsWhenDisruptedAndCancelled:false"
    " hideDepartedTrains:true"
    " hideExternalCarrierTrains:true"
    " hideDirectExternalCarrierTrains:true"
    "){"
    "timing{departureTime:departs arrivalTime:arrives}"
    "fares(filteredClassesOfService:$filteredClassesOfService){"
    "classOfService{code}"
    "prices{displayPrice}"
    "seats "
    "legs{serviceName serviceType{code}}"
    "}"
    "}"
    "}"
    "}"
    "}"
)
# Class-of-service codes: sent as the filteredClassesOfService variable and
# matched against each fare's classOfService.code when parsing the response.
_STANDARD = "STANDARD"
_STANDARD_PLUS = "PLUS"
def search_url(
    destination: str,
    travel_date: str,
    direction: str = "outbound",
    return_date: str | None = None,
) -> str:
    """
    Build the eurostar.com search-page URL for one adult.

    destination is a key of DESTINATION_STATION_IDS (KeyError otherwise).
    For direction "inbound" the journey runs from the destination to
    St Pancras on travel_date and return_date is ignored; for any other
    direction it runs from St Pancras to the destination, and a truthy
    return_date is appended as the inbound date.
    """
    station_id = DESTINATION_STATION_IDS[destination]
    if direction == "inbound":
        # Reversed one-way search: no return leg is ever added.
        start, end, back = station_id, ST_PANCRAS_STATION_ID, None
    else:
        start, end, back = ST_PANCRAS_STATION_ID, station_id, return_date

    url = (
        f"https://www.eurostar.com/search/uk-en"
        f"?adult=1&origin={start}&destination={end}&outbound={travel_date}"
    )
    if back:
        url += f"&inbound={back}"
    return url
def _generate_cid() -> str:
chars = string.ascii_letters + string.digits
return "SRCH-" + "".join(random.choices(chars, k=22))
def _parse_journeys(
    journeys: list[dict[str, Any]], destination: str, direction: str
) -> list[dict[str, Any]]:
    """
    Turn the journeys list of one response leg into a list of service dicts.

    Each dict holds, in this order: the departure time, the arrival time,
    destination, train_number, price/seats (fare whose class-of-service code
    is _STANDARD) and plus_price/plus_seats (code _STANDARD_PLUS). The time
    keys are depart_destination / arrive_st_pancras when direction is
    "inbound", and depart_st_pancras / arrive_destination otherwise.

    The same departure time can appear multiple times (different connecting
    trains); only the entry with the earliest arrival is kept, the first one
    seen winning ties. The result is sorted by departure time.
    Multi-leg train numbers are joined with ' + ' (e.g. 'ES 9116 + ER 9329').
    """
    # Only the two time keys depend on the direction of travel.
    if direction == "inbound":
        depart_key, arrive_key = "depart_destination", "arrive_st_pancras"
    else:
        depart_key, arrive_key = "depart_st_pancras", "arrive_destination"

    by_departure: dict[str, dict[str, Any]] = {}
    for journey in journeys:
        timing = journey["timing"]
        departs = timing["departureTime"]
        arrives = timing["arrivalTime"]

        fares_by_class: dict[str, tuple[Any, Any]] = {}
        train_number = ""
        for fare in journey.get("fares") or []:
            code = fare["classOfService"]["code"]
            prices = fare.get("prices")
            if prices and prices.get("displayPrice"):
                amount = float(prices["displayPrice"])
            else:
                amount = None
            remaining = fare.get("seats")

            # The train number comes from the first fare that yields one.
            if not train_number:
                labels = []
                for leg in fare.get("legs") or []:
                    if leg.get("serviceName"):
                        carrier = (leg.get("serviceType") or {}).get("code", "ES")
                        labels.append(f"{carrier} {leg['serviceName']}")
                train_number = " + ".join(labels)

            # A later fare with the same code overrides an earlier one.
            if code == _STANDARD or code == _STANDARD_PLUS:
                fares_by_class[code] = (amount, remaining)

        std_price, std_seats = fares_by_class.get(_STANDARD, (None, None))
        plus_price, plus_seats = fares_by_class.get(_STANDARD_PLUS, (None, None))
        service = {
            depart_key: departs,
            arrive_key: arrives,
            "destination": destination,
            "train_number": train_number,
            "price": std_price,
            "seats": std_seats,
            "plus_price": plus_price,
            "plus_seats": plus_seats,
        }

        current = by_departure.get(departs)
        if current is None or arrives < current[arrive_key]:
            by_departure[departs] = service

    services = list(by_departure.values())
    services.sort(key=lambda entry: entry[depart_key])
    return services
def _parse_graphql(data: dict[str, Any], destination: str) -> list[dict[str, Any]]:
    """Parse the outbound leg of a gateway response as outbound services."""
    outbound_leg = data["data"]["journeySearch"]["outbound"]
    return _parse_journeys(outbound_leg["journeys"], destination, "outbound")
def _parse_graphql_leg(
    data: dict[str, Any], destination: str, leg: str, direction: str
) -> list[dict[str, Any]]:
    """
    Parse one leg of a gateway response.

    leg names the key under data.journeySearch whose journeys are read;
    direction is passed to _parse_journeys and decides the key names used
    in the returned service dicts.
    """
    search = data["data"]["journeySearch"]
    leg_journeys = search[leg]["journeys"]
    return _parse_journeys(leg_journeys, destination, direction)
def _payload(
    origin: str, destination_id: str, outbound: str, inbound: str | None = None
) -> dict[str, Any]:
    """
    Build the JSON body for a NewBookingSearch request.

    origin and destination_id are station identifiers, outbound is the travel
    date, and inbound is the optional return date (None is sent as-is for a
    one-way search). The search is for one adult, priced in GBP, for the
    _STANDARD and _STANDARD_PLUS classes of service.
    """
    return {
        "operationName": "NewBookingSearch",
        "variables": {
            "origin": origin,
            "destination": destination_id,
            "outbound": outbound,
            "inbound": inbound,
            "currency": "GBP",
            "adult": 1,
            "filteredClassesOfService": [_STANDARD, _STANDARD_PLUS],
        },
        "query": _GQL_QUERY,
    }
def _headers() -> dict[str, str]:
    """Return the HTTP headers for a gateway request, with a new cid on every call."""
    headers: dict[str, str] = {
        "User-Agent": DEFAULT_UA,
        "Content-Type": "application/json",
        "Accept": "*/*",
        "Accept-Language": "en-GB",
        "Referer": "https://www.eurostar.com/",
        "x-platform": "web",
        "x-market-code": "uk",
        "x-source-url": "search-app/",
    }
    # Generated per call so each request carries its own identifier.
    headers["cid"] = _generate_cid()
    return headers
def fetch(
    destination: str, travel_date: str, direction: str = "outbound"
) -> list[dict[str, Any]]:
    """
    Return the Eurostar services to or from destination on travel_date.

    A single one-way GraphQL search is made. With direction "inbound" the
    search runs from the destination to St Pancras and each dict carries
    depart_destination / arrive_st_pancras; with any other direction it runs
    from St Pancras and each dict carries depart_st_pancras /
    arrive_destination. Every dict also has destination, train_number,
    price, seats, plus_price and plus_seats.

    Raises KeyError for a destination missing from DESTINATION_STATION_IDS,
    and whatever requests raises for a failed or non-2xx request.
    """
    remote_id = DESTINATION_STATION_IDS[destination]
    heading_home = direction == "inbound"
    if heading_home:
        endpoints = (remote_id, ST_PANCRAS_STATION_ID)
    else:
        endpoints = (ST_PANCRAS_STATION_ID, remote_id)

    body = _payload(*endpoints, travel_date)
    response = requests.post(
        _GATEWAY_URL, json=body, headers=_headers(), timeout=20
    )
    response.raise_for_status()

    # The request is one-way, so the response's "outbound" leg is read
    # whichever way we are travelling; only the labelling of the result
    # depends on direction.
    label = "inbound" if heading_home else "outbound"
    return _parse_graphql_leg(response.json(), destination, "outbound", label)
def fetch_return(
    destination: str, outbound_date: str, return_date: str
) -> dict[str, list[dict[str, Any]]]:
    """
    Return the services for both legs of a return trip from St Pancras.

    One GraphQL search is made with outbound_date and return_date. The result
    maps "outbound" to the services parsed from the response's outbound leg
    and "inbound" to those parsed from its inbound leg.

    Raises KeyError for a destination missing from DESTINATION_STATION_IDS,
    and whatever requests raises for a failed or non-2xx request.
    """
    remote_id = DESTINATION_STATION_IDS[destination]
    body = _payload(ST_PANCRAS_STATION_ID, remote_id, outbound_date, return_date)
    response = requests.post(
        _GATEWAY_URL, json=body, headers=_headers(), timeout=20
    )
    response.raise_for_status()
    parsed = response.json()

    legs: dict[str, list[dict[str, Any]]] = {}
    for name in ("outbound", "inbound"):
        legs[name] = _parse_graphql_leg(parsed, destination, name, name)
    return legs