paddington-eurostar/scraper/eurostar.py
Edward Betts 89a536dfd3 Add Eurostar Plus prices and NR advance fare support
- Eurostar scraper now fetches both Standard and Plus (PLUS class code)
  prices/seats in a single API call; each service dict gains plus_price
  and plus_seats fields
- GWR fares scraper gains fetch_advance() which makes two sets of
  paginated calls (standard advance + first-class advance) and returns
  cheapest per departure; shared _run_pages() generator reduces
  duplication in fetch()
- New /api/advance_fares/<station_crs>/<travel_date> endpoint returns
  advance fares as JSON, cached for 24 hours
- Results page gains NR ticket selector (Walk-on / Std Advance / 1st
  Advance) and Eurostar selector (Standard / Plus); total column is
  JS-computed from the selected combination with cheapest/priciest
  highlighting
- Load advance prices button fetches the API lazily; if advance fares
  are already cached they are embedded in the page and applied on load
  so the button is hidden automatically

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-11 16:22:24 +01:00

168 lines
6.1 KiB
Python

"""
Fetch Eurostar timetable, prices, and seat availability via the GraphQL API.
A single POST to https://site-api.eurostar.com/gateway (operationName
NewBookingSearch) returns departure time, arrival time, train number,
Eurostar Standard and Eurostar Plus fare prices, and seats remaining at
each of those prices for every service on the requested date.
"""
import random
import string
import requests
# Browser-style User-Agent value; fetch() sends it with the gateway request.
DEFAULT_UA = " ".join((
    "Mozilla/5.0 (X11; Linux x86_64)",
    "AppleWebKit/537.36",
    "(KHTML, like Gecko)",
    "Chrome/122.0.0.0",
    "Safari/537.36",
))

# Station id used as the origin of every search and search URL.
ORIGIN_STATION_ID = '7015400'

# Supported destinations: display name -> station id.
DESTINATION_STATION_IDS = {
    'Paris Gare du Nord': '8727100',
    'Brussels Midi':      '8814001',
    'Lille Europe':       '8722326',
    'Amsterdam Centraal': '8400058',
    'Rotterdam Centraal': '8400530',
    'Cologne Hbf':        '8015458',
}

# Endpoint that fetch() POSTs the GraphQL search to.
_GATEWAY_URL = 'https://site-api.eurostar.com/gateway'
# GraphQL document for the NewBookingSearch operation.  For every outbound
# journey it requests the timing (departs/arrives, aliased to
# departureTime/arrivalTime) and, per fare, the class-of-service code, the
# display price, the seats value and each leg's service name and type code.
# Which fare classes come back is controlled by the
# $filteredClassesOfService variable supplied with the request.
# Variable names and argument names match the site's own query so the
# server-side query planner sees a familiar shape.
# The string is sent verbatim, so its pieces must not be reformatted.
_GQL_QUERY = (
"query NewBookingSearch("
"$origin:String!,$destination:String!,$outbound:String!,"
"$currency:Currency!,$adult:Int,"
"$filteredClassesOfService:[ClassOfServiceEnum]"
"){"
"journeySearch("
"outboundDate:$outbound origin:$origin destination:$destination"
" adults:$adult currency:$currency"
" productFamilies:[\"PUB\"] contractCode:\"EIL_ALL\""
" adults16Plus:0 children:0 youths:0 children4Only:0 children5To11:0"
" infants:0 adultsWheelchair:0 childrenWheelchair:0 guideDogs:0"
" wheelchairCompanions:0 nonWheelchairCompanions:0"
" isAftersales:false multipleFlexibility:true showAllSummatedFares:false"
" seniorsAges:[] prioritiseShortHaulODTrains:true"
"){"
"outbound{"
"journeys("
"hideIndirectTrainsWhenDisruptedAndCancelled:false"
" hideDepartedTrains:true"
" hideExternalCarrierTrains:true"
" hideDirectExternalCarrierTrains:true"
"){"
"timing{departureTime:departs arrivalTime:arrives}"
"fares(filteredClassesOfService:$filteredClassesOfService){"
"classOfService{code}"
"prices{displayPrice}"
"seats "
"legs{serviceName serviceType{code}}"
"}"
"}"
"}"
"}"
"}"
)
# Class-of-service codes: sent as the filteredClassesOfService variable and
# matched against each fare's classOfService.code when parsing the response.
_STANDARD = 'STANDARD'
_STANDARD_PLUS = 'PLUS'
def search_url(destination: str, travel_date: str) -> str:
    """
    Build the eurostar.com search-page URL for one adult to destination.

    destination must be a key of DESTINATION_STATION_IDS (KeyError
    otherwise); travel_date is inserted as the ``outbound`` parameter
    exactly as given.
    """
    station_id = DESTINATION_STATION_IDS[destination]
    query = '&'.join((
        'adult=1',
        f'origin={ORIGIN_STATION_ID}',
        f'destination={station_id}',
        f'outbound={travel_date}',
    ))
    return 'https://www.eurostar.com/search/uk-en?' + query
def _generate_cid() -> str:
chars = string.ascii_letters + string.digits
return 'SRCH-' + ''.join(random.choices(chars, k=22))
def _parse_graphql(data: dict, destination: str) -> list[dict]:
"""
Parse a NewBookingSearch GraphQL response into a list of service dicts.
Each dict contains: depart_st_pancras, arrive_destination, destination,
train_number, price/seats (Standard), plus_price/plus_seats (Standard Premier).
The same St Pancras departure can appear multiple times (different
connecting trains); we keep the entry with the earliest arrival.
Multi-leg train numbers are joined with ' + ' (e.g. 'ES 9116 + ER 9329').
"""
best: dict[str, dict] = {}
journeys = data['data']['journeySearch']['outbound']['journeys']
for journey in journeys:
dep = journey['timing']['departureTime']
arr = journey['timing']['arrivalTime']
std_price = std_seats = plus_price = plus_seats = None
train_number = ''
for fare in (journey.get('fares') or []):
cos = fare['classOfService']['code']
p = fare.get('prices')
price = float(p['displayPrice']) if p and p.get('displayPrice') else None
seats = fare.get('seats')
if not train_number:
legs = fare.get('legs') or []
train_number = ' + '.join(
f"{(leg.get('serviceType') or {}).get('code', 'ES')} {leg['serviceName']}"
for leg in legs if leg.get('serviceName')
)
if cos == _STANDARD:
std_price, std_seats = price, seats
elif cos == _STANDARD_PLUS:
plus_price, plus_seats = price, seats
if dep not in best or arr < best[dep]['arrive_destination']:
best[dep] = {
'depart_st_pancras': dep,
'arrive_destination': arr,
'destination': destination,
'train_number': train_number,
'price': std_price,
'seats': std_seats,
'plus_price': plus_price,
'plus_seats': plus_seats,
}
return sorted(best.values(), key=lambda s: s['depart_st_pancras'])
def fetch(destination: str, travel_date: str) -> list[dict]:
    """
    Look up every Eurostar service to destination on travel_date.

    Sends a single NewBookingSearch POST to the gateway, asking for the
    Standard and Plus fare classes in GBP for one adult, and returns the
    service dicts that _parse_graphql builds from the JSON reply.

    Raises KeyError when destination is not in DESTINATION_STATION_IDS and
    whatever raise_for_status() raises for an unsuccessful HTTP reply.
    """
    search_variables = {
        'origin': ORIGIN_STATION_ID,
        'destination': DESTINATION_STATION_IDS[destination],
        'outbound': travel_date,
        'currency': 'GBP',
        'adult': 1,
        'filteredClassesOfService': [_STANDARD, _STANDARD_PLUS],
    }
    request_headers = {
        'User-Agent': DEFAULT_UA,
        'Content-Type': 'application/json',
        'Accept': '*/*',
        'Accept-Language': 'en-GB',
        'Referer': 'https://www.eurostar.com/',
        'x-platform': 'web',
        'x-market-code': 'uk',
        'x-source-url': 'search-app/',
        # Fresh random id for each request.
        'cid': _generate_cid(),
    }
    body = {
        'operationName': 'NewBookingSearch',
        'variables': search_variables,
        'query': _GQL_QUERY,
    }
    response = requests.post(
        _GATEWAY_URL,
        json=body,
        headers=request_headers,
        timeout=20,
    )
    response.raise_for_status()
    return _parse_graphql(response.json(), destination)