Consolidate to single GraphQL call; show indirect trains; fix price formatting

Replace two-step Eurostar fetch (HTML timetable + GraphQL prices) with a
single GraphQL call that returns timing, train numbers, prices, and seats.
Support indirect services (e.g. Amsterdam) by joining multi-leg train numbers
with ' + ' and keeping the earliest arrival per departure time.
Fix half-pound prices by casting displayPrice to float instead of int.
Wrap each train number segment in white-space:nowrap so 'ES 9132 + ER 9363'
never breaks mid-segment.
Format Eurostar prices with two decimal places.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-04-04 14:46:22 +01:00
parent 05eec29b7d
commit c22a3ea0fc
5 changed files with 182 additions and 320 deletions

View file

@ -1,29 +1,14 @@
"""
Scrape Eurostar timetable via httpx and fetch prices via the GraphQL API.
Fetch Eurostar timetable, prices, and seat availability via the GraphQL API.
Timetable: route-specific pages are Next.js SSR -- all departure data is
embedded in <script id="__NEXT_DATA__"> as JSON, so no browser / JS needed.
URL pattern:
https://www.eurostar.com/uk-en/travel-info/timetable/
{origin_id}/{dest_id}/{origin_slug}/{dest_slug}?date=YYYY-MM-DD
Data path: props.pageProps.pageData.liveDepartures[]
.origin.model.scheduledDepartureDateTime -> London departure
.destination.model.scheduledArrivalDateTime -> destination arrival
(already filtered to the requested stop, not the final stop)
Prices: POST https://site-api.eurostar.com/gateway (GraphQL, operationName
NewBookingSearch). The `journeys[].fares[]` array contains one entry per
class of service; we extract the Eurostar Standard (classOfService.code ==
"STANDARD") displayPrice for 1 adult, in GBP.
A single POST to https://site-api.eurostar.com/gateway (operationName
NewBookingSearch) returns departure time, arrival time, train number,
Eurostar Standard fare price, and seats remaining at that price for every
service on the requested date.
"""
import json
import random
import re
import string
import httpx
import requests
DEFAULT_UA = (
@ -32,93 +17,21 @@ DEFAULT_UA = (
)
# Station identifier used as the `origin` of every search (London St Pancras,
# per ORIGIN_STATION_SLUG below).
ORIGIN_STATION_ID = '7015400'
# URL slug of the origin station, used when building timetable page URLs.
ORIGIN_STATION_SLUG = 'london-st-pancras-intl'
# Common prefix of the route-specific timetable pages.
TIMETABLE_BASE_URL = 'https://www.eurostar.com/uk-en/travel-info/timetable'

# Supported destinations: display name -> station identifier.
# Each name is listed exactly once; a repeated key in a dict literal is
# silently overwritten by the later entry, which hides typos.
DESTINATION_STATION_IDS = {
    'Paris Gare du Nord': '8727100',
    'Brussels Midi': '8814001',
    'Lille Europe': '8722326',
    'Amsterdam Centraal': '8400058',
    'Rotterdam Centraal': '8400530',
}
def _slugify_station_name(name: str) -> str:
    """Turn a station display name into a lowercase, hyphen-separated slug.

    After lowercasing, every run of characters outside ``a-z`` / ``0-9``
    collapses into a single hyphen; leading and trailing hyphens are removed.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
def search_url(destination: str, travel_date: str) -> str:
    """Build the eurostar.com booking-search URL for one adult.

    ``destination`` must be a key of ``DESTINATION_STATION_IDS`` (a
    ``KeyError`` is raised otherwise).  ``travel_date`` is inserted verbatim
    as the ``outbound`` query parameter.
    """
    station_id = DESTINATION_STATION_IDS[destination]
    base = 'https://www.eurostar.com/search/uk-en'
    query = f'?adult=1&origin={ORIGIN_STATION_ID}&destination={station_id}&outbound={travel_date}'
    return base + query
def timetable_url(destination: str) -> str:
    """Build the route-specific timetable page URL for ``destination``.

    The path is ``<base>/<origin id>/<destination id>/<origin slug>/<destination slug>``.
    ``destination`` must be a key of ``DESTINATION_STATION_IDS`` (a
    ``KeyError`` is raised otherwise).
    """
    station_id = DESTINATION_STATION_IDS[destination]
    station_slug = _slugify_station_name(destination)
    segments = (
        TIMETABLE_BASE_URL,
        ORIGIN_STATION_ID,
        station_id,
        ORIGIN_STATION_SLUG,
        station_slug,
    )
    return '/'.join(segments)
def _hhmm(dt_str: str | None) -> str | None:
"""'2026-03-30 09:34:00''09:34'"""
if not dt_str:
return None
m = re.search(r'(\d{2}):(\d{2}):\d{2}$', dt_str)
return f"{m.group(1)}:{m.group(2)}" if m else None
def _parse(html: str, destination: str) -> list[dict]:
    """Extract departures from a timetable page's embedded ``__NEXT_DATA__`` JSON.

    Returns one dict per departure with the keys ``depart_st_pancras``,
    ``arrive_destination``, ``destination`` and ``train_number``, sorted by
    departure time.  Entries lacking a parseable departure or arrival time
    are dropped.  An empty list is returned when the page has no
    ``__NEXT_DATA__`` script tag.
    """
    script = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if script is None:
        return []

    page = json.loads(script.group(1))
    rows = []
    for entry in page['props']['pageProps']['pageData']['liveDepartures']:
        departs = _hhmm(entry['origin']['model']['scheduledDepartureDateTime'])
        arrives = _hhmm(entry['destination']['model']['scheduledArrivalDateTime'])
        if not (departs and arrives):
            continue
        model = entry.get('model', {})
        carrier = model.get('carrier', 'ES')
        number = model.get('trainNumber', '')
        # An entry without a train number gets an empty label rather than a
        # bare carrier code.
        label = f"{carrier} {number}" if number else ''
        rows.append({
            'depart_st_pancras': departs,
            'arrive_destination': arrives,
            'destination': destination,
            'train_number': label,
        })

    rows.sort(key=lambda row: row['depart_st_pancras'])
    return rows
def fetch(destination: str, travel_date: str,
          user_agent: str = DEFAULT_UA) -> list[dict]:
    """Download the timetable page for ``destination`` and parse its departures.

    ``travel_date`` is sent as the ``date`` query parameter and ``user_agent``
    as the ``User-Agent`` header.  Redirects are followed and the request
    times out after 20 seconds.  A non-success HTTP status raises via
    ``raise_for_status()``.  Returns the list produced by ``_parse``.
    """
    page_url = timetable_url(destination)
    request_headers = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-GB,en;q=0.9',
    }
    query = {'date': travel_date}
    with httpx.Client(headers=request_headers, follow_redirects=True, timeout=20) as session:
        response = session.get(page_url, params=query)
        response.raise_for_status()
        page_html = response.text
    return _parse(page_html, destination)
# ---------------------------------------------------------------------------
# Price fetching via site-api.eurostar.com GraphQL
# ---------------------------------------------------------------------------
_GATEWAY_URL = 'https://site-api.eurostar.com/gateway'
# Minimal query requesting only timing + Eurostar Standard fare price.
# Variable names and inline argument names match what the site sends so the
# Query requesting timing, train identity, and Standard fare price + seats.
# Variable names and argument names match the site's own query so the
# server-side query planner sees a familiar shape.
_GQL_PRICES = (
_GQL_QUERY = (
"query NewBookingSearch("
"$origin:String!,$destination:String!,$outbound:String!,"
"$currency:Currency!,$adult:Int,"
@ -141,74 +54,103 @@ _GQL_PRICES = (
" hideExternalCarrierTrains:true"
" hideDirectExternalCarrierTrains:true"
"){"
"timing{departureTime:departs __typename}"
"timing{departureTime:departs arrivalTime:arrives}"
"fares(filteredClassesOfService:$filteredClassesOfService){"
"classOfService{code __typename}"
"prices{displayPrice __typename}"
"seats __typename"
"classOfService{code}"
"prices{displayPrice}"
"seats "
"legs{serviceName serviceType{code}}"
"}"
"__typename"
"}"
"__typename"
"}"
"__typename"
"}"
"}"
)
def search_url(destination: str, travel_date: str) -> str:
    """Return the eurostar.com search-page URL for one adult on ``travel_date``.

    ``destination`` is looked up in ``DESTINATION_STATION_IDS``; an unknown
    name raises ``KeyError``.  ``travel_date`` is placed in the ``outbound``
    query parameter exactly as given.
    """
    target_id = DESTINATION_STATION_IDS[destination]
    parts = [
        'https://www.eurostar.com/search/uk-en',
        f'?adult=1&origin={ORIGIN_STATION_ID}&destination={target_id}&outbound={travel_date}',
    ]
    return ''.join(parts)
def _generate_cid() -> str:
    """Return a fresh id: ``SRCH-`` followed by 22 random ASCII letters/digits."""
    alphabet = string.ascii_letters + string.digits
    suffix = random.choices(alphabet, k=22)
    return ''.join(['SRCH-', *suffix])
def fetch_prices(destination: str, travel_date: str) -> dict[str, dict]:
def _parse_graphql(data: dict, destination: str) -> list[dict]:
    """
    Parse a NewBookingSearch GraphQL response into a list of service dicts.

    Each dict contains: depart_st_pancras, arrive_destination, destination,
    train_number, price (float or None), seats (int or None).  The list is
    sorted by departure time.

    For each journey only the first fare whose classOfService code is
    'STANDARD' is used; journeys without such a fare are skipped.  price is
    None when the fare carries no (or a falsy) displayPrice.

    The same St Pancras departure can appear multiple times (different
    connecting trains); we keep the entry with the earliest arrival.
    Multi-leg train numbers are joined with ' + ' (e.g. 'ES 9116 + ER 9329').
    A leg whose serviceType, or its code, is missing or null is labelled
    'ES'; legs without a serviceName are left out of the train number.

    Raises KeyError (or TypeError for a null level) when the response lacks
    the data.journeySearch.outbound.journeys path.
    """
    best: dict[str, dict] = {}
    journeys = data['data']['journeySearch']['outbound']['journeys']
    for journey in journeys:
        dep = journey['timing']['departureTime']
        arr = journey['timing']['arrivalTime']
        for fare in journey['fares']:
            if fare['classOfService']['code'] != 'STANDARD':
                continue
            p = fare.get('prices')
            # float, not int: half-pound fares such as 59.5 must survive.
            price = float(p['displayPrice']) if p and p.get('displayPrice') else None
            train_number = ' + '.join(
                # `or` instead of a .get() default: a key that is present but
                # null must fall back to 'ES' just like a missing one.
                f"{(leg.get('serviceType') or {}).get('code') or 'ES'} {leg['serviceName']}"
                for leg in fare.get('legs') or []
                if leg.get('serviceName')
            )
            # Plain string comparison of the arrival values; assumes they
            # share one sortable format (e.g. 'HH:MM') -- TODO confirm.
            if dep not in best or arr < best[dep]['arrive_destination']:
                best[dep] = {
                    'depart_st_pancras': dep,
                    'arrive_destination': arr,
                    'destination': destination,
                    'train_number': train_number,
                    'price': price,
                    'seats': fare.get('seats'),
                }
            break  # only the first STANDARD fare of a journey counts
    return sorted(best.values(), key=lambda s: s['depart_st_pancras'])
def fetch(destination: str, travel_date: str) -> list[dict]:
    """
    Return all Eurostar services for destination on travel_date.

    Sends one NewBookingSearch GraphQL POST to the gateway (1 adult, GBP,
    STANDARD class only, 20 second timeout) and hands the decoded JSON to
    _parse_graphql, so each dict carries timetable info (depart_st_pancras,
    arrive_destination, train_number) plus pricing (price, seats).

    Raises KeyError when destination is not in DESTINATION_STATION_IDS, and
    whatever raise_for_status() raises on a non-success HTTP status.
    """
    station_id = DESTINATION_STATION_IDS[destination]
    request_headers = {
        'User-Agent': DEFAULT_UA,
        'Content-Type': 'application/json',
        'Accept': '*/*',
        'Accept-Language': 'en-GB',
        'Referer': 'https://www.eurostar.com/',
        'x-platform': 'web',
        'x-market-code': 'uk',
        'x-source-url': 'search-app/',
        # A new random id is generated for every request.
        'cid': _generate_cid(),
    }
    search_variables = {
        'origin': ORIGIN_STATION_ID,
        'destination': station_id,
        'outbound': travel_date,
        'currency': 'GBP',
        'adult': 1,
        'filteredClassesOfService': ['STANDARD'],
    }
    body = {
        'operationName': 'NewBookingSearch',
        'variables': search_variables,
        'query': _GQL_QUERY,
    }
    response = requests.post(_GATEWAY_URL, json=body, headers=request_headers, timeout=20)
    response.raise_for_status()
    return _parse_graphql(response.json(), destination)