Fetches prices via the site-api.eurostar.com GraphQL gateway (NewBookingSearch operation, discovered with Playwright). Adds fetch_prices() to scraper/eurostar.py using requests, caches results, annotates each trip with eurostar_price and total_price, and shows an ES Std column plus total cost (duration + price) in the results table. The Transfer column is hidden on small screens for mobile usability. Closes #4 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
203 lines
7 KiB
Python
203 lines
7 KiB
Python
"""
Scrape Eurostar timetable via httpx and fetch prices via the GraphQL API.

Timetable: route-specific pages are Next.js SSR — all departure data is
embedded in <script id="__NEXT_DATA__"> as JSON, so no browser / JS needed.

URL pattern:
  https://www.eurostar.com/uk-en/travel-info/timetable/
    {origin_id}/{dest_id}/{origin_slug}/{dest_slug}?date=YYYY-MM-DD

Data path: props.pageProps.pageData.liveDepartures[]
  .origin.model.scheduledDepartureDateTime → London departure
  .destination.model.scheduledArrivalDateTime → destination arrival
    (already filtered to the requested stop, not the final stop)

Prices: POST https://site-api.eurostar.com/gateway (GraphQL, operationName
NewBookingSearch). The `journeys[].fares[]` array contains one entry per
class of service; we extract the Eurostar Standard (classOfService.code ==
"STANDARD") displayPrice for 1 adult, in GBP.
"""
|
|
import json
|
|
import random
|
|
import re
|
|
import string
|
|
|
|
import httpx
|
|
import requests
|
|
|
|
# Browser-like User-Agent used by default for the HTTP requests in this module.
DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)

# Origin station (id + URL slug) used for every timetable URL and price query.
ORIGIN_STATION_ID = '7015400'
ORIGIN_STATION_SLUG = 'london-st-pancras-intl'

# Root of the timetable pages; route-specific path segments are appended to it.
TIMETABLE_BASE_URL = 'https://www.eurostar.com/uk-en/travel-info/timetable'

# Supported destinations: display name → station id used in timetable URLs
# and as the `destination` variable of the price query.
DESTINATION_STATION_IDS = {
    'Paris Gare du Nord': '8727100',
    'Brussels Midi': '8814001',
    'Lille Europe': '8722326',
    'Amsterdam Centraal': '8400058',
    'Rotterdam Centraal': '8400530',
}
|
|
|
|
|
|
def _slugify_station_name(name: str) -> str:
|
|
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
|
|
|
|
|
def timetable_url(destination: str) -> str:
    """Build the timetable page URL for the route to *destination*.

    ``destination`` must be a key of ``DESTINATION_STATION_IDS``; any other
    name raises ``KeyError``.  The result carries no query string — the
    travel date is added by the caller as a ``date`` parameter.
    """
    station_id = DESTINATION_STATION_IDS[destination]
    slug = _slugify_station_name(destination)
    # Path layout: <base>/<origin id>/<dest id>/<origin slug>/<dest slug>
    id_part = f'{TIMETABLE_BASE_URL}/{ORIGIN_STATION_ID}/{station_id}/'
    slug_part = f'{ORIGIN_STATION_SLUG}/{slug}'
    return id_part + slug_part
|
|
|
|
|
|
def _hhmm(dt_str: str | None) -> str | None:
|
|
"""'2026-03-30 09:34:00' → '09:34'"""
|
|
if not dt_str:
|
|
return None
|
|
m = re.search(r'(\d{2}):(\d{2}):\d{2}$', dt_str)
|
|
return f"{m.group(1)}:{m.group(2)}" if m else None
|
|
|
|
|
|
def _parse(html: str, destination: str) -> list[dict]:
|
|
m = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
|
|
if not m:
|
|
return []
|
|
data = json.loads(m.group(1))
|
|
departures = data['props']['pageProps']['pageData']['liveDepartures']
|
|
services = []
|
|
for dep in departures:
|
|
dep_time = _hhmm(dep['origin']['model']['scheduledDepartureDateTime'])
|
|
arr_time = _hhmm(dep['destination']['model']['scheduledArrivalDateTime'])
|
|
if dep_time and arr_time:
|
|
carrier = dep.get('model', {}).get('carrier', 'ES')
|
|
number = dep.get('model', {}).get('trainNumber', '')
|
|
services.append({
|
|
'depart_st_pancras': dep_time,
|
|
'arrive_destination': arr_time,
|
|
'destination': destination,
|
|
'train_number': f"{carrier} {number}" if number else '',
|
|
})
|
|
return sorted(services, key=lambda s: s['depart_st_pancras'])
|
|
|
|
|
|
def fetch(destination: str, travel_date: str,
          user_agent: str = DEFAULT_UA) -> list[dict]:
    """Download the timetable page for one route and date, then parse it.

    Args:
        destination: A key of ``DESTINATION_STATION_IDS``.
        travel_date: Sent as the ``date`` query parameter
            (``YYYY-MM-DD`` per the module docstring).
        user_agent: Value of the ``User-Agent`` request header.

    Returns:
        The list of service dicts produced by ``_parse``.

    Raises:
        KeyError: ``destination`` is not a known station name.
        httpx.HTTPStatusError: the server answered with an error status.
    """
    page_url = timetable_url(destination)
    request_headers = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-GB,en;q=0.9',
    }
    query = {'date': travel_date}
    with httpx.Client(headers=request_headers, follow_redirects=True, timeout=20) as session:
        response = session.get(page_url, params=query)
        response.raise_for_status()
        return _parse(response.text, destination)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Price fetching via site-api.eurostar.com GraphQL
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_GATEWAY_URL = 'https://site-api.eurostar.com/gateway'
|
|
|
|
# Minimal query requesting only timing + Eurostar Standard fare price.
|
|
# Variable names and inline argument names match what the site sends so the
|
|
# server-side query planner sees a familiar shape.
|
|
_GQL_PRICES = (
|
|
"query NewBookingSearch("
|
|
"$origin:String!,$destination:String!,$outbound:String!,"
|
|
"$currency:Currency!,$adult:Int,"
|
|
"$filteredClassesOfService:[ClassOfServiceEnum]"
|
|
"){"
|
|
"journeySearch("
|
|
"outboundDate:$outbound origin:$origin destination:$destination"
|
|
" adults:$adult currency:$currency"
|
|
" productFamilies:[\"PUB\"] contractCode:\"EIL_ALL\""
|
|
" adults16Plus:0 children:0 youths:0 children4Only:0 children5To11:0"
|
|
" infants:0 adultsWheelchair:0 childrenWheelchair:0 guideDogs:0"
|
|
" wheelchairCompanions:0 nonWheelchairCompanions:0"
|
|
" isAftersales:false multipleFlexibility:true showAllSummatedFares:false"
|
|
" seniorsAges:[] prioritiseShortHaulODTrains:true"
|
|
"){"
|
|
"outbound{"
|
|
"journeys("
|
|
"hideIndirectTrainsWhenDisruptedAndCancelled:false"
|
|
" hideDepartedTrains:true"
|
|
" hideExternalCarrierTrains:true"
|
|
" hideDirectExternalCarrierTrains:true"
|
|
"){"
|
|
"timing{departureTime:departs __typename}"
|
|
"fares(filteredClassesOfService:$filteredClassesOfService){"
|
|
"classOfService{code __typename}"
|
|
"prices{displayPrice __typename}"
|
|
"seats __typename"
|
|
"}"
|
|
"__typename"
|
|
"}"
|
|
"__typename"
|
|
"}"
|
|
"__typename"
|
|
"}"
|
|
"}"
|
|
)
|
|
|
|
|
|
def _generate_cid() -> str:
|
|
chars = string.ascii_letters + string.digits
|
|
return 'SRCH-' + ''.join(random.choices(chars, k=22))
|
|
|
|
|
|
def _standard_price(journey: dict) -> int | None:
    """Return the Eurostar Standard ``displayPrice`` of one journey, or None."""
    # GraphQL fields may come back null; treat a null `fares` list or a null
    # `classOfService` like a missing one instead of raising TypeError.
    for fare in journey.get('fares') or []:
        service_class = fare.get('classOfService') or {}
        if service_class.get('code') != 'STANDARD':
            continue
        # The first STANDARD fare decides the result, priced or not.
        fare_prices = fare.get('prices')
        if fare_prices and fare_prices.get('displayPrice'):
            return int(fare_prices['displayPrice'])
        return None
    return None


def fetch_prices(destination: str, travel_date: str) -> dict[str, int | None]:
    """
    Return Eurostar Standard prices for every departure on travel_date.

    Sends the NewBookingSearch GraphQL query for 1 adult, in GBP, from the
    origin station to ``destination`` (a key of ``DESTINATION_STATION_IDS``).

    Result: {depart_st_pancras: price_gbp_int_or_None}
    The key is the journey's ``timing.departureTime`` exactly as the API
    returns it (presumably 'HH:MM' — confirm against the timetable keys).
    None means the class is sold out or unavailable for that departure.

    Raises:
        KeyError: unknown ``destination``, or the response lacks the
            ``data.journeySearch.outbound.journeys`` path.
        requests.HTTPError: the gateway answered with an error status.
    """
    dest_id = DESTINATION_STATION_IDS[destination]
    headers = {
        'User-Agent': DEFAULT_UA,
        'Content-Type': 'application/json',
        'Accept': '*/*',
        'Accept-Language': 'en-GB',
        'Referer': 'https://www.eurostar.com/',
        'x-platform': 'web',
        'x-market-code': 'uk',
        'x-source-url': 'search-app/',
        'cid': _generate_cid(),
    }
    payload = {
        'operationName': 'NewBookingSearch',
        'variables': {
            'origin': ORIGIN_STATION_ID,
            'destination': dest_id,
            'outbound': travel_date,
            'currency': 'GBP',
            'adult': 1,
            'filteredClassesOfService': ['STANDARD'],
        },
        'query': _GQL_PRICES,
    }
    resp = requests.post(_GATEWAY_URL, json=payload, headers=headers, timeout=20)
    resp.raise_for_status()
    data = resp.json()
    journeys = data['data']['journeySearch']['outbound']['journeys']
    # A null journey list is handled as "no departures" rather than a crash.
    return {
        journey['timing']['departureTime']: _standard_price(journey)
        for journey in journeys or []
    }
|