bristol-eurostar/scraper/eurostar.py

92 lines
3.2 KiB
Python

"""
Scrape Eurostar timetable via httpx.
The route-specific timetable pages are Next.js SSR — all departure data is
embedded in <script id="__NEXT_DATA__"> as JSON, so no browser / JS needed.
URL pattern:
    https://www.eurostar.com/uk-en/travel-info/timetable/
        {origin_id}/{dest_id}/{origin_slug}/{dest_slug}?date=YYYY-MM-DD

Data path: props.pageProps.pageData.liveDepartures[]
    .origin.model.scheduledDepartureDateTime      → London departure
    .destination.model.scheduledArrivalDateTime   → destination arrival
        (already filtered to the requested stop, not the final stop)
"""
import json
import re
import httpx
# Browser-like User-Agent header value used when the caller supplies none.
DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)

# Origin station: id and slug are both embedded in the timetable URL path.
ORIGIN_STATION_ID = "7015400"
ORIGIN_STATION_SLUG = "london-st-pancras-intl"

# Common prefix of every route-specific timetable page.
TIMETABLE_BASE_URL = "https://www.eurostar.com/uk-en/travel-info/timetable"

# Destination display name -> station id used in the timetable URL path.
DESTINATION_STATION_IDS = {
    "Paris Gare du Nord": "8727100",
    "Brussels Midi": "8814001",
    "Lille Europe": "8722326",
    "Amsterdam Centraal": "8400058",
    "Rotterdam Centraal": "8400530",
}
def _slugify_station_name(name: str) -> str:
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
def timetable_url(destination: str) -> str:
    """Build the timetable page URL (without query string) for *destination*.

    The path is ``<base>/<origin id>/<destination id>/<origin slug>/<destination slug>``.
    ``destination`` must be a key of ``DESTINATION_STATION_IDS``; any other
    name raises ``KeyError``.
    """
    path_segments = (
        TIMETABLE_BASE_URL,
        ORIGIN_STATION_ID,
        DESTINATION_STATION_IDS[destination],
        ORIGIN_STATION_SLUG,
        _slugify_station_name(destination),
    )
    return '/'.join(path_segments)
def _hhmm(dt_str: str | None) -> str | None:
"""'2026-03-30 09:34:00''09:34'"""
if not dt_str:
return None
m = re.search(r'(\d{2}):(\d{2}):\d{2}$', dt_str)
return f"{m.group(1)}:{m.group(2)}" if m else None
def _parse(html: str, destination: str) -> list[dict]:
    """Extract the departures embedded in a timetable page.

    The page carries its data as JSON inside ``<script id="__NEXT_DATA__">``;
    departures are read from ``props.pageProps.pageData.liveDepartures``.

    Args:
        html: Full HTML text of the timetable page.
        destination: Destination name copied verbatim into each result row.

    Returns:
        One dict per departure that has both a parseable departure and
        arrival time, with keys ``depart_st_pancras``, ``arrive_destination``
        (both ``'HH:MM'``), ``destination`` and ``train_number``
        (``'<carrier> <number>'``, or ``''`` when no number is present).
        Rows are sorted by departure time. An empty list is returned when
        the page has no ``__NEXT_DATA__`` script or the departures list is
        empty/null.

    Raises:
        json.JSONDecodeError: The embedded script is not valid JSON.
        KeyError: The JSON lacks the expected path to ``liveDepartures`` or
            a departure lacks its origin/destination timestamp keys.
    """
    m = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not m:
        return []
    data = json.loads(m.group(1))
    # JSON ``null`` decodes to None; treat a null list as "no departures"
    # instead of failing with TypeError when iterating.
    departures = data['props']['pageProps']['pageData']['liveDepartures'] or []
    services = []
    for dep in departures:
        dep_time = _hhmm(dep['origin']['model']['scheduledDepartureDateTime'])
        arr_time = _hhmm(dep['destination']['model']['scheduledArrivalDateTime'])
        if not (dep_time and arr_time):
            # Null or unparseable timestamps: skip the row.
            continue
        # ``dict.get(key, default)`` only covers a *missing* key; an explicit
        # null would otherwise raise AttributeError (model) or render as the
        # literal text "None" in the train number (carrier).
        model = dep.get('model') or {}
        carrier = model.get('carrier') or 'ES'
        number = model.get('trainNumber')
        services.append({
            'depart_st_pancras': dep_time,
            'arrive_destination': arr_time,
            'destination': destination,
            'train_number': f"{carrier} {number}" if number else '',
        })
    # Zero-padded 'HH:MM' strings sort chronologically as plain text.
    return sorted(services, key=lambda s: s['depart_st_pancras'])
def fetch(destination: str, travel_date: str,
          user_agent: str = DEFAULT_UA) -> list[dict]:
    """Download the timetable page for one destination and date, then parse it.

    Args:
        destination: A key of ``DESTINATION_STATION_IDS``.
        travel_date: Sent as the ``date`` query parameter
            (the page expects ``YYYY-MM-DD``).
        user_agent: Value for the ``User-Agent`` request header.

    Returns:
        The list of service dicts produced by ``_parse``.

    Raises:
        KeyError: ``destination`` is not a known station name.
        Whatever ``Response.raise_for_status()`` raises on an error status,
        plus any transport error raised by httpx.
    """
    page_url = timetable_url(destination)
    request_headers = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-GB,en;q=0.9',
    }
    query = {'date': travel_date}
    with httpx.Client(
        headers=request_headers,
        follow_redirects=True,
        timeout=20,
    ) as session:
        response = session.get(page_url, params=query)
        response.raise_for_status()
        return _parse(response.text, destination)