Initial commit.

This commit is contained in:
Edward Betts 2026-03-30 19:34:46 +01:00
commit a8e0bd39e5
16 changed files with 981 additions and 0 deletions

90
scraper/eurostar.py Normal file
View file

@ -0,0 +1,90 @@
"""
Scrape Eurostar timetable via httpx.
The route-specific timetable pages are Next.js SSR: all departure data is
embedded in <script id="__NEXT_DATA__"> as JSON, so no browser / JS is needed.
URL pattern:
https://www.eurostar.com/uk-en/travel-info/timetable/
{origin_id}/{dest_id}/{origin_slug}/{dest_slug}?date=YYYY-MM-DD
Data path: props.pageProps.pageData.liveDepartures[]
    .origin.model.scheduledDepartureDateTime      -> London departure
    .destination.model.scheduledArrivalDateTime   -> destination arrival
    (already filtered to the requested stop, not the final stop)
"""
import asyncio
import json
import re
import httpx
# Default User-Agent header value sent by fetch(); callers may override it.
# Assembled from its individual product tokens, separated by single spaces.
DEFAULT_UA = " ".join((
    "Mozilla/5.0 (X11; Linux x86_64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/122.0.0.0",
    "Safari/537.36",
))
# Timetable page URL (without the ?date= query) for each supported
# destination, keyed by the destination's display name.
# Layout: <root>{origin_id}/{dest_id}/{origin_slug}/{dest_slug}
_TIMETABLE_ROOT = 'https://www.eurostar.com/uk-en/travel-info/timetable/'
_ORIGIN_ID = '7015400'
_ORIGIN_SLUG = 'london-st-pancras-intl'

ROUTE_URLS = {
    name: f'{_TIMETABLE_ROOT}{_ORIGIN_ID}/{dest_id}/{_ORIGIN_SLUG}/{dest_slug}'
    for name, dest_id, dest_slug in (
        ('Paris Gare du Nord', '8727100', 'paris-gare-du-nord'),
        ('Brussels Midi', '8814001', 'brussels-midi'),
        ('Lille Europe', '8722326', 'lille-europe'),
        ('Amsterdam Centraal', '8400058', 'amsterdam-centraal'),
    )
}
def _hhmm(dt_str: str | None) -> str | None:
"""'2026-03-30 09:34:00''09:34'"""
if not dt_str:
return None
m = re.search(r'(\d{2}):(\d{2}):\d{2}$', dt_str)
return f"{m.group(1)}:{m.group(2)}" if m else None
def _dig(node, *keys):
    """Follow keys through nested dicts; return None if any step is missing."""
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def _parse(html: str, destination: str) -> list[dict]:
    """Extract departures from the __NEXT_DATA__ JSON embedded in a page.

    Args:
        html: Full HTML text of a timetable page.
        destination: Label copied into every returned service.

    Returns:
        A list of dicts with keys 'depart_st_pancras', 'arrive_destination'
        (both 'HH:MM') and 'destination', sorted by departure time.
        The list is empty when the page has no __NEXT_DATA__ script or no
        liveDepartures list.

    Raises:
        json.JSONDecodeError: if the embedded script is not valid JSON.
    """
    m = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not m:
        return []
    data = json.loads(m.group(1))

    # A page without departures (missing or null pageData / liveDepartures)
    # is treated like a page without the script: no services, not a crash.
    departures = _dig(data, 'props', 'pageProps', 'pageData', 'liveDepartures')
    if not isinstance(departures, list):
        return []

    services = []
    for dep in departures:
        raw_dep = _dig(dep, 'origin', 'model', 'scheduledDepartureDateTime')
        raw_arr = _dig(dep, 'destination', 'model', 'scheduledArrivalDateTime')
        # One malformed entry must not abort the whole timetable; entries
        # with null times were already skipped, so skip missing ones too.
        if not isinstance(raw_dep, str) or not isinstance(raw_arr, str):
            continue
        dep_time = _hhmm(raw_dep)
        arr_time = _hhmm(raw_arr)
        if dep_time and arr_time:
            services.append({
                'depart_st_pancras': dep_time,
                'arrive_destination': arr_time,
                'destination': destination,
            })
    # Zero-padded 'HH:MM' strings sort chronologically as plain text.
    return sorted(services, key=lambda s: s['depart_st_pancras'])
async def fetch(destination: str, travel_date: str,
                user_agent: str = DEFAULT_UA) -> list[dict]:
    """Download the timetable page for one route and date, then parse it.

    Args:
        destination: A key of ROUTE_URLS.
        travel_date: Value sent as the ``date`` query parameter
            (the URL pattern expects YYYY-MM-DD).
        user_agent: Value for the User-Agent request header.

    Returns:
        The list of services produced by _parse() for the response body.

    Raises:
        KeyError: if destination is not a key of ROUTE_URLS.
        httpx.HTTPStatusError: if the response has an error status.
    """
    timetable_url = ROUTE_URLS[destination]
    request_headers = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-GB,en;q=0.9',
    }
    query = {'date': travel_date}
    client = httpx.AsyncClient(
        headers=request_headers,
        follow_redirects=True,
        timeout=20,
    )
    async with client:
        response = await client.get(timetable_url, params=query)
        response.raise_for_status()
    # The body was fully read by get(), so it is still available here.
    return _parse(response.text, destination)
def get_eurostar_times(destination: str, travel_date: str,
                       user_agent: str = DEFAULT_UA) -> list[dict]:
    """Blocking wrapper around fetch() for CLI use and tests.

    Takes the same arguments as fetch() and returns its result, running
    the coroutine to completion with asyncio.run().
    """
    pending = fetch(destination, travel_date, user_agent)
    return asyncio.run(pending)