Initial commit.
commit a8e0bd39e5
16 changed files with 981 additions and 0 deletions
0  scraper/__init__.py  Normal file
90  scraper/eurostar.py  Normal file
@@ -0,0 +1,90 @@
"""
Scrape Eurostar timetable via httpx.

The route-specific timetable pages are Next.js SSR — all departure data is
embedded in <script id="__NEXT_DATA__"> as JSON, so no browser / JS needed.

URL pattern:
    https://www.eurostar.com/uk-en/travel-info/timetable/
        {origin_id}/{dest_id}/{origin_slug}/{dest_slug}?date=YYYY-MM-DD

Data path: props.pageProps.pageData.liveDepartures[]
    .origin.model.scheduledDepartureDateTime    → London departure
    .destination.model.scheduledArrivalDateTime → destination arrival
    (already filtered to the requested stop, not the final stop)
"""
import asyncio
import json
import re
import httpx

DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)

ROUTE_URLS = {
    'Paris Gare du Nord': (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8727100/london-st-pancras-intl/paris-gare-du-nord'
    ),
    'Brussels Midi': (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8814001/london-st-pancras-intl/brussels-midi'
    ),
    'Lille Europe': (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8722326/london-st-pancras-intl/lille-europe'
    ),
    'Amsterdam Centraal': (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8400058/london-st-pancras-intl/amsterdam-centraal'
    ),
}


def _hhmm(dt_str: str | None) -> str | None:
    """'2026-03-30 09:34:00' → '09:34'"""
    if not dt_str:
        return None
    m = re.search(r'(\d{2}):(\d{2}):\d{2}$', dt_str)
    return f"{m.group(1)}:{m.group(2)}" if m else None


def _parse(html: str, destination: str) -> list[dict]:
    m = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not m:
        return []
    data = json.loads(m.group(1))
    departures = data['props']['pageProps']['pageData']['liveDepartures']
    services = []
    for dep in departures:
        dep_time = _hhmm(dep['origin']['model']['scheduledDepartureDateTime'])
        arr_time = _hhmm(dep['destination']['model']['scheduledArrivalDateTime'])
        if dep_time and arr_time:
            services.append({
                'depart_st_pancras': dep_time,
                'arrive_destination': arr_time,
                'destination': destination,
            })
    return sorted(services, key=lambda s: s['depart_st_pancras'])


async def fetch(destination: str, travel_date: str,
                user_agent: str = DEFAULT_UA) -> list[dict]:
    url = ROUTE_URLS[destination]
    headers = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-GB,en;q=0.9',
    }
    async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=20) as client:
        r = await client.get(url, params={'date': travel_date})
        r.raise_for_status()
        return _parse(r.text, destination)


def get_eurostar_times(destination: str, travel_date: str,
                       user_agent: str = DEFAULT_UA) -> list[dict]:
    """Synchronous wrapper for CLI/testing."""
    return asyncio.run(fetch(destination, travel_date, user_agent))
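Usage sketch (not part of the commit): one way to call scraper/eurostar.py from a script, assuming the package is importable and using an illustrative date. The destination key must be one of the ROUTE_URLS keys above, and the printed fields are the ones built in _parse().

# Hypothetical usage example; the date below is illustrative only.
from scraper.eurostar import get_eurostar_times

services = get_eurostar_times('Paris Gare du Nord', '2026-03-30')
for svc in services:
    # Each service dict carries depart_st_pancras / arrive_destination / destination.
    print(f"{svc['depart_st_pancras']} -> {svc['arrive_destination']} ({svc['destination']})")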
96  scraper/realtime_trains.py  Normal file
@@ -0,0 +1,96 @@
"""
Scrape GWR trains from Bristol Temple Meads to London Paddington using Realtime Trains.

Uses httpx (not Playwright) with browser-like headers.

Two fetches run concurrently:
    BRI/to/PAD   → departure times from Bristol (div.time.plan.d)
    PAD/from/BRI → arrival times at Paddington (div.time.plan.a)
Matched by train ID (div.tid).
"""
import asyncio
import re
import httpx
import lxml.html

BRI_TO_PAD = (
    "https://www.realtimetrains.co.uk/search/detailed/"
    "gb-nr:BRI/to/gb-nr:PAD/{date}/0000-2359"
    "?stp=WVS&show=pax-calls&order=wtt"
)
PAD_FROM_BRI = (
    "https://www.realtimetrains.co.uk/search/detailed/"
    "gb-nr:PAD/from/gb-nr:BRI/{date}/0000-2359"
    "?stp=WVS&show=pax-calls&order=wtt"
)

DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)


def _browser_headers(user_agent: str) -> dict:
    return {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-GB,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }


def _fmt(hhmm: str) -> str:
    """Convert '0830' → '08:30'."""
    hhmm = re.sub(r'[^0-9]', '', hhmm)
    if len(hhmm) == 4:
        return f"{hhmm[:2]}:{hhmm[2:]}"
    return hhmm


def _parse_services(html: str, time_selector: str) -> dict[str, str]:
    """Return {train_id: time_string} from a servicelist page."""
    root = lxml.html.fromstring(html)
    sl = root.cssselect('div.servicelist')
    if not sl:
        return {}
    result = {}
    for svc in sl[0].cssselect('a.service'):
        tid_els = svc.cssselect('div.tid')
        time_els = svc.cssselect(time_selector)
        if tid_els and time_els:
            tid = tid_els[0].text_content().strip()
            time_text = time_els[0].text_content().strip()
            if time_text:
                result[tid] = _fmt(time_text)
    return result


async def fetch(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
    """Fetch GWR trains concurrently; returns [{'depart_bristol', 'arrive_paddington'}]."""
    headers = _browser_headers(user_agent)
    async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=30) as client:
        r_bri, r_pad = await asyncio.gather(
            client.get(BRI_TO_PAD.format(date=date)),
            client.get(PAD_FROM_BRI.format(date=date)),
        )

    departures = _parse_services(r_bri.text, 'div.time.plan.d')
    arrivals = _parse_services(r_pad.text, 'div.time.plan.a')

    trains = [
        {'depart_bristol': dep, 'arrive_paddington': arr}
        for tid, dep in departures.items()
        if (arr := arrivals.get(tid))
    ]
    return sorted(trains, key=lambda t: t['depart_bristol'])


def get_gwr_trains(date: str, user_agent: str = DEFAULT_UA) -> list[dict]:
    """Synchronous wrapper around fetch() for CLI/testing use."""
    return asyncio.run(fetch(date, user_agent))
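Usage sketch (not part of the commit): calling scraper/realtime_trains.py the same way. The format of the value slotted into the {date} segment of the Realtime Trains URLs is an assumption here (shown as YYYY-MM-DD); the returned keys are the ones built in fetch() above.

# Hypothetical usage example; the date format for the {date} URL slot is an assumption.
from scraper.realtime_trains import get_gwr_trains

for train in get_gwr_trains('2026-03-30'):
    # Each dict pairs a Bristol departure with its Paddington arrival, matched by train ID.
    print(f"{train['depart_bristol']} Bristol Temple Meads -> "
          f"{train['arrive_paddington']} London Paddington")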