diff --git a/app.py b/app.py index 0ca44c7..6425e79 100644 --- a/app.py +++ b/app.py @@ -9,7 +9,6 @@ from cache import get_cached, set_cached import scraper.eurostar as eurostar_scraper import scraper.realtime_trains as rtt_scraper from trip_planner import combine_trips, find_unreachable_morning_eurostars -from scraper.eurostar import fetch_prices as fetch_eurostar_prices RTT_PADDINGTON_URL = ( "https://www.realtimetrains.co.uk/search/detailed/" @@ -103,12 +102,10 @@ def results(slug, travel_date): rtt_cache_key = f"rtt_{travel_date}" es_cache_key = f"eurostar_{travel_date}_{destination}" - prices_cache_key = f"eurostar_prices_{travel_date}_{destination}" cached_rtt = get_cached(rtt_cache_key) - cached_es = get_cached(es_cache_key) - cached_prices = get_cached(prices_cache_key, ttl=24 * 3600) - from_cache = bool(cached_rtt and cached_es and cached_prices) + cached_es = get_cached(es_cache_key, ttl=24 * 3600) + from_cache = bool(cached_rtt and cached_es) error = None @@ -123,26 +120,21 @@ def results(slug, travel_date): error = f"Could not fetch GWR trains: {e}" if cached_es: - eurostar_trains = cached_es + eurostar_services = cached_es else: try: - eurostar_trains = eurostar_scraper.fetch(destination, travel_date, user_agent) - set_cached(es_cache_key, eurostar_trains) + eurostar_services = eurostar_scraper.fetch(destination, travel_date) + set_cached(es_cache_key, eurostar_services) except Exception as e: - eurostar_trains = [] + eurostar_services = [] msg = f"Could not fetch Eurostar times: {e}" error = f"{error}; {msg}" if error else msg - if cached_prices: - eurostar_prices = cached_prices - else: - try: - eurostar_prices = fetch_eurostar_prices(destination, travel_date) - set_cached(prices_cache_key, eurostar_prices) - except Exception as e: - eurostar_prices = {} - msg = f"Could not fetch Eurostar prices: {e}" - error = f"{error}; {msg}" if error else msg + eurostar_trains = eurostar_services + eurostar_prices = { + s['depart_st_pancras']: {'price': s.get('price'), 'seats': s.get('seats')} + for s in eurostar_services + } trips = combine_trips(gwr_trains, eurostar_trains, travel_date, min_connection, max_connection) diff --git a/scraper/eurostar.py b/scraper/eurostar.py index f12f283..ca489d9 100644 --- a/scraper/eurostar.py +++ b/scraper/eurostar.py @@ -1,29 +1,14 @@ """ -Scrape Eurostar timetable via httpx and fetch prices via the GraphQL API. +Fetch Eurostar timetable, prices, and seat availability via the GraphQL API. -Timetable: route-specific pages are Next.js SSR — all departure data is -embedded in ', html, re.DOTALL) - if not m: - return [] - data = json.loads(m.group(1)) - departures = data['props']['pageProps']['pageData']['liveDepartures'] - services = [] - for dep in departures: - dep_time = _hhmm(dep['origin']['model']['scheduledDepartureDateTime']) - arr_time = _hhmm(dep['destination']['model']['scheduledArrivalDateTime']) - if dep_time and arr_time: - carrier = dep.get('model', {}).get('carrier', 'ES') - number = dep.get('model', {}).get('trainNumber', '') - services.append({ - 'depart_st_pancras': dep_time, - 'arrive_destination': arr_time, - 'destination': destination, - 'train_number': f"{carrier} {number}" if number else '', - }) - return sorted(services, key=lambda s: s['depart_st_pancras']) - - -def fetch(destination: str, travel_date: str, - user_agent: str = DEFAULT_UA) -> list[dict]: - url = timetable_url(destination) - headers = { - 'User-Agent': user_agent, - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-GB,en;q=0.9', - } - with httpx.Client(headers=headers, follow_redirects=True, timeout=20) as client: - r = client.get(url, params={'date': travel_date}) - r.raise_for_status() - return _parse(r.text, destination) - - -# --------------------------------------------------------------------------- -# Price fetching via site-api.eurostar.com GraphQL -# --------------------------------------------------------------------------- - _GATEWAY_URL = 'https://site-api.eurostar.com/gateway' -# Minimal query requesting only timing + Eurostar Standard fare price. -# Variable names and inline argument names match what the site sends so the +# Query requesting timing, train identity, and Standard fare price + seats. +# Variable names and argument names match the site's own query so the # server-side query planner sees a familiar shape. -_GQL_PRICES = ( +_GQL_QUERY = ( "query NewBookingSearch(" "$origin:String!,$destination:String!,$outbound:String!," "$currency:Currency!,$adult:Int," @@ -141,74 +54,103 @@ _GQL_PRICES = ( " hideExternalCarrierTrains:true" " hideDirectExternalCarrierTrains:true" "){" - "timing{departureTime:departs __typename}" + "timing{departureTime:departs arrivalTime:arrives}" "fares(filteredClassesOfService:$filteredClassesOfService){" - "classOfService{code __typename}" - "prices{displayPrice __typename}" - "seats __typename" + "classOfService{code}" + "prices{displayPrice}" + "seats " + "legs{serviceName serviceType{code}}" "}" - "__typename" "}" - "__typename" "}" - "__typename" "}" "}" ) +def search_url(destination: str, travel_date: str) -> str: + dest_id = DESTINATION_STATION_IDS[destination] + return ( + f'https://www.eurostar.com/search/uk-en' + f'?adult=1&origin={ORIGIN_STATION_ID}&destination={dest_id}&outbound={travel_date}' + ) + + def _generate_cid() -> str: chars = string.ascii_letters + string.digits return 'SRCH-' + ''.join(random.choices(chars, k=22)) -def fetch_prices(destination: str, travel_date: str) -> dict[str, dict]: +def _parse_graphql(data: dict, destination: str) -> list[dict]: """ - Return Eurostar Standard price and seat availability for every departure on travel_date. + Parse a NewBookingSearch GraphQL response into a list of service dicts. - Result: {depart_st_pancras: {'price': int_or_None, 'seats': int_or_None}} - price is None when unavailable/not yet on sale; seats is the number of - Standard seats currently available for sale. + Each dict contains: depart_st_pancras, arrive_destination, destination, + train_number, price (float or None), seats (int or None). + + The same St Pancras departure can appear multiple times (different + connecting trains); we keep the entry with the earliest arrival. + Multi-leg train numbers are joined with ' + ' (e.g. 'ES 9116 + ER 9329'). + """ + best: dict[str, dict] = {} + journeys = data['data']['journeySearch']['outbound']['journeys'] + for journey in journeys: + dep = journey['timing']['departureTime'] + arr = journey['timing']['arrivalTime'] + for fare in journey['fares']: + if fare['classOfService']['code'] == 'STANDARD': + p = fare.get('prices') + price = float(p['displayPrice']) if p and p.get('displayPrice') else None + seats = fare.get('seats') + legs = fare.get('legs') or [] + train_number = ' + '.join( + f"{(leg.get('serviceType') or {}).get('code', 'ES')} {leg['serviceName']}" + for leg in legs if leg.get('serviceName') + ) + if dep not in best or arr < best[dep]['arrive_destination']: + best[dep] = { + 'depart_st_pancras': dep, + 'arrive_destination': arr, + 'destination': destination, + 'train_number': train_number, + 'price': price, + 'seats': seats, + } + break + return sorted(best.values(), key=lambda s: s['depart_st_pancras']) + + +def fetch(destination: str, travel_date: str) -> list[dict]: + """ + Return all Eurostar services for destination on travel_date. + + Each dict contains timetable info (depart_st_pancras, arrive_destination, + train_number) plus pricing (price, seats) from a single GraphQL call. """ dest_id = DESTINATION_STATION_IDS[destination] headers = { - 'User-Agent': DEFAULT_UA, - 'Content-Type': 'application/json', - 'Accept': '*/*', - 'Accept-Language': 'en-GB', - 'Referer': 'https://www.eurostar.com/', - 'x-platform': 'web', - 'x-market-code': 'uk', - 'x-source-url': 'search-app/', - 'cid': _generate_cid(), + 'User-Agent': DEFAULT_UA, + 'Content-Type': 'application/json', + 'Accept': '*/*', + 'Accept-Language':'en-GB', + 'Referer': 'https://www.eurostar.com/', + 'x-platform': 'web', + 'x-market-code': 'uk', + 'x-source-url': 'search-app/', + 'cid': _generate_cid(), } payload = { 'operationName': 'NewBookingSearch', 'variables': { - 'origin': ORIGIN_STATION_ID, - 'destination': dest_id, - 'outbound': travel_date, - 'currency': 'GBP', - 'adult': 1, - 'filteredClassesOfService': ['STANDARD'], + 'origin': ORIGIN_STATION_ID, + 'destination': dest_id, + 'outbound': travel_date, + 'currency': 'GBP', + 'adult': 1, + 'filteredClassesOfService': ['STANDARD'], }, - 'query': _GQL_PRICES, + 'query': _GQL_QUERY, } resp = requests.post(_GATEWAY_URL, json=payload, headers=headers, timeout=20) resp.raise_for_status() - data = resp.json() - prices: dict[str, dict] = {} - journeys = data['data']['journeySearch']['outbound']['journeys'] - for journey in journeys: - dep = journey['timing']['departureTime'] - price = None - seats = None - for fare in journey['fares']: - if fare['classOfService']['code'] == 'STANDARD': - p = fare.get('prices') - if p and p.get('displayPrice'): - price = int(p['displayPrice']) - seats = fare.get('seats') - break - prices[dep] = {'price': price, 'seats': seats} - return prices + return _parse_graphql(resp.json(), destination) diff --git a/templates/results.html b/templates/results.html index 634ede8..e4578ec 100644 --- a/templates/results.html +++ b/templates/results.html @@ -159,7 +159,7 @@ {{ row.depart_st_pancras }} - {% if row.train_number %}
{{ row.train_number }}{% endif %} + {% if row.train_number %}
{% for part in row.train_number.split(' + ') %}{{ part }}{% if not loop.last %} + {% endif %}{% endfor %}{% endif %} {{ row.arrive_destination }} @@ -168,7 +168,7 @@ {% if row.eurostar_price is not none %} - £{{ row.eurostar_price }} + £{{ "%.2f"|format(row.eurostar_price) }} {% if row.eurostar_seats is not none %}
{{ row.eurostar_seats }} at this price {% endif %} @@ -195,7 +195,7 @@ n/a {{ row.depart_st_pancras }} - {% if row.train_number %}
{{ row.train_number }}{% endif %} + {% if row.train_number %}
{% for part in row.train_number.split(' + ') %}{{ part }}{% if not loop.last %} + {% endif %}{% endfor %}{% endif %} {{ row.arrive_destination }} @@ -204,7 +204,7 @@ {% if row.eurostar_price is not none %} - £{{ row.eurostar_price }} + £{{ "%.2f"|format(row.eurostar_price) }} {% if row.eurostar_seats is not none %}
{{ row.eurostar_seats }} at this price {% endif %} diff --git a/tests/test_app.py b/tests/test_app.py index 8806137..fd06a8d 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -16,25 +16,21 @@ def _stub_data(monkeypatch, prices=None): {'depart_bristol': '07:00', 'arrive_paddington': '08:45', 'headcode': '1A23'}, ], ) + p = (prices or {}).get('10:01', {}) monkeypatch.setattr( app_module.eurostar_scraper, 'fetch', - lambda destination, travel_date, user_agent: [ + lambda destination, travel_date: [ { 'depart_st_pancras': '10:01', 'arrive_destination': '13:34', 'destination': destination, 'train_number': 'ES 9014', + 'price': p.get('price') if isinstance(p, dict) else None, + 'seats': p.get('seats') if isinstance(p, dict) else None, }, ], ) - monkeypatch.setattr( - app_module.eurostar_scraper, - 'timetable_url', - lambda destination: f'https://example.test/{destination.lower().replace(" ", "-")}', - ) - _prices = prices if prices is not None else {} - monkeypatch.setattr(app_module, 'fetch_eurostar_prices', lambda dest, date: _prices) def test_index_shows_fixed_departure_and_destination_radios(): @@ -96,7 +92,6 @@ def test_results_title_and_social_meta_include_destination(monkeypatch): def test_results_marks_trips_within_five_minutes_of_fastest_and_slowest(monkeypatch): monkeypatch.setattr(app_module, 'get_cached', lambda key, ttl=None: None) monkeypatch.setattr(app_module, 'set_cached', lambda key, data: None) - monkeypatch.setattr(app_module, 'fetch_eurostar_prices', lambda dest, date: {}) monkeypatch.setattr( app_module.rtt_scraper, 'fetch', @@ -111,44 +106,14 @@ def test_results_marks_trips_within_five_minutes_of_fastest_and_slowest(monkeypa monkeypatch.setattr( app_module.eurostar_scraper, 'fetch', - lambda destination, travel_date, user_agent: [ - { - 'depart_st_pancras': '09:30', - 'arrive_destination': '11:50', - 'destination': destination, - 'train_number': 'ES 1001', - }, - { - 'depart_st_pancras': '09:40', - 'arrive_destination': '12:00', - 'destination': destination, - 'train_number': 'ES 1002', - }, - { - 'depart_st_pancras': '09:50', - 'arrive_destination': '12:20', - 'destination': destination, - 'train_number': 'ES 1003', - }, - { - 'depart_st_pancras': '10:00', - 'arrive_destination': '12:35', - 'destination': destination, - 'train_number': 'ES 1004', - }, - { - 'depart_st_pancras': '10:10', - 'arrive_destination': '12:45', - 'destination': destination, - 'train_number': 'ES 1005', - }, + lambda destination, travel_date: [ + {'depart_st_pancras': '09:30', 'arrive_destination': '11:50', 'destination': destination, 'train_number': 'ES 1001', 'price': None, 'seats': None}, + {'depart_st_pancras': '09:40', 'arrive_destination': '12:00', 'destination': destination, 'train_number': 'ES 1002', 'price': None, 'seats': None}, + {'depart_st_pancras': '09:50', 'arrive_destination': '12:20', 'destination': destination, 'train_number': 'ES 1003', 'price': None, 'seats': None}, + {'depart_st_pancras': '10:00', 'arrive_destination': '12:35', 'destination': destination, 'train_number': 'ES 1004', 'price': None, 'seats': None}, + {'depart_st_pancras': '10:10', 'arrive_destination': '12:45', 'destination': destination, 'train_number': 'ES 1005', 'price': None, 'seats': None}, ], ) - monkeypatch.setattr( - app_module.eurostar_scraper, - 'timetable_url', - lambda destination: f'https://example.test/{destination.lower().replace(" ", "-")}', - ) client = _client() resp = client.get('/results/paris/2026-04-10?min_connection=60&max_connection=120') @@ -168,7 +133,6 @@ def test_results_marks_trips_within_five_minutes_of_fastest_and_slowest(monkeypa def test_results_shows_unreachable_morning_eurostar_services(monkeypatch): monkeypatch.setattr(app_module, 'get_cached', lambda key, ttl=None: None) monkeypatch.setattr(app_module, 'set_cached', lambda key, data: None) - monkeypatch.setattr(app_module, 'fetch_eurostar_prices', lambda dest, date: {}) monkeypatch.setattr( app_module.rtt_scraper, 'fetch', @@ -179,32 +143,12 @@ def test_results_shows_unreachable_morning_eurostar_services(monkeypatch): monkeypatch.setattr( app_module.eurostar_scraper, 'fetch', - lambda destination, travel_date, user_agent: [ - { - 'depart_st_pancras': '09:30', - 'arrive_destination': '12:00', - 'destination': destination, - 'train_number': 'ES 9001', - }, - { - 'depart_st_pancras': '10:15', - 'arrive_destination': '13:40', - 'destination': destination, - 'train_number': 'ES 9002', - }, - { - 'depart_st_pancras': '12:30', - 'arrive_destination': '15:55', - 'destination': destination, - 'train_number': 'ES 9003', - }, + lambda destination, travel_date: [ + {'depart_st_pancras': '09:30', 'arrive_destination': '12:00', 'destination': destination, 'train_number': 'ES 9001', 'price': None, 'seats': None}, + {'depart_st_pancras': '10:15', 'arrive_destination': '13:40', 'destination': destination, 'train_number': 'ES 9002', 'price': None, 'seats': None}, + {'depart_st_pancras': '12:30', 'arrive_destination': '15:55', 'destination': destination, 'train_number': 'ES 9003', 'price': None, 'seats': None}, ], ) - monkeypatch.setattr( - app_module.eurostar_scraper, - 'timetable_url', - lambda destination: f'https://example.test/{destination.lower().replace(" ", "-")}', - ) client = _client() resp = client.get('/results/paris/2026-04-10?min_connection=60&max_connection=120') @@ -234,7 +178,6 @@ def test_results_shows_eurostar_price_and_total(monkeypatch): def test_results_can_show_only_unreachable_morning_services(monkeypatch): monkeypatch.setattr(app_module, 'get_cached', lambda key, ttl=None: None) monkeypatch.setattr(app_module, 'set_cached', lambda key, data: None) - monkeypatch.setattr(app_module, 'fetch_eurostar_prices', lambda dest, date: {}) monkeypatch.setattr( app_module.rtt_scraper, 'fetch', @@ -245,20 +188,10 @@ def test_results_can_show_only_unreachable_morning_services(monkeypatch): monkeypatch.setattr( app_module.eurostar_scraper, 'fetch', - lambda destination, travel_date, user_agent: [ - { - 'depart_st_pancras': '09:30', - 'arrive_destination': '12:00', - 'destination': destination, - 'train_number': 'ES 9001', - }, + lambda destination, travel_date: [ + {'depart_st_pancras': '09:30', 'arrive_destination': '12:00', 'destination': destination, 'train_number': 'ES 9001', 'price': None, 'seats': None}, ], ) - monkeypatch.setattr( - app_module.eurostar_scraper, - 'timetable_url', - lambda destination: f'https://example.test/{destination.lower().replace(" ", "-")}', - ) client = _client() resp = client.get('/results/paris/2026-04-10?min_connection=60&max_connection=120') diff --git a/tests/test_eurostar_scraper.py b/tests/test_eurostar_scraper.py index 1015b8a..ed82358 100644 --- a/tests/test_eurostar_scraper.py +++ b/tests/test_eurostar_scraper.py @@ -1,97 +1,92 @@ -import json import pytest -from scraper.eurostar import _hhmm, _parse, timetable_url +from scraper.eurostar import _parse_graphql, search_url -# --------------------------------------------------------------------------- -# _hhmm -# --------------------------------------------------------------------------- - -def test_hhmm_parses_datetime_string(): - assert _hhmm('2026-03-30 09:34:00') == '09:34' - -def test_hhmm_none_input(): - assert _hhmm(None) is None - -def test_hhmm_empty_string(): - assert _hhmm('') is None +def _gql_response(journeys: list) -> dict: + return {'data': {'journeySearch': {'outbound': {'journeys': journeys}}}} -# --------------------------------------------------------------------------- -# _parse -# --------------------------------------------------------------------------- - -def _make_next_data(departures: list) -> str: - data = { - 'props': { - 'pageProps': { - 'pageData': { - 'liveDepartures': departures - } - } - } - } - return f'' - - -def _departure(dep_dt: str, arr_dt: str) -> dict: +def _journey(departs: str, arrives: str, price=None, seats=None, service_name='', carrier='ES') -> dict: return { - 'origin': {'model': {'scheduledDepartureDateTime': dep_dt}}, - 'destination': {'model': {'scheduledArrivalDateTime': arr_dt}}, + 'timing': {'departureTime': departs, 'arrivalTime': arrives}, + 'fares': [{ + 'classOfService': {'code': 'STANDARD'}, + 'prices': {'displayPrice': price}, + 'seats': seats, + 'legs': [{'serviceName': service_name, 'serviceType': {'code': carrier}}] + if service_name else [], + }], } -def test_parse_single_departure(): - html = _make_next_data([_departure('2026-03-30 06:01:00', '2026-03-30 09:34:00')]) - services = _parse(html, 'Paris Gare du Nord') +# --------------------------------------------------------------------------- +# _parse_graphql +# --------------------------------------------------------------------------- + +def test_parse_graphql_single_journey(): + data = _gql_response([_journey('09:31', '12:55', price=156, seats=37, service_name='9014')]) + services = _parse_graphql(data, 'Paris Gare du Nord') assert len(services) == 1 - assert services[0] == { - 'depart_st_pancras': '06:01', - 'arrive_destination': '09:34', - 'destination': 'Paris Gare du Nord', - 'train_number': '', - } + s = services[0] + assert s['depart_st_pancras'] == '09:31' + assert s['arrive_destination'] == '12:55' + assert s['destination'] == 'Paris Gare du Nord' + assert s['train_number'] == 'ES 9014' + assert s['price'] == 156.0 + assert s['seats'] == 37 -def test_parse_results_sorted_by_departure(): - html = _make_next_data([ - _departure('2026-03-30 10:00:00', '2026-03-30 13:00:00'), - _departure('2026-03-30 07:00:00', '2026-03-30 10:00:00'), +def test_parse_graphql_half_pound_price(): + data = _gql_response([_journey('09:01', '14:20', price=192.5, seats=25, service_name='9116')]) + services = _parse_graphql(data, 'Amsterdam Centraal') + assert services[0]['price'] == 192.5 + + +def test_parse_graphql_null_price(): + data = _gql_response([_journey('06:16', '11:09', price=None, seats=0)]) + services = _parse_graphql(data, 'Amsterdam Centraal') + assert services[0]['price'] is None + assert services[0]['seats'] == 0 + + +def test_parse_graphql_sorted_by_departure(): + data = _gql_response([ + _journey('10:31', '13:55'), + _journey('07:31', '10:59'), ]) - services = _parse(html, 'Paris Gare du Nord') - assert services[0]['depart_st_pancras'] == '07:00' - assert services[1]['depart_st_pancras'] == '10:00' + services = _parse_graphql(data, 'Paris Gare du Nord') + assert services[0]['depart_st_pancras'] == '07:31' + assert services[1]['depart_st_pancras'] == '10:31' -def test_parse_skips_entries_with_missing_times(): - html = _make_next_data([ - _departure(None, '2026-03-30 09:34:00'), - _departure('2026-03-30 08:00:00', None), - _departure('2026-03-30 09:00:00', '2026-03-30 12:00:00'), +def test_parse_graphql_deduplicates_same_departure_time(): + data = _gql_response([ + _journey('06:16', '11:09', price=None, seats=0), + _journey('06:16', '11:09', price=None, seats=0), + _journey('06:16', '11:09', price=None, seats=0), ]) - services = _parse(html, 'Paris Gare du Nord') + services = _parse_graphql(data, 'Amsterdam Centraal') assert len(services) == 1 - assert services[0]['depart_st_pancras'] == '09:00' -def test_parse_no_next_data_returns_empty(): - assert _parse('nothing here', 'Paris Gare du Nord') == [] +def test_parse_graphql_no_legs_gives_empty_train_number(): + data = _gql_response([_journey('09:31', '12:55', price=156, seats=37, service_name='')]) + services = _parse_graphql(data, 'Paris Gare du Nord') + assert services[0]['train_number'] == '' -def test_parse_empty_departures(): - html = _make_next_data([]) - assert _parse(html, 'Paris Gare du Nord') == [] +def test_parse_graphql_empty_journeys(): + data = _gql_response([]) + assert _parse_graphql(data, 'Paris Gare du Nord') == [] -def test_timetable_url_uses_station_id_table(): - assert timetable_url('Paris Gare du Nord') == ( - 'https://www.eurostar.com/uk-en/travel-info/timetable/' - '7015400/8727100/london-st-pancras-intl/paris-gare-du-nord' - ) - - -def test_timetable_url_slugifies_destination_name(): - assert timetable_url('Rotterdam Centraal') == ( - 'https://www.eurostar.com/uk-en/travel-info/timetable/' - '7015400/8400530/london-st-pancras-intl/rotterdam-centraal' +# --------------------------------------------------------------------------- +# search_url +# --------------------------------------------------------------------------- + +def test_search_url(): + url = search_url('Paris Gare du Nord', '2026-04-10') + assert url == ( + 'https://www.eurostar.com/search/uk-en' + '?adult=1&origin=7015400&destination=8727100&outbound=2026-04-10' )