Consolidate to single GraphQL call; show indirect trains; fix price formatting

Replace two-step Eurostar fetch (HTML timetable + GraphQL prices) with a
single GraphQL call that returns timing, train numbers, prices, and seats.
Support indirect services (e.g. Amsterdam) by joining multi-leg train numbers
with ' + ' and keeping the earliest arrival per departure time.
Fix half-pound prices by casting displayPrice to float instead of int.
Wrap each train number segment in white-space:nowrap so 'ES 9132 + ER 9363'
never breaks mid-segment.
Format Eurostar prices with two decimal places.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-04-04 14:46:22 +01:00
parent 05eec29b7d
commit c22a3ea0fc
5 changed files with 182 additions and 320 deletions

View file

@ -1,97 +1,92 @@
import json
import pytest
from scraper.eurostar import _hhmm, _parse, timetable_url
from scraper.eurostar import _parse_graphql, search_url
# ---------------------------------------------------------------------------
# _hhmm
# ---------------------------------------------------------------------------
def test_hhmm_parses_datetime_string():
    """_hhmm reduces a full 'YYYY-MM-DD HH:MM:SS' string to 'HH:MM'."""
    stamp = '2026-03-30 09:34:00'
    assert _hhmm(stamp) == '09:34'
def test_hhmm_none_input():
    """_hhmm passes a None input through as None."""
    result = _hhmm(None)
    assert result is None
def test_hhmm_empty_string():
    """_hhmm maps an empty string to None."""
    result = _hhmm('')
    assert result is None
def _gql_response(journeys: list) -> dict:
return {'data': {'journeySearch': {'outbound': {'journeys': journeys}}}}
# ---------------------------------------------------------------------------
# _parse
# ---------------------------------------------------------------------------
def _make_next_data(departures: list) -> str:
data = {
'props': {
'pageProps': {
'pageData': {
'liveDepartures': departures
}
}
}
}
return f'<script id="__NEXT_DATA__" type="application/json">{json.dumps(data)}</script>'
def _departure(dep_dt: str, arr_dt: str) -> dict:
def _journey(departs: str, arrives: str, price=None, seats=None, service_name='', carrier='ES') -> dict:
return {
'origin': {'model': {'scheduledDepartureDateTime': dep_dt}},
'destination': {'model': {'scheduledArrivalDateTime': arr_dt}},
'timing': {'departureTime': departs, 'arrivalTime': arrives},
'fares': [{
'classOfService': {'code': 'STANDARD'},
'prices': {'displayPrice': price},
'seats': seats,
'legs': [{'serviceName': service_name, 'serviceType': {'code': carrier}}]
if service_name else [],
}],
}
def test_parse_single_departure():
html = _make_next_data([_departure('2026-03-30 06:01:00', '2026-03-30 09:34:00')])
services = _parse(html, 'Paris Gare du Nord')
# ---------------------------------------------------------------------------
# _parse_graphql
# ---------------------------------------------------------------------------
def test_parse_graphql_single_journey():
data = _gql_response([_journey('09:31', '12:55', price=156, seats=37, service_name='9014')])
services = _parse_graphql(data, 'Paris Gare du Nord')
assert len(services) == 1
assert services[0] == {
'depart_st_pancras': '06:01',
'arrive_destination': '09:34',
'destination': 'Paris Gare du Nord',
'train_number': '',
}
s = services[0]
assert s['depart_st_pancras'] == '09:31'
assert s['arrive_destination'] == '12:55'
assert s['destination'] == 'Paris Gare du Nord'
assert s['train_number'] == 'ES 9014'
assert s['price'] == 156.0
assert s['seats'] == 37
def test_parse_results_sorted_by_departure():
html = _make_next_data([
_departure('2026-03-30 10:00:00', '2026-03-30 13:00:00'),
_departure('2026-03-30 07:00:00', '2026-03-30 10:00:00'),
def test_parse_graphql_half_pound_price():
    """A fractional displayPrice (192.5) comes back from the parser unchanged."""
    journey = _journey('09:01', '14:20', price=192.5, seats=25, service_name='9116')
    payload = _gql_response([journey])
    first = _parse_graphql(payload, 'Amsterdam Centraal')[0]
    assert first['price'] == 192.5
def test_parse_graphql_null_price():
    """A journey with no price keeps price as None and reports zero seats."""
    journey = _journey('06:16', '11:09', price=None, seats=0)
    payload = _gql_response([journey])
    services = _parse_graphql(payload, 'Amsterdam Centraal')
    first = services[0]
    assert first['price'] is None
    assert first['seats'] == 0
def test_parse_graphql_sorted_by_departure():
data = _gql_response([
_journey('10:31', '13:55'),
_journey('07:31', '10:59'),
])
services = _parse(html, 'Paris Gare du Nord')
assert services[0]['depart_st_pancras'] == '07:00'
assert services[1]['depart_st_pancras'] == '10:00'
services = _parse_graphql(data, 'Paris Gare du Nord')
assert services[0]['depart_st_pancras'] == '07:31'
assert services[1]['depart_st_pancras'] == '10:31'
def test_parse_skips_entries_with_missing_times():
html = _make_next_data([
_departure(None, '2026-03-30 09:34:00'),
_departure('2026-03-30 08:00:00', None),
_departure('2026-03-30 09:00:00', '2026-03-30 12:00:00'),
def test_parse_graphql_deduplicates_same_departure_time():
data = _gql_response([
_journey('06:16', '11:09', price=None, seats=0),
_journey('06:16', '11:09', price=None, seats=0),
_journey('06:16', '11:09', price=None, seats=0),
])
services = _parse(html, 'Paris Gare du Nord')
services = _parse_graphql(data, 'Amsterdam Centraal')
assert len(services) == 1
assert services[0]['depart_st_pancras'] == '09:00'
def test_parse_no_next_data_returns_empty():
    """HTML without a __NEXT_DATA__ script tag parses to an empty list."""
    html = '<html><body>nothing here</body></html>'
    services = _parse(html, 'Paris Gare du Nord')
    assert services == []
def test_parse_graphql_no_legs_gives_empty_train_number():
    """A journey built with an empty service_name yields an empty train_number."""
    journey = _journey('09:31', '12:55', price=156, seats=37, service_name='')
    payload = _gql_response([journey])
    first = _parse_graphql(payload, 'Paris Gare du Nord')[0]
    assert first['train_number'] == ''
def test_parse_empty_departures():
    """An empty liveDepartures list parses to an empty result."""
    services = _parse(_make_next_data([]), 'Paris Gare du Nord')
    assert services == []
def test_parse_graphql_empty_journeys():
    """A response with no journeys parses to an empty list."""
    services = _parse_graphql(_gql_response([]), 'Paris Gare du Nord')
    assert services == []
def test_timetable_url_uses_station_id_table():
    """timetable_url for Paris Gare du Nord yields the expected ids and slugs."""
    expected = (
        'https://www.eurostar.com/uk-en/travel-info/timetable/'
        '7015400/8727100/london-st-pancras-intl/paris-gare-du-nord'
    )
    assert timetable_url('Paris Gare du Nord') == expected
def test_timetable_url_slugifies_destination_name():
assert timetable_url('Rotterdam Centraal') == (
'https://www.eurostar.com/uk-en/travel-info/timetable/'
'7015400/8400530/london-st-pancras-intl/rotterdam-centraal'
# ---------------------------------------------------------------------------
# search_url
# ---------------------------------------------------------------------------
def test_search_url():
    """search_url builds the one-adult outbound search link for the given date."""
    expected = (
        'https://www.eurostar.com/search/uk-en'
        '?adult=1&origin=7015400&destination=8727100&outbound=2026-04-10'
    )
    actual = search_url('Paris Gare du Nord', '2026-04-10')
    assert actual == expected