agenda/parse_airbnb.py

211 lines
6 KiB
Python
Executable file

#!/usr/bin/python3
import json
import re
import sys
import typing
from datetime import datetime
from typing import Any
from zoneinfo import ZoneInfo
import lxml.html
import pycountry
import yaml
# Alias for string-keyed dictionaries whose values may be of any type; used
# for the decoded page data and for the booking record built from it.
StrDict = dict[str, typing.Any]
def build_datetime(date_str: str, time_str: str, tz_name: str) -> datetime:
    """
    Build a timezone-aware datetime from separate date and time strings.

    The two strings are joined with a "T" and parsed as a single ISO
    timestamp; the zone named by ``tz_name`` is then attached to the parsed
    value without shifting its wall-clock time.
    """
    wall_clock = datetime.fromisoformat(date_str + "T" + time_str)
    zone = ZoneInfo(tz_name)
    return wall_clock.replace(tzinfo=zone)
def list_to_dict(items: list) -> dict[str, Any]:
    """Convert a flat list of alternating keys and values into a dict.

    ``[k0, v0, k1, v1, ...]`` becomes ``{k0: v0, k1: v1, ...}``. When a key
    is repeated, the last value wins.

    Raises:
        IndexError: If ``items`` has an odd number of elements, i.e. the
            final key has no matching value.
    """
    if len(items) % 2:
        # Same exception type an out-of-range lookup would give, but with a
        # message that names the actual problem.
        raise IndexError(
            f"expected alternating keys and values, got {len(items)} items"
        )
    return dict(zip(items[::2], items[1::2]))
def extract_country_code(address: str) -> str | None:
    """Return the ISO 3166-1 alpha-2 country code named in a free-text address.

    Every country's ``name`` (and ``official_name`` when it has one) is
    compared case-insensitively against the address. A name only counts when
    it appears as a whole word or phrase, so "Niger" is not found inside
    "Nigeria" nor "Oman" inside "Romania". When several names match, the
    longest one wins, so "Papua New Guinea" beats "Guinea".

    Returns:
        The lower-case alpha-2 code of the best match, or ``None`` when no
        country name occurs in the address.
    """
    address_lower = address.lower()
    best_code: str | None = None
    best_length = 0
    for country in pycountry.countries:
        candidates = [country.name]
        if hasattr(country, "official_name"):
            candidates.append(country.official_name)
        for candidate in candidates:
            needle = candidate.lower()
            # Cheap rejections first: it cannot beat the current best, or it
            # is not even a substring of the address.
            if len(needle) <= best_length or needle not in address_lower:
                continue
            # Require non-word characters (or the string edge) on both sides
            # so that a short name embedded in a longer word is ignored.
            if re.search(rf"(?<!\w){re.escape(needle)}(?!\w)", address_lower):
                best_code = str(country.alpha_2.lower())
                best_length = len(needle)
    return best_code
def get_json_blob(tree) -> str:
    """Return the raw text of the page's ``data-injector-instances`` element.

    ``tree`` is queried through its ``xpath`` method for the text nodes of
    the element carrying that id, and the first hit is returned. Indexing an
    empty result raises ``IndexError``.
    """
    text_nodes = tree.xpath('//*[@id="data-injector-instances"]/text()')
    return text_nodes[0]
def get_ui_state(tree) -> StrDict:
    """Extract the ``uiState`` section of the embedded page data as a dict.

    The JSON text held in the ``data-injector-instances`` element is decoded
    and searched depth-first for the first ``uiState`` key. The first entry
    of that value -- a flat list of alternating keys and values -- is then
    converted into a dictionary.

    Raises:
        ValueError: If the decoded data contains no usable ``uiState`` entry.
    """
    page_data = json.loads(get_json_blob(tree))
    ui_state = walk_tree(page_data, "uiState")
    if ui_state is None:
        # Fail with a descriptive error instead of a TypeError from
        # subscripting None.
        raise ValueError("no 'uiState' entry found in embedded page data")
    return list_to_dict(ui_state[0])
def get_reservation_data(ui_state: StrDict) -> StrDict:
    """Index the reservation's scheduled-event rows by their ``id`` field.

    Reads ``ui_state["reservation"]["scheduled_event"]["rows"]`` and returns
    a mapping from each row's ``"id"`` to the row itself. If two rows share
    an id, the later one replaces the earlier.
    """
    rows_by_id = {}
    for entry in ui_state["reservation"]["scheduled_event"]["rows"]:
        rows_by_id[entry["id"]] = entry
    return rows_by_id
def get_room_url(tree) -> str:
for e in tree.xpath('//a[@data-testid="reservation-destination-link"]'):
href = e.get("href")
assert isinstance(href, str)
if not href.startswith("/room"):
continue
return "https://www.airbnb.co.uk" + href
def get_price_from_reservation(reservation: StrDict) -> str:
    """Return the total cost of a reservation as a string without the "£" sign.

    The value is read from ``reservation["payment_summary"]["subtitle"]``,
    which must look like ``"Total cost: £123.45"``; everything after the
    pound sign is returned unchanged.

    Raises:
        ValueError: If the subtitle is not a string of that form.
    """
    subtitle = reservation["payment_summary"]["subtitle"]
    prefix = "Total cost: £"
    # Explicit checks instead of ``assert`` so the validation is still
    # performed when Python runs with -O.
    if not isinstance(subtitle, str) or not subtitle.startswith(prefix):
        raise ValueError(f"unexpected payment summary subtitle: {subtitle!r}")
    return subtitle[len(prefix):]
def extract_booking_from_html(html_file: str) -> StrDict:
    """Extract booking information from a saved Airbnb reservation page.

    Args:
        html_file: Path of the saved HTML page.

    Returns:
        A booking record with the keys ``type``, ``operator``, ``name``,
        ``location``, ``booking_reference``, ``booking_url``, ``address``,
        ``country``, ``latitude``, ``longitude``, ``timezone``, ``from``,
        ``to``, ``price``, ``currency``, ``number_of_adults`` and ``url``, in
        that order. ``from`` and ``to`` are timezone-aware datetimes.

    Raises:
        ValueError: If the page contains no reservation-details link to take
            the confirmation code from, or the guest summary is not of the
            form ``"N guest"`` / ``"N guests"``.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        text_content = f.read()

    # The confirmation code is taken from the reservation-details link that
    # appears in the raw page text.
    m_code = re.search(
        r"/trips/v1/reservation-details/ro/RESERVATION2_CHECKIN/([A-Z0-9]+)",
        text_content,
    )
    if m_code is None:
        raise ValueError(f"{html_file}: no confirmation code found")
    confirmation_code = m_code.group(1)

    tree = lxml.html.parse(html_file)
    root = tree.getroot()

    try:
        ui_state = get_ui_state(tree)
    except Exception:
        # Name the offending file before propagating. Standard error is used
        # because standard output carries the YAML result.
        print(html_file, file=sys.stderr)
        raise

    reservation = get_reservation_data(ui_state)

    guests_text = reservation["guests"]["subtitle"]
    m_guests = re.match(r"^(\d+) guests?$", guests_text)
    if m_guests is None:
        raise ValueError(f"{html_file}: unexpected guest summary {guests_text!r}")
    number_of_adults = int(m_guests.group(1))

    price = get_price_from_reservation(reservation)

    metadata = ui_state["reservation"]["metadata"]
    country_code = metadata["country"].lower()

    # The marquee title ends in " in <place>"; keep only the place.
    title = reservation["dynamic_marquee_title_image_v3"]["title"]
    location = title.rpartition(" in ")[2]

    # Dates come from the metadata, times of day from the arrival guide row.
    checkin_checkout = reservation["checkin_checkout_arrival_guide"]
    check_in = build_datetime(
        metadata["check_in_date"],
        checkin_checkout["leading_subtitle"],
        metadata["timezone"],
    )
    check_out = build_datetime(
        metadata["check_out_date"],
        checkin_checkout["trailing_subtitle"],
        metadata["timezone"],
    )

    address = reservation["map"]["address"]

    # Prefer the listing name from the header action row; fall back to the
    # page's first <h1> when that row is absent.
    if "header_action.pdp" in reservation:
        name = reservation["header_action.pdp"]["subtitle"]
    else:
        name = root.findtext(".//h1")

    booking = {
        "type": "apartment",
        "operator": "airbnb",
        "name": name,
        "location": location,
        "booking_reference": confirmation_code,
        "booking_url": f"https://www.airbnb.co.uk/trips/v1/reservation-details/ro/RESERVATION2_CHECKIN/{confirmation_code}",
        "address": address,
        "country": country_code,
        "latitude": metadata["lat"],
        "longitude": metadata["lng"],
        "timezone": metadata["timezone"],
        "from": check_in,
        "to": check_out,
        "price": price,
        "currency": "GBP",
        "number_of_adults": number_of_adults,
    }
    booking["url"] = get_room_url(tree)

    return booking
def walk_tree(data: Any, want_key: str) -> Any:
    """Depth-first search of nested dicts and lists for ``want_key``.

    Returns the value stored under the first dict key equal to ``want_key``.
    A dict that holds the key directly answers with that value without
    looking any deeper. If the value found is ``None`` the search carries on
    with the remaining siblings, and ``None`` is returned when nothing else
    turns up.
    """
    if isinstance(data, dict):
        if want_key in data:
            return data[want_key]
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Scalars have nothing to descend into.
        return None

    for child in children:
        found = walk_tree(child, want_key)
        if found is not None:
            return found
    return None
def main() -> None:
    """Parse the HTML files named on the command line and print them as YAML.

    Files are handled in sorted order of their names; the resulting list of
    bookings is dumped to standard output with each record's key order kept.
    """
    collected = []
    for path in sorted(sys.argv[1:]):
        record = extract_booking_from_html(path)
        assert record
        collected.append(record)

    document = yaml.dump(collected, sort_keys=False)
    print(document)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()