agenda/agenda/airbnb.py

191 lines
5.9 KiB
Python

"""Library for parsing Airbnb booking HTML files."""
import json
import re
import typing
from datetime import datetime
from typing import Any
from zoneinfo import ZoneInfo
import lxml.html
import pycountry
# Shorthand for the loosely-typed, string-keyed mappings passed around this
# module (decoded page data, reservation rows, and the final booking record).
StrDict = dict[str, typing.Any]
def build_datetime(date_str: str, time_str: str, tz_name: str) -> datetime:
    """Build a timezone-aware datetime from separate date, time and zone strings.

    The date and time are joined with a ``T`` and parsed by
    ``datetime.fromisoformat``; the named zone is then attached to the parsed
    value unchanged (the wall-clock fields are not shifted).

    Args:
        date_str: Calendar date in ISO format, e.g. ``2024-05-01``.
        time_str: Time of day, e.g. ``15:00``.
        tz_name: Key accepted by ``zoneinfo.ZoneInfo``, e.g. ``Europe/London``.

    Returns:
        The parsed datetime with ``tzinfo`` set to the named zone.
    """
    wall_clock = datetime.fromisoformat(f"{date_str}T{time_str}")
    zone = ZoneInfo(tz_name)
    return wall_clock.replace(tzinfo=zone)
def list_to_dict(items: list[typing.Any]) -> dict[str, typing.Any]:
    """Pair up a flat ``[key, value, key, value, ...]`` list into a dict.

    Elements at even positions become keys and the element that follows each
    one becomes its value. A repeated key keeps the last value seen. A list
    of odd length raises ``IndexError`` because the final key has no value.
    """
    paired: dict[str, typing.Any] = {}
    for key_pos in range(0, len(items), 2):
        paired[items[key_pos]] = items[key_pos + 1]
    return paired
def extract_country_code(address: str) -> str | None:
    """Return the lower-cased ISO 3166-1 alpha-2 code of a country named in *address*.

    Every country's ``name`` and, when present, ``official_name`` is looked
    for in the address, ignoring case. A name only counts when it appears as
    a whole word or phrase, and the longest matching name wins. This stops a
    short name that is merely contained in a longer one from being reported
    (for example "Niger" inside "Nigeria", or "Jersey" inside
    "New Jersey, United States").

    Args:
        address: Free-text postal address.

    Returns:
        The two-letter code in lower case, or ``None`` if no country name is
        found in the address.
    """
    address_lower = address.lower()
    best_code: str | None = None
    best_length = 0

    for country in pycountry.countries:
        candidates = (country.name, getattr(country, "official_name", None))
        for candidate in candidates:
            if not candidate:
                continue
            needle = candidate.lower()
            # Only a strictly longer name can replace the current best, so
            # ties keep the first country encountered.
            if len(needle) <= best_length:
                continue
            # Cheap substring test first; the regex is only needed to
            # confirm that the hit is not part of a larger word.
            if needle not in address_lower:
                continue
            # Lookarounds instead of ``\b`` so names ending in punctuation
            # still match at the end of the address.
            if re.search(rf"(?<!\w){re.escape(needle)}(?!\w)", address_lower):
                best_code = str(country.alpha_2.lower())
                best_length = len(needle)

    return best_code
def get_json_blob(tree: Any) -> str:
    """Return the text of the element whose id is ``data-injector-instances``.

    Args:
        tree: Object exposing an ``xpath`` method (presumably a parsed lxml
            document - confirm against callers).

    Returns:
        The first text node of the matching element, as a ``str``. An empty
        XPath result makes the ``[0]`` lookup fail.
    """
    text_nodes = tree.xpath('//*[@id="data-injector-instances"]/text()')
    return str(text_nodes[0])
def get_ui_state(tree: Any) -> StrDict:
    """Decode the page's embedded JSON and return its ``uiState`` as a dict.

    The JSON text is fetched with ``get_json_blob``, the first ``uiState``
    value found anywhere in the decoded structure is taken, and its first
    element (a flat ``[key, value, ...]`` list) is turned into a dict.

    Args:
        tree: Parsed document exposing an ``xpath`` method.

    Returns:
        Mapping built from the alternating keys and values of ``uiState[0]``.

    Raises:
        ValueError: If the decoded JSON holds no non-empty ``uiState``.
    """
    # Reuse the shared lookup rather than repeating the XPath query here.
    big_blob = json.loads(get_json_blob(tree))
    ui_state = walk_tree(big_blob, "uiState")
    if not ui_state:
        # Fail with a clear message instead of an opaque subscript error.
        raise ValueError("Could not find uiState in embedded page data")
    return list_to_dict(ui_state[0])
def get_reservation_data(ui_state: StrDict) -> StrDict:
    """Index the reservation's scheduled-event rows by their ``id``.

    Reads ``ui_state["reservation"]["scheduled_event"]["rows"]`` and returns
    a dict mapping each row's ``"id"`` to the row itself. If two rows share
    an id, the later row wins.
    """
    rows = ui_state["reservation"]["scheduled_event"]["rows"]
    rows_by_id = {}
    for row in rows:
        rows_by_id[row["id"]] = row
    return rows_by_id
def get_room_url(tree: Any) -> str | None:
for e in tree.xpath('//a[@data-testid="reservation-destination-link"]'):
href = e.get("href")
assert isinstance(href, str)
if not href.startswith("/room"):
continue
return "https://www.airbnb.co.uk" + href
return None
def get_price_from_reservation(reservation: StrDict) -> str:
    """Return the booking's total price as text, without the pound sign.

    Reads ``reservation["payment_summary"]["subtitle"]``, drops a leading
    ``"Total cost: "`` label if present, and strips the leading ``£``.

    Args:
        reservation: Reservation rows keyed by row id.

    Returns:
        The amount exactly as shown on the page (for example ``"1,234.56"``).

    Raises:
        TypeError: If the subtitle is not a string.
        ValueError: If the amount does not start with ``£``.
    """
    price = reservation["payment_summary"]["subtitle"]
    if not isinstance(price, str):
        raise TypeError(
            f"Expected payment summary subtitle to be str, got {type(price).__name__}"
        )

    price = price.removeprefix("Total cost: ")

    # Explicit check rather than ``assert``: with asserts disabled, a price
    # in another currency would silently lose its first character.
    if not price.startswith("£"):
        raise ValueError(f"Unexpected price format: {price!r}")
    return price[1:]
def extract_booking_from_html(html_file: str) -> StrDict:
    """Build a booking record from a saved Airbnb reservation page.

    The file is read once as text to find the confirmation code, then parsed
    with lxml so the embedded ``uiState`` data and the page markup can be
    queried.

    Args:
        html_file: Path of the saved HTML page.

    Returns:
        A dict with the keys ``type``, ``operator``, ``name``, ``location``,
        ``booking_reference``, ``booking_url``, ``country``, ``latitude``,
        ``longitude``, ``timezone``, ``from``, ``to``, ``price``,
        ``currency`` and ``number_of_adults``. ``address`` is added when the
        page has a non-empty one, and ``url`` when a listing link is found.
        ``currency`` is always ``"GBP"`` and ``number_of_adults`` is the
        total guest count shown on the page.

    Raises:
        ValueError: If the confirmation code or the guest count cannot be
            found in the page.
    """
    with open(html_file, "r", encoding="utf-8") as handle:
        page_source = handle.read()

    # The confirmation code is taken from the reservation-details URL.
    code_match = re.search(
        r"/trips/v1/reservation-details/ro/RESERVATION2_CHECKIN/([A-Z0-9]+)",
        page_source,
    )
    if code_match is None:
        raise ValueError("Could not find confirmation code in HTML")
    confirmation_code = code_match.group(1)
    booking_url = f"https://www.airbnb.co.uk/trips/v1/reservation-details/ro/RESERVATION2_CHECKIN/{confirmation_code}"

    document = lxml.html.parse(html_file)
    root = document.getroot()

    try:
        ui_state = get_ui_state(document)
    except Exception:
        # Show which file failed before letting the error propagate.
        print(html_file)
        raise

    rows_by_id = get_reservation_data(ui_state)

    guest_match = re.match(r"^(\d+) guests?$", rows_by_id["guests"]["subtitle"])
    if guest_match is None:
        raise ValueError("Could not parse number of guests")
    guest_count = int(guest_match.group(1))

    price = get_price_from_reservation(rows_by_id)

    metadata = ui_state["reservation"]["metadata"]
    country_code = metadata["country"].lower()

    # Keep whatever follows the last " in " of the title (the whole title
    # when that separator is absent).
    marquee_title = rows_by_id["dynamic_marquee_title_image_v3"]["title"]
    _, _, location = marquee_title.rpartition(" in ")

    arrival_guide = rows_by_id["checkin_checkout_arrival_guide"]
    check_in_time = arrival_guide["leading_subtitle"]
    check_out_time = arrival_guide["trailing_subtitle"]

    check_in = build_datetime(
        metadata["check_in_date"], check_in_time, metadata["timezone"]
    )
    check_out = build_datetime(
        metadata["check_out_date"], check_out_time, metadata["timezone"]
    )

    address = None
    if "map" in rows_by_id:
        address = rows_by_id["map"]["address"]

    # Use the listing-name row when present, otherwise the first <h1>.
    name = (
        rows_by_id["header_action.pdp"]["subtitle"]
        if "header_action.pdp" in rows_by_id
        else root.findtext(".//h1")
    )

    booking = {
        "type": "apartment",
        "operator": "airbnb",
        "name": name,
        "location": location,
        "booking_reference": confirmation_code,
        "booking_url": booking_url,
        "country": country_code,
        "latitude": metadata["lat"],
        "longitude": metadata["lng"],
        "timezone": metadata["timezone"],
        "from": check_in,
        "to": check_out,
        "price": price,
        "currency": "GBP",
        "number_of_adults": guest_count,
    }

    # Optional fields are only added when the page provides them.
    if address:
        booking["address"] = address

    listing_url = get_room_url(document)
    if listing_url is not None:
        booking["url"] = listing_url

    return booking
def walk_tree(data: Any, want_key: str) -> Any:
    """Return the value stored under the first *want_key* found in *data*.

    Performs a depth-first search through nested dicts and lists. A dict
    that contains *want_key* directly yields that value immediately, even
    if it is ``None``. Otherwise its values (or a list's items) are searched
    in order, and the first result that is not ``None`` is returned.

    Args:
        data: Arbitrarily nested structure; anything other than a dict or
            list is treated as a leaf.
        want_key: Dict key to look for.

    Returns:
        The matching value, or ``None`` if the key is absent. A nested match
        whose value is ``None`` is treated as not found and the search
        continues.
    """
    if isinstance(data, dict):
        if want_key in data:
            return data[want_key]
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return None

    for child in children:
        found = walk_tree(child, want_key)
        if found is not None:
            return found
    return None
def parse_multiple_files(filenames: list[str]) -> list[StrDict]:
    """Extract one booking per saved Airbnb page.

    The paths are processed in sorted order, so the returned list follows
    that order rather than the order of *filenames*. Any error raised while
    parsing a file propagates to the caller.
    """
    parsed_bookings = []
    for path in sorted(filenames):
        record = extract_booking_from_html(path)
        assert record
        parsed_bookings.append(record)
    return parsed_bookings