Split waste_schedule.py in two

This commit is contained in:
Edward Betts 2024-07-14 22:19:13 +08:00
parent 17eca6a95a
commit 0f3f596cb3
5 changed files with 107 additions and 93 deletions

View file

@ -2,103 +2,18 @@
import json import json
import os import os
import re
import typing import typing
from collections import defaultdict from collections import defaultdict
from datetime import date, datetime, time, timedelta from datetime import date, datetime, timedelta
import httpx import httpx
import lxml.html
from . import uk_time
from .types import Event from .types import Event
from .utils import make_waste_dir
ttl_hours = 12 ttl_hours = 12
def make_waste_dir(data_dir: str) -> None:
    """Make waste dir if missing.

    Ensures the directory ``<data_dir>/waste`` exists. Calling this when the
    directory is already present is a no-op. Missing parent directories are
    created as well.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    waste_dir = os.path.join(data_dir, "waste")
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() before calling os.mkdir().
    os.makedirs(waste_dir, exist_ok=True)
async def get_html(data_dir: str, postcode: str, uprn: str) -> str:
    """Get waste schedule.

    Return the HTML of the North Somerset collection schedule page for the
    given property. Downloads are cached as ``<data_dir>/waste/<timestamp>.html``
    and the newest cached copy is returned when it is less than ``ttl_hours``
    old; otherwise the page is fetched again and saved under the current time.

    Args:
        data_dir: directory that holds the ``waste`` cache directory.
        postcode: value sent as the ``Postcode`` form field.
        uprn: value sent as the ``SelectedUprn`` form field.

    Returns:
        The schedule page HTML, either cached or freshly downloaded.

    Raises:
        ValueError: if the newest ``.html`` file in the cache directory does
            not follow the ``%Y-%m-%d_%H:%M.html`` naming scheme.
    """
    now = datetime.now()
    waste_dir = os.path.join(data_dir, "waste")

    make_waste_dir(data_dir)

    existing = [f for f in os.listdir(waste_dir) if f.endswith(".html")]
    if existing:
        # Filenames are zero-padded timestamps, so the lexicographic maximum
        # is the most recent download.
        recent_filename = max(existing)
        recent = datetime.strptime(recent_filename, "%Y-%m-%d_%H:%M.html")
        if now - recent < timedelta(hours=ttl_hours):
            with open(os.path.join(waste_dir, recent_filename)) as cached:
                return cached.read()

    now_str = now.strftime("%Y-%m-%d_%H:%M")
    filename = f"{waste_dir}/{now_str}.html"

    forms_base_url = "https://forms.n-somerset.gov.uk"
    url = "https://forms.n-somerset.gov.uk/Waste/CollectionSchedule"
    pattern = r'<h2>Object moved to <a href="([^"]*)">here<\/a>\.<\/h2>'

    # Keep both requests inside the client context so the follow-up GET is
    # never issued on a client that has already been closed.
    async with httpx.AsyncClient() as client:
        r = await client.post(
            url,
            data={
                "PreviousHouse": "",
                "PreviousPostcode": "-",
                "Postcode": postcode,
                "SelectedUprn": uprn,
            },
        )
        # The form replies with an "Object moved" page; follow its link by
        # hand to reach the schedule. Without a match the POST body is used.
        m = re.search(pattern, r.text)
        if m:
            r = await client.get(forms_base_url + m.group(1))
        html = r.text

    with open(filename, "w") as out:
        out.write(html)
    return html
def parse_waste_schedule_date(day_and_month: str) -> date:
    """Parse waste schedule date.

    The schedule gives dates without a year, e.g. ``"Monday 15 July"``. The
    date is resolved to the current year unless that would be in the past (or
    is not a valid date in the current year), in which case next year is used.

    Args:
        day_and_month: text in ``%A %d %B`` form (weekday, day, month name).

    Returns:
        The first matching date on or after today, looking no further than
        next year.

    Raises:
        ValueError: if the text cannot be parsed as a date in the year chosen.
    """
    today = date.today()
    fmt = "%A %d %B %Y"

    try:
        d = datetime.strptime(f"{day_and_month} {today.year}", fmt).date()
    except ValueError:
        # The day may only exist next year (29 February seen from the year
        # before a leap year); unparseable input still raises ValueError.
        return datetime.strptime(f"{day_and_month} {today.year + 1}", fmt).date()

    if d < today:
        # Already passed this year, so the schedule must mean next year.
        d = datetime.strptime(f"{day_and_month} {today.year + 1}", fmt).date()
    return d
def parse(root: lxml.html.HtmlElement) -> list[Event]:
    """Parse waste schedule.

    Reads the first ``table/tbody`` under ``root``. Each row must have exactly
    three cells: service name, next collection date and following collection
    date. Services are grouped by date into one event per collection day.

    Args:
        root: parsed HTML of the collection schedule page.

    Returns:
        One ``Event`` per distinct collection date, timed at 06:30 and titled
        with the comma-separated services collected that day.

    Raises:
        ValueError: if the table is missing, a row does not have three cells,
            a cell has no text, or a date cannot be parsed.
    """
    tbody = root.find(".//table/tbody")
    # Explicit checks instead of assert: asserts are stripped under -O.
    if tbody is None:
        raise ValueError("waste schedule table not found")

    by_date = defaultdict(list)
    for e_service, e_next_date, e_following in tbody:
        if not (e_service.text and e_next_date.text and e_following.text):
            raise ValueError("waste schedule row has an empty cell")
        service = e_service.text
        # A service appears under both its next and its following date.
        by_date[parse_waste_schedule_date(e_next_date.text)].append(service)
        by_date[parse_waste_schedule_date(e_following.text)].append(service)

    return [
        Event(
            name="waste_schedule",
            date=uk_time(d, time(6, 30)),
            title="Backwell: " + ", ".join(services),
        )
        for d, services in by_date.items()
    ]
BristolSchedule = list[dict[str, typing.Any]] BristolSchedule = list[dict[str, typing.Any]]

View file

@ -15,6 +15,7 @@ import pytz
from . import ( from . import (
accommodation, accommodation,
birthday, birthday,
bristol_waste,
busy, busy,
carnival, carnival,
conference, conference,
@ -26,13 +27,13 @@ from . import (
hn, hn,
holidays, holidays,
meetup, meetup,
n_somerset_waste,
stock_market, stock_market,
subscription, subscription,
sun, sun,
thespacedevs, thespacedevs,
travel, travel,
uk_holiday, uk_holiday,
waste_schedule,
) )
from .types import Event, StrDict from .types import Event, StrDict
@ -62,9 +63,9 @@ async def waste_collection_events(
data_dir: str, postcode: str, uprn: str data_dir: str, postcode: str, uprn: str
) -> list[Event]: ) -> list[Event]:
"""Waste colllection events.""" """Waste colllection events."""
html = await waste_schedule.get_html(data_dir, postcode, uprn) html = await n_somerset_waste.get_html(data_dir, postcode, uprn)
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
events = waste_schedule.parse(root) events = n_somerset_waste.parse(root)
return events return events
@ -72,7 +73,7 @@ async def bristol_waste_collection_events(
data_dir: str, start_date: date, uprn: str data_dir: str, start_date: date, uprn: str
) -> list[Event]: ) -> list[Event]:
"""Waste colllection events.""" """Waste colllection events."""
return await waste_schedule.get_bristol_gov_uk(start_date, data_dir, uprn) return await bristol_waste.get_bristol_gov_uk(start_date, data_dir, uprn)
def find_events_during_stay( def find_events_during_stay(

View file

@ -0,0 +1,91 @@
"""Waste collection schedules."""
import os
import re
from collections import defaultdict
from datetime import date, datetime, time, timedelta
import httpx
import lxml.html
from . import uk_time
from .types import Event
from .utils import make_waste_dir
# Maximum age, in hours, of a cached schedule page before get_html() downloads
# a fresh copy.
ttl_hours = 12
async def get_html(data_dir: str, postcode: str, uprn: str) -> str:
    """Get waste schedule.

    Return the HTML of the North Somerset collection schedule page for the
    given property. Downloads are cached as ``<data_dir>/waste/<timestamp>.html``
    and the newest cached copy is returned when it is less than ``ttl_hours``
    old; otherwise the page is fetched again and saved under the current time.

    Args:
        data_dir: directory that holds the ``waste`` cache directory.
        postcode: value sent as the ``Postcode`` form field.
        uprn: value sent as the ``SelectedUprn`` form field.

    Returns:
        The schedule page HTML, either cached or freshly downloaded.

    Raises:
        ValueError: if the newest ``.html`` file in the cache directory does
            not follow the ``%Y-%m-%d_%H:%M.html`` naming scheme.
    """
    now = datetime.now()
    waste_dir = os.path.join(data_dir, "waste")

    make_waste_dir(data_dir)

    existing = [f for f in os.listdir(waste_dir) if f.endswith(".html")]
    if existing:
        # Filenames are zero-padded timestamps, so the lexicographic maximum
        # is the most recent download.
        recent_filename = max(existing)
        recent = datetime.strptime(recent_filename, "%Y-%m-%d_%H:%M.html")
        if now - recent < timedelta(hours=ttl_hours):
            with open(os.path.join(waste_dir, recent_filename)) as cached:
                return cached.read()

    now_str = now.strftime("%Y-%m-%d_%H:%M")
    filename = f"{waste_dir}/{now_str}.html"

    forms_base_url = "https://forms.n-somerset.gov.uk"
    url = "https://forms.n-somerset.gov.uk/Waste/CollectionSchedule"
    pattern = r'<h2>Object moved to <a href="([^"]*)">here<\/a>\.<\/h2>'

    # Keep both requests inside the client context so the follow-up GET is
    # never issued on a client that has already been closed.
    async with httpx.AsyncClient() as client:
        r = await client.post(
            url,
            data={
                "PreviousHouse": "",
                "PreviousPostcode": "-",
                "Postcode": postcode,
                "SelectedUprn": uprn,
            },
        )
        # The form replies with an "Object moved" page; follow its link by
        # hand to reach the schedule. Without a match the POST body is used.
        m = re.search(pattern, r.text)
        if m:
            r = await client.get(forms_base_url + m.group(1))
        html = r.text

    with open(filename, "w") as out:
        out.write(html)
    return html
def parse_waste_schedule_date(day_and_month: str) -> date:
    """Parse waste schedule date.

    The schedule gives dates without a year, e.g. ``"Monday 15 July"``. The
    date is resolved to the current year unless that would be in the past (or
    is not a valid date in the current year), in which case next year is used.

    Args:
        day_and_month: text in ``%A %d %B`` form (weekday, day, month name).

    Returns:
        The first matching date on or after today, looking no further than
        next year.

    Raises:
        ValueError: if the text cannot be parsed as a date in the year chosen.
    """
    today = date.today()
    fmt = "%A %d %B %Y"

    try:
        d = datetime.strptime(f"{day_and_month} {today.year}", fmt).date()
    except ValueError:
        # The day may only exist next year (29 February seen from the year
        # before a leap year); unparseable input still raises ValueError.
        return datetime.strptime(f"{day_and_month} {today.year + 1}", fmt).date()

    if d < today:
        # Already passed this year, so the schedule must mean next year.
        d = datetime.strptime(f"{day_and_month} {today.year + 1}", fmt).date()
    return d
def parse(root: lxml.html.HtmlElement) -> list[Event]:
    """Parse waste schedule.

    Reads the first ``table/tbody`` under ``root``. Each row must have exactly
    three cells: service name, next collection date and following collection
    date. Services are grouped by date into one event per collection day.

    Args:
        root: parsed HTML of the collection schedule page.

    Returns:
        One ``Event`` per distinct collection date, timed at 06:30 and titled
        with the comma-separated services collected that day.

    Raises:
        ValueError: if the table is missing, a row does not have three cells,
            a cell has no text, or a date cannot be parsed.
    """
    tbody = root.find(".//table/tbody")
    # Explicit checks instead of assert: asserts are stripped under -O.
    if tbody is None:
        raise ValueError("waste schedule table not found")

    by_date = defaultdict(list)
    for e_service, e_next_date, e_following in tbody:
        if not (e_service.text and e_next_date.text and e_following.text):
            raise ValueError("waste schedule row has an empty cell")
        service = e_service.text
        # A service appears under both its next and its following date.
        by_date[parse_waste_schedule_date(e_next_date.text)].append(service)
        by_date[parse_waste_schedule_date(e_following.text)].append(service)

    return [
        Event(
            name="waste_schedule",
            date=uk_time(d, time(6, 30)),
            title="Backwell: " + ", ".join(services),
        )
        for d, services in by_date.items()
    ]

View file

@ -77,3 +77,10 @@ def get_most_recent_file(directory: str, ext: str) -> str | None:
return None return None
existing.sort(reverse=True) existing.sort(reverse=True)
return os.path.join(directory, existing[0][1]) return os.path.join(directory, existing[0][1])
def make_waste_dir(data_dir: str) -> None:
    """Make waste dir if missing.

    Ensures the directory ``<data_dir>/waste`` exists. Calling this when the
    directory is already present is a no-op. Missing parent directories are
    created as well.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    waste_dir = os.path.join(data_dir, "waste")
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() before calling os.mkdir().
    os.makedirs(waste_dir, exist_ok=True)

View file

@ -13,6 +13,7 @@ import deepdiff # type: ignore
import flask import flask
import requests import requests
import agenda.bristol_waste
import agenda.fx import agenda.fx
import agenda.geomob import agenda.geomob
import agenda.gwr import agenda.gwr
@ -20,7 +21,6 @@ import agenda.mail
import agenda.thespacedevs import agenda.thespacedevs
import agenda.types import agenda.types
import agenda.uk_holiday import agenda.uk_holiday
import agenda.waste_schedule
from agenda.types import StrDict from agenda.types import StrDict
from web_view import app from web_view import app
@ -39,7 +39,7 @@ async def update_bank_holidays(config: flask.config.Config) -> None:
async def update_bristol_bins(config: flask.config.Config) -> None: async def update_bristol_bins(config: flask.config.Config) -> None:
"""Update waste schedule from Bristol City Council.""" """Update waste schedule from Bristol City Council."""
t0 = time() t0 = time()
events = await agenda.waste_schedule.get_bristol_gov_uk( events = await agenda.bristol_waste.get_bristol_gov_uk(
date.today(), config["DATA_DIR"], config["BRISTOL_UPRN"], refresh=True date.today(), config["DATA_DIR"], config["BRISTOL_UPRN"], refresh=True
) )
time_taken = time() - t0 time_taken = time() - t0