conference-check/check.py

#!/usr/bin/python3
"""Check if conference websites are live."""
import os
import re
import smtplib
import warnings
from datetime import date
from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid
from urllib.parse import urlparse, urlunparse

import requests
import yaml
from requests.adapters import HTTPAdapter
from urllib3.exceptions import InsecureRequestWarning
from urllib3.util.url import parse_url

from conference import LiveConference, config, load_yaml


class AbsoluteDNSAdapter(HTTPAdapter):
    """A custom adapter for requests to ensure hostnames are treated as absolute."""

    def add_dot_to_hostname(self, url: str) -> str:
        """Append a dot to the hostname to treat it as an absolute domain name."""
        parsed_url = parse_url(url)

        # Append a dot to the hostname if it's not already there.
        hostname = parsed_url.host
        assert hostname
        if not hostname.endswith("."):
            hostname += "."

        # Reconstruct the URL with the modified hostname.
        new_url: str = parsed_url._replace(host=hostname).url
        return new_url

    def send(self, request, **kwargs):  # type: ignore
        """Override the send method to modify the request URL before sending."""
        # Modify the request URL to ensure the hostname is treated as absolute.
        request.url = self.add_dot_to_hostname(request.url)
        return super().send(request, **kwargs)
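
# Illustrative sketch (hypothetical URL) of what the adapter does to each request:
#   AbsoluteDNSAdapter().add_dot_to_hostname("https://2024.example.org/path")
#   -> "https://2024.example.org./path"
# The trailing dot marks the hostname as fully qualified, so the resolver looks it
# up as-is instead of appending any local DNS search domains.
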
# Suppress only the single InsecureRequestWarning from urllib3
warnings.filterwarnings("ignore", category=InsecureRequestWarning)

re_title = re.compile("<title>(.*?)</title>", re.DOTALL)

AGENT = config["browser"]["User-Agent"]

headers = {"User-Agent": AGENT, "Accept": "text/html"}
s = requests.Session()
s.headers.update(headers)

# Create a session and mount the custom adapter for both HTTP and HTTPS requests.
adapter = AbsoluteDNSAdapter()
s.mount("http://", adapter)
s.mount("https://", adapter)

not_here_list = [
    "The specified URL was not found.",
    "There is currently no text in this page.",
    "This page does not exist yet",
    "404 Not Found",
    "500 Internal Server Error",
    "Test Page for the Apache HTTP Server",
    "Site not found &middot; GitHub Pages",
    "504: Gateway time-out",
    "504 Gateway Time-out",
    "502 Bad Gateway",
    "This page doesn't exist (404)",
    "Coming soon",
    "NOT_FOUND",
    "Resource Not Found",
    "Wikimedia Error",
    "The page you requested could not be found",
    "Ooops! Could Not Find It",
    "OpenStreetMap Authentication Proxy",
    "Error 404",
    "Under Construction",
    "Page not found",
    "Error 404: Page not found",
    "Barcamptools",
    "That page cant be found.",
    "looks like there's no page here",
    "404 page",
    "Database Error",
    "You are not authorized to access this page",
    "Attention Required! | Cloudflare",
]


def find_not_here_message(html: str) -> str | None:
    """Find not here message in web page."""
    return next((not_here for not_here in not_here_list if not_here in html), None)


def get_title(html: str) -> str:
    """Title from web page."""
    m = re_title.search(html)
    return m.group(1).strip() if m and m.group(1) else "no title"


def normalize_url(url: str) -> str:
    """
    Normalize the URL by parsing and reconstructing it to ensure uniformity.

    This handles differences in domain casing and the trailing dot that
    AbsoluteDNSAdapter appends to hostnames.
    """
    # Parse the URL into components
    parsed_url = urlparse(url)

    # Normalize the domain to lowercase and remove any trailing dot
    normalized_netloc = parsed_url.netloc.lower().rstrip(".")

    # Reconstruct the URL with normalized components
    return urlunparse(parsed_url._replace(netloc=normalized_netloc))
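
# Illustrative example (hypothetical URL): the domain is lowercased and the
# trailing dot removed, so both spellings compare equal.
#   normalize_url("https://2024.Example.ORG./")  ->  "https://2024.example.org/"
#   normalize_url("https://2024.example.org/")   ->  "https://2024.example.org/"
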


def check_conference(
    name: str, src_url: str, year: int
) -> tuple[bool, str, str | None]:
    """Check if conference is live."""
    url = src_url.format(year=year)
    past_url = src_url.format(year=year - 1)

    try:
        # SotM Baltics has an invalid TLS certificate, but we don't care
        r = s.get(url, verify=False)
    except requests.exceptions.ConnectionError:
        return (False, "connection refused", None)

    not_here = find_not_here_message(r.text)

    if (
        len(r.text) < 2048
        and 'http-equiv="refresh"' in r.text
        and str(year) not in r.text
    ):
        return (False, "redirect to URL without year", r.url)

    if normalize_url(r.url) == normalize_url(past_url):
        return (False, "redirect to previous year", r.url)

    return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url)
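
# For reference, the tuple returned above is (live, message, final URL); a
# placeholder page might look like (False, "Coming soon", <final URL>) and a live
# site like (True, <page title>, <final URL>).  The example values are hypothetical.
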


def send_mail(subject: str, body: str) -> None:
    """Send an e-mail."""
    mail_from_address = config["mail"]["from_address"]
    mail_from_name = config["mail"]["from_name"]
    mail_to_address = config["mail"]["to_address"]
    mail_to_name = config["mail"]["to_name"]

    msg = MIMEText(body, "plain", "UTF-8")
    msg["Subject"] = subject
    msg["To"] = f"{mail_to_name} <{mail_to_address}>"
    msg["From"] = f"{mail_from_name} <{mail_from_address}>"
    msg["Date"] = formatdate()
    msg["Message-ID"] = make_msgid()

    # extra mail headers from config
    for header_name, value in config["mail_headers"].items():
        msg[header_name] = value

    s = smtplib.SMTP(config["mail"]["smtp_host"])
    s.sendmail(mail_from_address, [mail_to_address], msg.as_string())
    s.quit()
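
# The keys read above come from the shared config in the conference module.  A
# minimal sketch of the assumed layout, shown as YAML purely for illustration
# (key names are taken from this file, the values are placeholders):
#
#   mail:
#     from_address: bot@example.org
#     from_name: Conference check
#     to_address: you@example.org
#     to_name: Your Name
#     smtp_host: localhost
#   mail_headers:
#     X-Mailer: conference-check
#   browser:
#     User-Agent: "Mozilla/5.0 (compatible; conference-check)"
#   data:
#     live: ~/conference-check/live.yaml
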


def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
    """Check if an individual web site is live."""
    assert "{year}" in src_url
    live, msg, redirect_to_url = check_conference(name, src_url, year)
    url = src_url.format(year=year)

    if live:
        if redirect_to_url == url:
            body = f"{name}\n{url}\nWeb page title: {msg}"
        else:
            body = f"""{name}
{url} redirects to {redirect_to_url}
Web page title: {msg}"""
        send_mail(f"Conference site live: {name} - {year}", body)

    return live


def find_new_conference_web_sites(
    today: date, live: list[LiveConference]
) -> list[LiveConference]:
    """Check for new conference web sites going live."""
    this_year = today.year
    new: list[LiveConference] = []
    live_set = {(c["conference"], c["year"]) for c in live}

    for name, src_url in load_yaml("conferences").items():
        new += [
            {"conference": name, "year": year, "live": today}
            for year in (this_year, this_year + 1)
            if (name, year) not in live_set
            and check_conference_web_site(name, src_url, year)
        ]

    return new
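
# load_yaml("conferences") is expected to map a conference name to a URL template
# containing a "{year}" placeholder (enforced by the assert in
# check_conference_web_site).  A hypothetical entry might look like:
#
#   FOSS Example Conf: "https://{year}.example.org/"
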


def main(show_not_live: bool = False) -> None:
    """Check for new conference web sites."""
    live: list[LiveConference] = load_yaml("live")
    if not (new := find_new_conference_web_sites(date.today(), live)):
        return

    live_filename = os.path.expanduser(config["data"]["live"])
    with open(live_filename, "w") as out:
        yaml.dump(live + new, stream=out, sort_keys=False)


if __name__ == "__main__":
    main()