conference-check/check.py
2024-02-25 17:23:38 +00:00

176 lines
5.4 KiB
Python
Executable file

#!/usr/bin/python3
"""Check if conference websites are live."""
import configparser
import os
import re
import smtplib
import typing
import warnings
from datetime import date
from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid
import requests
import yaml
from requests.adapters import HTTPAdapter
from urllib3.exceptions import InsecureRequestWarning # type: ignore
from urllib3.util.url import parse_url # type: ignore
class AbsoluteDNSAdapter(HTTPAdapter):
    """A custom adapter for requests to ensure hostnames are treated as absolute.

    Every request sent through this adapter has a trailing dot appended to
    the hostname of its URL before it is handed to the stock HTTPAdapter.
    """

    def add_dot_to_hostname(self, url: str) -> str:
        """Return url with a dot appended to its hostname.

        The URL is re-serialised from its parsed form. A hostname that
        already ends with a dot is kept as it is, and a URL without any
        hostname is returned unchanged.
        """
        parsed_url = parse_url(url)
        hostname = parsed_url.host
        # The parsed host can be None/empty (URL without an authority part);
        # there is nothing to make absolute, and calling .endswith() on None
        # would raise AttributeError.
        if not hostname:
            return url
        # Append a dot to the hostname if it's not already there.
        if not hostname.endswith("."):
            hostname += "."
        # Reconstruct the URL with the modified hostname.
        new_url: str = parsed_url._replace(host=hostname).url
        return new_url

    def send(self, request, **kwargs):  # type: ignore
        """Rewrite the request URL to use an absolute hostname, then send it."""
        request.url = self.add_dot_to_hostname(request.url)
        return super().send(request, **kwargs)
# Configuration lives in $XDG_CONFIG_HOME/conference-check/config. An unset
# *or empty* XDG_CONFIG_HOME falls back to ~/.config; joining onto an empty
# string would otherwise produce a path relative to the current directory.
config_file_path = os.path.expanduser(
    os.path.join(
        os.getenv("XDG_CONFIG_HOME") or "~/.config", "conference-check", "config"
    )
)

# The path is already expanded above, so it is passed to read() as is.
config = configparser.ConfigParser()
config.read(config_file_path)
# Silence urllib3's InsecureRequestWarning and nothing else; every other
# warning category keeps its normal behaviour.
warnings.filterwarnings(action="ignore", category=InsecureRequestWarning)
# Captures the text of the first <title> element in group 1. The tag name is
# matched case-insensitively and the opening tag may carry attributes
# (e.g. <title lang="en">); DOTALL lets the title text span several lines.
re_title = re.compile(r"<title\b[^>]*>(.*?)</title\s*>", re.DOTALL | re.IGNORECASE)

# Default request headers: a browser-like User-Agent and an HTML Accept type.
AGENT = "Mozilla/5.0 (Windows NT 6.1) Gecko/20100101 Firefox/29.0"
headers = {"User-Agent": AGENT, "Accept": "text/html"}
# One adapter instance serves both URL schemes of the shared session, which
# also carries the default headers for every request made through it.
adapter = AbsoluteDNSAdapter()
s = requests.Session()
s.headers.update(headers)
s.mount("https://", adapter)
s.mount("http://", adapter)
not_here_list = [
"The specified URL was not found.",
"There is currently no text in this page.",
"This page does not exist yet",
"404 Not Found",
"500 Internal Server Error",
"Test Page for the Apache HTTP Server",
"Site not found &middot; GitHub Pages",
"504: Gateway time-out",
"This page doesn't exist (404)",
"Coming soon",
"NOT_FOUND",
"Resource Not Found",
]
def find_not_here_message(html: str) -> str | None:
"""Find not here message in web page."""
return next((not_here for not_here in not_here_list if not_here in html), None)
def get_title(html: str) -> str:
    """Return the title of a web page, or "no title".

    The text captured by re_title is whitespace-normalised: leading and
    trailing whitespace is dropped and internal runs (including newlines)
    become a single space, so the result is always one line. "no title" is
    returned when there is no title element or its text is empty or
    whitespace only.
    """
    m = re_title.search(html)
    if not m:
        return "no title"
    # Normalise before testing for emptiness so that a whitespace-only title
    # is reported as "no title" rather than as an empty string.
    title = " ".join((m.group(1) or "").split())
    return title or "no title"
def check_conference(
    name: str, url: str, timeout: float = 30.0
) -> tuple[bool, str]:
    """Check if conference is live.

    Fetches url through the shared session and looks for a known
    "not here" phrase in the response text.

    Args:
        name: Conference name. Not used by the check itself.
        url: Address of the conference website to fetch.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled server would block the whole run indefinitely.

    Returns:
        (True, page title) when the site looks live, otherwise
        (False, reason) where reason is the matched "not here" phrase or a
        description of the request failure.
    """
    try:
        # SotM Baltics has an invalid TLS certificate, but we don't care
        r = s.get(url, verify=False, timeout=timeout)
    except requests.exceptions.ConnectionError:
        return (False, "connection refused")
    except requests.exceptions.RequestException as exc:
        # Read timeouts, redirect loops, malformed URLs, ...: one bad site
        # must not abort the checks of all remaining conferences.
        return (False, f"request failed: {exc}")

    not_here = find_not_here_message(r.text)
    return (False, not_here) if not_here else (True, get_title(r.text))
def send_mail(subject: str, body: str) -> None:
    """Send a plain-text UTF-8 e-mail using the settings from the config.

    Sender, recipient and SMTP host come from the [mail] section. Every
    option of the optional [mail_headers] section is added as an extra
    message header.

    Args:
        subject: Subject line of the message.
        body: Plain-text message body.

    Raises:
        KeyError: If the [mail] section or one of its options is missing.
    """
    mail_from_address = config["mail"]["from_address"]
    mail_from_name = config["mail"]["from_name"]
    mail_to_address = config["mail"]["to_address"]
    mail_to_name = config["mail"]["to_name"]

    msg = MIMEText(body, "plain", "UTF-8")
    msg["Subject"] = subject
    msg["To"] = f"{mail_to_name} <{mail_to_address}>"
    msg["From"] = f"{mail_from_name} <{mail_from_address}>"
    msg["Date"] = formatdate()
    msg["Message-ID"] = make_msgid()

    # extra mail headers from config; the section may be absent
    if config.has_section("mail_headers"):
        for header_name, value in config["mail_headers"].items():
            msg[header_name] = value

    # The context manager closes the connection even when sending fails.
    # The local name avoids shadowing the module-level requests session "s".
    with smtplib.SMTP(config["mail"]["smtp_host"]) as smtp:
        smtp.sendmail(mail_from_address, [mail_to_address], msg.as_string())
def load_yaml(name: str) -> typing.Any:
    """Load the YAML file whose path is stored under [data] name in the config.

    Args:
        name: Option name inside the [data] config section; its value is the
            path of the YAML file ("~" is expanded).

    Returns:
        The parsed YAML document.

    Raises:
        KeyError: If the [data] section or the option is missing.
        FileNotFoundError: If the configured file does not exist.
    """
    filename = os.path.expanduser(config["data"][name])
    # open() already fails with a clear FileNotFoundError for a missing file,
    # unlike an assert, which disappears under "python -O". The with-block
    # closes the file handle instead of leaving it to the garbage collector.
    with open(filename, encoding="utf-8") as f:
        return yaml.safe_load(f)
def main(show_not_live: bool = False) -> None:
    """Check each conference and send a mail for every site that went live.

    For each configured conference the URL template is filled in with the
    current and the next year. Conference/year pairs already recorded in the
    "live" YAML file are skipped. A newly live site triggers a notification
    mail and is appended to the "live" file.

    Args:
        show_not_live: Currently unused; kept for interface compatibility.

    Raises:
        ValueError: If a conference URL lacks the "{year}" placeholder.
    """
    today = date.today()
    this_year = today.year

    # An empty YAML file loads as None; treat it as "no entries".
    conferences = load_yaml("conferences") or {}
    live_conferences = load_yaml("live") or []
    live_set = {(c["conference"], c["year"]) for c in live_conferences}

    new_live = False
    try:
        for name, src_url in conferences.items():
            for year in (this_year, this_year + 1):
                if (name, year) in live_set:
                    continue
                # Explicit check instead of assert, which "python -O" strips.
                if "{year}" not in src_url:
                    raise ValueError(
                        f"URL template for {name!r} lacks a {{year}} "
                        f"placeholder: {src_url!r}"
                    )
                url = src_url.format(year=year)
                live, msg = check_conference(name, url)
                if not live:
                    continue
                body = f"{name}\n{url}\nWeb page title: {msg}"
                send_mail(f"Conference site live: {name}", body)
                new_live = True
                live_conferences.append(
                    {"conference": name, "year": year, "live": today}
                )
    finally:
        # Persist even when a later check or mail fails, so conferences that
        # were already announced are not announced again on the next run.
        if new_live:
            live_filename = os.path.expanduser(config["data"]["live"])
            with open(live_filename, "w") as out:
                yaml.dump(live_conferences, stream=out, sort_keys=False)


if __name__ == "__main__":
    main()