2023-09-06 07:57:12 +01:00
|
|
|
|
#!/usr/bin/python3
|
|
|
|
|
|
|
|
|
|
"""Check if conference websites are live."""
|
|
|
|
|
|
2024-02-11 06:56:57 +00:00
|
|
|
|
import os
|
2023-09-06 07:57:12 +01:00
|
|
|
|
import re
|
|
|
|
|
import smtplib
|
2023-10-29 11:52:15 +00:00
|
|
|
|
import warnings
|
2024-02-25 17:23:38 +00:00
|
|
|
|
from datetime import date
|
2023-09-06 07:57:12 +01:00
|
|
|
|
from email.mime.text import MIMEText
|
|
|
|
|
from email.utils import formatdate, make_msgid
|
2024-10-13 16:27:38 +01:00
|
|
|
|
from urllib.parse import urlparse, urlunparse
|
2023-09-06 07:57:12 +01:00
|
|
|
|
|
|
|
|
|
import requests
|
2024-02-25 17:23:38 +00:00
|
|
|
|
import yaml
|
2024-02-25 15:15:38 +00:00
|
|
|
|
from requests.adapters import HTTPAdapter
|
2024-07-14 16:28:16 +01:00
|
|
|
|
from urllib3.exceptions import InsecureRequestWarning
|
|
|
|
|
from urllib3.util.url import parse_url
|
2024-02-25 15:15:38 +00:00
|
|
|
|
|
2024-07-21 02:53:22 +01:00
|
|
|
|
from conference import LiveConference, config, load_yaml
|
2024-03-22 09:31:21 +00:00
|
|
|
|
|
|
|
|
|
|
2024-02-25 15:15:38 +00:00
|
|
|
|
class AbsoluteDNSAdapter(HTTPAdapter):
    """Transport adapter that makes every request use an absolute hostname."""

    def add_dot_to_hostname(self, url: str) -> str:
        """Return *url* with a trailing dot on its hostname.

        A hostname that already ends with a dot is left as it is.
        """
        parts = parse_url(url)

        host = parts.host
        assert host
        # The trailing dot makes the name be treated as an absolute domain name.
        absolute_host = host if host.endswith(".") else host + "."

        # Rebuild the URL around the adjusted hostname.
        rebuilt: str = parts._replace(host=absolute_host).url
        return rebuilt

    def send(self, request, **kwargs):  # type: ignore
        """Rewrite the request URL, then hand the request to ``HTTPAdapter.send``."""
        request.url = self.add_dot_to_hostname(request.url)
        return super().send(request, **kwargs)
|
|
|
|
|
|
2024-02-11 06:56:57 +00:00
|
|
|
|
|
2023-10-29 11:52:15 +00:00
|
|
|
|
# Requests are made with certificate verification disabled, so silence
# urllib3's InsecureRequestWarning (and only that warning category).
warnings.simplefilter("ignore", InsecureRequestWarning)
|
|
|
|
|
|
2023-09-06 07:57:12 +01:00
|
|
|
|
|
2023-09-11 07:40:16 +01:00
|
|
|
|
# Matches the first <title> element of a page; group 1 is the raw title text.
# HTML tag names are case-insensitive and the opening tag may carry
# attributes (e.g. <title data-rh="true">), so both are accepted. DOTALL lets
# the title text span several lines.
re_title = re.compile(
    r"<title(?:\s[^>]*)?>(.*?)</title\s*>", re.DOTALL | re.IGNORECASE
)
|
2023-09-06 07:57:12 +01:00
|
|
|
|
|
2024-03-21 08:57:38 +00:00
|
|
|
|
# The User-Agent string is taken from the project configuration.
AGENT = config["browser"]["User-Agent"]
headers = {"User-Agent": AGENT, "Accept": "text/html"}

# Shared session used for the site checks; it sends the headers above.
s = requests.Session()
s.headers.update(headers)

# Mount the custom adapter on the session for both HTTP and HTTPS requests.
adapter = AbsoluteDNSAdapter()
s.mount("http://", adapter)
s.mount("https://", adapter)
|
|
|
|
|
|
|
|
|
|
|
2023-09-06 07:57:12 +01:00
|
|
|
|
# Marker strings whose presence in a fetched page means the page is not a
# live conference site (error pages, placeholders, access-denied pages, ...).
# find_not_here_message() returns the first entry found in the page, so the
# order of this list decides which message is reported when several match.
not_here_list = [
    "The specified URL was not found.",
    "There is currently no text in this page.",
    "This page does not exist yet",
    "404 Not Found",
    "500 Internal Server Error",
    "Test Page for the Apache HTTP Server",
    "Site not found · GitHub Pages",
    "504: Gateway time-out",
    "504 Gateway Time-out",
    "502 Bad Gateway",
    "This page doesn't exist (404)",
    "Coming soon",
    "NOT_FOUND",
    "Resource Not Found",
    "Wikimedia Error",
    "The page you requested could not be found",
    "Ooops! Could Not Find It",
    "OpenStreetMap Authentication Proxy",
    "Error 404",
    "Under Construction",
    "Page not found",
    "Error 404: Page not found",
    "Barcamptools",
    "That page can’t be found.",
    "looks like there's no page here",
    "404 page",
    "Database Error",
    "You are not authorized to access this page",
    "Attention Required! | Cloudflare",
    "This page doesn't currently exist",
    "ERROR 503 - Service Unavailable",
    "ERROR 503",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_not_here_message(html: str) -> str | None:
    """Return the first ``not_here_list`` entry contained in *html*, or ``None``."""
    for marker in not_here_list:
        if marker in html:
            return marker
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_title(html: str) -> str:
    """Return the page title with surrounding whitespace removed.

    Falls back to ``"no title"`` when the page has no title element or the
    title is empty or consists only of whitespace.
    """
    m = re_title.search(html)
    if m is None:
        return "no title"
    # Strip before testing for emptiness so a blank title also falls back.
    title = m.group(1).strip()
    return title or "no title"
|
2023-09-06 07:57:12 +01:00
|
|
|
|
|
|
|
|
|
|
2024-10-13 16:27:38 +01:00
|
|
|
|
def normalize_url(url: str) -> str:
    """
    Return *url* in a form suitable for comparing two URLs.

    The network location is lower-cased and any trailing dots are removed
    from it. Every other component is passed through unchanged as parsed by
    ``urlparse``; in particular the path (including a trailing slash) is not
    altered, and URLs that differ only in scheme stay different.
    """
    parts = urlparse(url)
    # Lower-case the netloc and drop a trailing dot (absolute hostname form).
    netloc = parts.netloc.lower().rstrip(".")
    return urlunparse(parts._replace(netloc=netloc))
|
|
|
|
|
|
|
|
|
|
|
2025-01-11 16:35:20 +00:00
|
|
|
|
def url_to_filename(url: str) -> str:
    """
    Turn a URL into a string usable as a filename.

    Each of the characters ``/ : * ? " < > |`` is replaced with an
    underscore; all other characters are kept.

    Args:
        url (str): The URL to be converted.

    Returns:
        str: The URL with the characters above replaced by underscores.
    """
    unsafe = '/:*?"<>|'
    return url.translate(str.maketrans(unsafe, "_" * len(unsafe)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_opengraph_tags(html: str) -> dict[str, str]:
    """
    Collect the Open Graph meta tags of a page into a dictionary.

    Keys are the names following 'og:', e.g. 'title', 'image', 'url',
    'site_name'. Only tags written with the ``property`` attribute directly
    followed by the ``content`` attribute, both in double quotes, are
    recognised. When a property occurs more than once the last value wins.
    """
    og_meta = re.compile(
        r'<meta\s+property="og:([^"]+)"\s+content="([^"]+)"\s*/?>', flags=re.IGNORECASE
    )
    return {found.group(1): found.group(2) for found in og_meta.finditer(html)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Conference:
    """A conference whose web site for a given year is checked for liveness."""

    name: str
    src_url: str  # URL template containing a "{year}" placeholder
    year: int
    response: None | requests.models.Response  # set by check() once a page was fetched

    def __init__(self, name: str, src_url: str, year: int):
        """Record the conference details; no request is made here."""
        self.name = name
        self.src_url = src_url
        self.year = year
        self.response = None

    @property
    def url(self) -> str:
        """Conference URL for ``self.year``."""
        return self.src_url.format(year=self.year)

    @property
    def past_url(self) -> str:
        """Conference URL for the year before ``self.year``."""
        return self.src_url.format(year=self.year - 1)

    def check(self) -> tuple[bool, str, str | None]:
        """Fetch the conference URL and decide whether the site is live.

        The fetched page is stored on ``self.response`` and also written to
        ``sites/<filename>.html``.

        Returns:
            A ``(live, message, final_url)`` tuple. When ``live`` is true,
            ``message`` is the page title; otherwise it is the reason the
            site was rejected. ``final_url`` is the URL of the response, or
            ``None`` when the connection failed.
        """
        try:
            # SotM Baltics has an invalid TLS certificate, but we don't care
            r = s.get(self.url, verify=False)
        except requests.exceptions.ConnectionError:
            return (False, "connection refused", None)

        self.response = r

        # Keep a copy of the page; write it as UTF-8 so the result does not
        # depend on (or fail because of) the locale's default encoding.
        filename = url_to_filename(self.url)
        site_path = os.path.join("sites", filename + ".html")
        with open(site_path, "w", encoding="utf-8") as out:
            out.write(r.text)

        not_here = find_not_here_message(r.text)

        # A short page with a meta refresh that never mentions the year is
        # treated as a redirect to a URL without the year.
        if (
            len(r.text) < 2048
            and 'http-equiv="refresh"' in r.text
            and str(self.year) not in r.text
        ):
            return (False, "redirect to URL without year", r.url)

        if normalize_url(r.url) == normalize_url(self.past_url):
            return (False, "redirect to previous year", r.url)

        if not_here:
            return (False, not_here, r.url)

        return (True, get_title(r.text), r.url)

    def og_tags(self) -> dict[str, str]:
        """Open Graph tags of the page fetched by ``check()``."""
        # Compare with None explicitly: a requests Response is falsy for
        # 4xx/5xx status codes, so a plain truthiness assert would fail for
        # a page that was fetched successfully but has an error status.
        assert self.response is not None
        return parse_opengraph_tags(self.response.text)

    def check_web_site(self) -> bool:
        """Check the site and, if it is live, send a notification e-mail.

        Returns:
            ``True`` when the site is live, ``False`` otherwise.
        """
        assert "{year}" in self.src_url
        live, msg, redirect_to_url = self.check()
        if not live:
            return False

        # Every entry starts with a newline, so the tags follow the heading
        # one per line.
        og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items())

        if og:
            og = "\n\nOpen Graph\n\n" + og

        if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(
            self.url
        ):
            body = f"{self.name}\n{self.url}\n"
        else:
            body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n"

        # Must be an f-string so the title and Open Graph text are interpolated.
        body += f"Web page title: {msg}{og}"
        send_mail(f"Conference site live: {self.name} - {self.year}", body)

        return True
|
2023-09-06 07:57:12 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def send_mail(subject: str, body: str) -> None:
    """Send a plain-text UTF-8 e-mail.

    Sender, recipient and SMTP host are read from ``config["mail"]``; every
    entry of ``config["mail_headers"]`` is added as an extra header.

    Args:
        subject: Subject line of the message.
        body: Message text.
    """
    mail_from_address = config["mail"]["from_address"]
    mail_from_name = config["mail"]["from_name"]
    mail_to_address = config["mail"]["to_address"]
    mail_to_name = config["mail"]["to_name"]

    msg = MIMEText(body, "plain", "UTF-8")

    msg["Subject"] = subject
    msg["To"] = f"{mail_to_name} <{mail_to_address}>"
    msg["From"] = f"{mail_from_name} <{mail_from_address}>"
    msg["Date"] = formatdate()
    msg["Message-ID"] = make_msgid()

    # extra mail headers from config
    for header_name, value in config["mail_headers"].items():
        msg[header_name] = value

    # The context manager closes the SMTP connection even if sending fails.
    with smtplib.SMTP(config["mail"]["smtp_host"]) as smtp:
        smtp.sendmail(mail_from_address, [mail_to_address], msg.as_string())
|
|
|
|
|
|
|
|
|
|
|
2025-01-11 16:35:20 +00:00
|
|
|
|
def check(name: str, src_url: str, year: int) -> bool:
    """Return whether the web site of conference *name* for *year* is live."""
    return Conference(name, src_url, year).check_web_site()
|
2024-03-22 09:31:21 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_new_conference_web_sites(
    today: date, live: list[LiveConference]
) -> list[LiveConference]:
    """Return entries for conference web sites that have newly gone live.

    Every conference from the "conferences" YAML data is checked for the
    current year and the following year, skipping conference/year pairs
    already present in *live*. Each new entry records *today* as its
    ``live`` date.
    """
    already_live = {(entry["conference"], entry["year"]) for entry in live}
    candidate_years = (today.year, today.year + 1)

    found: list[LiveConference] = []
    for name, src_url in load_yaml("conferences").items():
        for year in candidate_years:
            # Known-live sites are skipped before any request is made.
            if (name, year) in already_live:
                continue
            if check(name, src_url, year):
                found.append({"conference": name, "year": year, "live": today})
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(show_not_live: bool = False) -> None:
    """Check for new conference web sites.

    Newly live sites are appended to the known live entries and the combined
    list is written to the file named by ``config["data"]["live"]``. Nothing
    is written when no new site was found. *show_not_live* is currently
    unused.
    """
    live: list[LiveConference] = load_yaml("live")
    new = find_new_conference_web_sites(date.today(), live)
    if not new:
        return

    live_filename = os.path.expanduser(config["data"]["live"])
    with open(live_filename, "w") as out:
        yaml.dump(live + new, stream=out, sort_keys=False)


if __name__ == "__main__":
    main()
|