diff --git a/check.py b/check.py index b4b6383..5d3bb74 100755 --- a/check.py +++ b/check.py @@ -7,13 +7,12 @@ import os import re import smtplib import warnings -from dataclasses import dataclass from datetime import date from email.mime.text import MIMEText from email.utils import formatdate, make_msgid from urllib.parse import urlparse, urlunparse -import cloudscraper # type: ignore +import cloudscraper import requests import yaml from requests.adapters import HTTPAdapter @@ -77,7 +76,6 @@ not_here_list = [ "This page does not exist yet", "404 Not Found", "500 Internal Server Error", - "500: Internal Server Error", "Test Page for the Apache HTTP Server", "Site not found · GitHub Pages", "504: Gateway time-out", @@ -137,118 +135,31 @@ def normalize_url(url: str) -> str: return urlunparse(parsed_url._replace(netloc=normalized_netloc)) -def url_to_filename(url: str) -> str: - """ - Convert a URL to a valid filename by replacing invalid characters with underscores. +def check_conference( + name: str, src_url: str, year: int +) -> tuple[bool, str, str | None]: + """Check if conference is live.""" + url = src_url.format(year=year) + past_url = src_url.format(year=year - 1) + no_dot = {"bsideskbh.dk", "pif.camp"} + session = s if all(hostname not in url for hostname in no_dot) else s_no_dot + try: + r = session.get(url) + except requests.exceptions.ConnectionError: + return (False, "connection refused", None) - Args: - url (str): The URL to be converted. + not_here = find_not_here_message(r.text) + if ( + len(r.text) < 2048 + and 'http-equiv="refresh"' in r.text + and str(year) not in r.text + ): + return (False, "redirect to URL without year", r.url) - Returns: - str: A valid filename. - """ - # Replace invalid characters with underscores - return re.sub(r'[\/:*?"<>|]', "_", url) + if normalize_url(r.url) == normalize_url(past_url): + return (False, "redirect to previous year", r.url) - -def parse_opengraph_tags(html: str) -> dict[str, str]: - """ - Locate Open Graph meta tags, and return them as a dictionary. - - Keys will match the name following 'og:', e.g. 'title', 'image', 'url', 'site_name'. - """ - pattern = re.compile( - r'', flags=re.IGNORECASE - ) - matches = pattern.findall(html) - - og_tags = {} - for prop, content in matches: - og_tags[prop] = content - - return og_tags - - -@dataclass -class Conference: - """Conference.""" - - name: str - src_url: str - year: int - response: requests.models.Response | None = None - redirect_to_url: str | None = None - - @property - def url(self) -> str: - """Conference URL.""" - return self.build_url(year=self.year) - - def build_url(self, year: int) -> str: - """Build conference URL.""" - return self.src_url.format(year=year, two_digit_year=year % 2000) - - @property - def past_url(self) -> str: - """URL for previous year.""" - return self.build_url(year=self.year - 1) - - def check(self) -> tuple[bool, str, str | None]: - """Check if conference is live.""" - no_dot = {"bsideskbh.dk", "pif.camp"} - url = self.url - session = s if all(hostname not in url for hostname in no_dot) else s_no_dot - try: - r = session.get(url) - except requests.exceptions.ConnectionError: - return (False, "connection refused", None) - - self.response = r - - not_here = find_not_here_message(r.text) - if ( - len(r.text) < 2048 - and 'http-equiv="refresh"' in r.text - and str(self.year) not in r.text - ): - return (False, "redirect to URL without year", r.url) - - if normalize_url(r.url) == normalize_url(self.past_url): - return (False, "redirect to previous year", r.url) - - if not_here: - return (False, not_here, r.url) - - return (True, get_title(r.text), r.url) - - def og_tags(self) -> dict[str, str]: - """Open Graph tags.""" - return parse_opengraph_tags(self.response.text) if self.response else {} - - def check_web_site(self) -> bool: - """Check if an individual web site is live.""" - assert "{year}" in self.src_url or "{two_digit_year}" in self.src_url - live, msg, redirect_to_url = self.check() - if not live: - return False - - og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items()) - - if og: - og = "\n\nOpen Graph\n\n" + og - - if redirect_to_url and normalize_url(redirect_to_url) == normalize_url( - self.url - ): - body = f"{self.name}\n{self.url}\n" - else: - self.redirect_to_url = redirect_to_url - body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n" - - body += f"Web page title: {msg}{og}" "" - send_mail(f"Conference site live: {self.name} - {self.year}", body) - - return True + return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url) def send_mail(subject: str, body: str) -> None: @@ -274,6 +185,23 @@ def send_mail(subject: str, body: str) -> None: s.quit() +def check_conference_web_site(name: str, src_url: str, year: int) -> bool: + """Check if an individual web site is live.""" + assert "{year}" in src_url + live, msg, redirect_to_url = check_conference(name, src_url, year) + url = src_url.format(year=year) + if live: + if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(url): + body = f"{name}\n{url}\nWeb page title: {msg}" + else: + body = f"""{name} +{url} redirects to {redirect_to_url} +Web page title: {msg}""" + send_mail(f"Conference site live: {name} - {year}", body) + + return live + + def find_new_conference_web_sites( today: date, live: list[LiveConference] ) -> list[LiveConference]: @@ -284,21 +212,12 @@ def find_new_conference_web_sites( live_set = {(c["conference"], c["year"]) for c in live} for name, src_url in load_yaml("conferences").items(): - for year in (this_year, this_year + 1): - if (name, year) in live_set: - continue - conf = Conference(name, src_url, year) - if not conf.check_web_site(): - continue - c: LiveConference = { - "conference": name, - "year": year, - "live": today, - "url": conf.url, - } - if conf.redirect_to_url: - c["redirect_to_url"] = conf.redirect_to_url - new.append(c) + new += [ + {"conference": name, "year": year, "live": today} + for year in (this_year, this_year + 1) + if (name, year) not in live_set + and check_conference_web_site(name, src_url, year) + ] return new diff --git a/conference/__init__.py b/conference/__init__.py index e1d57a5..415e3d2 100644 --- a/conference/__init__.py +++ b/conference/__init__.py @@ -28,7 +28,6 @@ class LiveConference(typing.TypedDict, total=False): year: int live: date url: str | None - redirect_to_url: str | None def load_yaml(name: str) -> typing.Any: diff --git a/requirements.txt b/requirements.txt index f7b8e8f..f229360 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ requests -cloudscraper