From a523af16e196e59bfa0318d196a69af6f6db342b Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 11 Jan 2025 16:35:20 +0000 Subject: [PATCH] Open Graph meta tags in alert e-mail Closes: #7 --- check.py | 156 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 118 insertions(+), 38 deletions(-) diff --git a/check.py b/check.py index 81e028f..39043d6 100755 --- a/check.py +++ b/check.py @@ -122,34 +122,126 @@ def normalize_url(url: str) -> str: return urlunparse(parsed_url._replace(netloc=normalized_netloc)) -def check_conference( - name: str, src_url: str, year: int -) -> tuple[bool, str, str | None]: - """Check if conference is live.""" - url = src_url.format(year=year) - past_url = src_url.format(year=year - 1) - try: - # SotM Baltics has an invalid TLS certificate, but we don't care - r = s.get(url, verify=False) - except requests.exceptions.ConnectionError: - return (False, "connection refused", None) +def url_to_filename(url: str) -> str: + """ + Convert a URL to a valid filename by replacing invalid characters with underscores. - not_here = find_not_here_message(r.text) - if ( - len(r.text) < 2048 - and 'http-equiv="refresh"' in r.text - and str(year) not in r.text - ): - return (False, "redirect to URL without year", r.url) + Args: + url (str): The URL to be converted. - if normalize_url(r.url) == normalize_url(past_url): - return (False, "redirect to previous year", r.url) + Returns: + str: A valid filename. + """ + # Replace invalid characters with underscores + return re.sub(r'[\/:*?"<>|]', "_", url) - return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url) + +def parse_opengraph_tags(html: str) -> dict[str, str]: + """ + Locate Open Graph meta tags, and return them as a dictionary. + + Keys will match the name following 'og:', e.g. 'title', 'image', 'url', 'site_name'. + """ + pattern = re.compile( + r'', flags=re.IGNORECASE + ) + matches = pattern.findall(html) + + og_tags = {} + for prop, content in matches: + og_tags[prop] = content + + return og_tags + + +class Conference: + """Conference.""" + + name: str + src_url: str + year: int + response: None | requests.models.Response + + def __init__(self, name: str, src_url: str, year: int): + """Init.""" + self.name = name + self.src_url = src_url + self.year = year + self.response = None + + @property + def url(self) -> str: + """Conference URL.""" + return self.src_url.format(year=self.year) + + @property + def past_url(self) -> str: + """URL for previous year.""" + return self.src_url.format(year=self.year - 1) + + def check(self) -> tuple[bool, str, str | None]: + """Check if conference is live.""" + try: + # SotM Baltics has an invalid TLS certificate, but we don't care + r = s.get(self.url, verify=False) + except requests.exceptions.ConnectionError: + return (False, "connection refused", None) + + self.response = r + + filename = url_to_filename(self.url) + with open(os.path.join("sites", filename + ".html"), "w") as out: + out.write(r.text) + + not_here = find_not_here_message(r.text) + if ( + len(r.text) < 2048 + and 'http-equiv="refresh"' in r.text + and str(self.year) not in r.text + ): + return (False, "redirect to URL without year", r.url) + + if normalize_url(r.url) == normalize_url(self.past_url): + return (False, "redirect to previous year", r.url) + + if not_here: + return (False, not_here, r.url) + + return (True, get_title(r.text), r.url) + + def og_tags(self) -> dict[str, str]: + """Open Graph tags.""" + assert self.response + return parse_opengraph_tags(self.response.text) + + def check_web_site(self) -> bool: + """Check if an individual web site is live.""" + assert "{year}" in self.src_url + live, msg, redirect_to_url = self.check() + if not live: + return False + + og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items()) + + if og: + og = "\n\nOpen Graph\n\n" + og + + if redirect_to_url and normalize_url(redirect_to_url) == normalize_url( + self.url + ): + body = f"{self.name}\n{self.url}\n" + else: + body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n" + + body += "Web page title: {msg}{og}" "" + send_mail(f"Conference site live: {self.name} - {self.year}", body) + + return True def send_mail(subject: str, body: str) -> None: """Send an e-mail.""" + return mail_from_address = config["mail"]["from_address"] mail_from_name = config["mail"]["from_name"] mail_to_address = config["mail"]["to_address"] @@ -171,21 +263,10 @@ def send_mail(subject: str, body: str) -> None: s.quit() -def check_conference_web_site(name: str, src_url: str, year: int) -> bool: - """Check if an individual web site is live.""" - assert "{year}" in src_url - live, msg, redirect_to_url = check_conference(name, src_url, year) - url = src_url.format(year=year) - if live: - if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(url): - body = f"{name}\n{url}\nWeb page title: {msg}" - else: - body = f"""{name} -{url} redirects to {redirect_to_url} -Web page title: {msg}""" - send_mail(f"Conference site live: {name} - {year}", body) - - return live +def check(name: str, src_url: str, year: int) -> bool: + """Check to see if conference site is live.""" + conf = Conference(name, src_url, year) + return conf.check_web_site() def find_new_conference_web_sites( @@ -201,8 +282,7 @@ def find_new_conference_web_sites( new += [ {"conference": name, "year": year, "live": today} for year in (this_year, this_year + 1) - if (name, year) not in live_set - and check_conference_web_site(name, src_url, year) + if (name, year) not in live_set and check(name, src_url, year) ] return new