From 35c213110d43cb4cffccad5dfd0608d8b84a64b8 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Fri, 1 Nov 2024 09:23:18 +0000 Subject: [PATCH 01/13] Add more strings to not_here_list. --- check.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/check.py b/check.py index 81e028f..af14bf4 100755 --- a/check.py +++ b/check.py @@ -93,6 +93,8 @@ not_here_list = [ "You are not authorized to access this page", "Attention Required! | Cloudflare", "This page doesn't currently exist", + "ERROR 503 - Service Unavailable", + "ERROR 503", ] From a523af16e196e59bfa0318d196a69af6f6db342b Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 11 Jan 2025 16:35:20 +0000 Subject: [PATCH 02/13] Open Graph meta tags in alert e-mail Closes: #7 --- check.py | 156 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 118 insertions(+), 38 deletions(-) diff --git a/check.py b/check.py index 81e028f..39043d6 100755 --- a/check.py +++ b/check.py @@ -122,34 +122,126 @@ def normalize_url(url: str) -> str: return urlunparse(parsed_url._replace(netloc=normalized_netloc)) -def check_conference( - name: str, src_url: str, year: int -) -> tuple[bool, str, str | None]: - """Check if conference is live.""" - url = src_url.format(year=year) - past_url = src_url.format(year=year - 1) - try: - # SotM Baltics has an invalid TLS certificate, but we don't care - r = s.get(url, verify=False) - except requests.exceptions.ConnectionError: - return (False, "connection refused", None) +def url_to_filename(url: str) -> str: + """ + Convert a URL to a valid filename by replacing invalid characters with underscores. - not_here = find_not_here_message(r.text) - if ( - len(r.text) < 2048 - and 'http-equiv="refresh"' in r.text - and str(year) not in r.text - ): - return (False, "redirect to URL without year", r.url) + Args: + url (str): The URL to be converted. 
- if normalize_url(r.url) == normalize_url(past_url): - return (False, "redirect to previous year", r.url) + Returns: + str: A valid filename. + """ + # Replace invalid characters with underscores + return re.sub(r'[\/:*?"<>|]', "_", url) - return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url) + +def parse_opengraph_tags(html: str) -> dict[str, str]: + """ + Locate Open Graph meta tags, and return them as a dictionary. + + Keys will match the name following 'og:', e.g. 'title', 'image', 'url', 'site_name'. + """ + pattern = re.compile( + r'', flags=re.IGNORECASE + ) + matches = pattern.findall(html) + + og_tags = {} + for prop, content in matches: + og_tags[prop] = content + + return og_tags + + +class Conference: + """Conference.""" + + name: str + src_url: str + year: int + response: None | requests.models.Response + + def __init__(self, name: str, src_url: str, year: int): + """Init.""" + self.name = name + self.src_url = src_url + self.year = year + self.response = None + + @property + def url(self) -> str: + """Conference URL.""" + return self.src_url.format(year=self.year) + + @property + def past_url(self) -> str: + """URL for previous year.""" + return self.src_url.format(year=self.year - 1) + + def check(self) -> tuple[bool, str, str | None]: + """Check if conference is live.""" + try: + # SotM Baltics has an invalid TLS certificate, but we don't care + r = s.get(self.url, verify=False) + except requests.exceptions.ConnectionError: + return (False, "connection refused", None) + + self.response = r + + filename = url_to_filename(self.url) + with open(os.path.join("sites", filename + ".html"), "w") as out: + out.write(r.text) + + not_here = find_not_here_message(r.text) + if ( + len(r.text) < 2048 + and 'http-equiv="refresh"' in r.text + and str(self.year) not in r.text + ): + return (False, "redirect to URL without year", r.url) + + if normalize_url(r.url) == normalize_url(self.past_url): + return (False, "redirect to previous 
year", r.url) + + if not_here: + return (False, not_here, r.url) + + return (True, get_title(r.text), r.url) + + def og_tags(self) -> dict[str, str]: + """Open Graph tags.""" + assert self.response + return parse_opengraph_tags(self.response.text) + + def check_web_site(self) -> bool: + """Check if an individual web site is live.""" + assert "{year}" in self.src_url + live, msg, redirect_to_url = self.check() + if not live: + return False + + og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items()) + + if og: + og = "\n\nOpen Graph\n\n" + og + + if redirect_to_url and normalize_url(redirect_to_url) == normalize_url( + self.url + ): + body = f"{self.name}\n{self.url}\n" + else: + body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n" + + body += "Web page title: {msg}{og}" "" + send_mail(f"Conference site live: {self.name} - {self.year}", body) + + return True def send_mail(subject: str, body: str) -> None: """Send an e-mail.""" + return mail_from_address = config["mail"]["from_address"] mail_from_name = config["mail"]["from_name"] mail_to_address = config["mail"]["to_address"] @@ -171,21 +263,10 @@ def send_mail(subject: str, body: str) -> None: s.quit() -def check_conference_web_site(name: str, src_url: str, year: int) -> bool: - """Check if an individual web site is live.""" - assert "{year}" in src_url - live, msg, redirect_to_url = check_conference(name, src_url, year) - url = src_url.format(year=year) - if live: - if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(url): - body = f"{name}\n{url}\nWeb page title: {msg}" - else: - body = f"""{name} -{url} redirects to {redirect_to_url} -Web page title: {msg}""" - send_mail(f"Conference site live: {name} - {year}", body) - - return live +def check(name: str, src_url: str, year: int) -> bool: + """Check to see if conference site is live.""" + conf = Conference(name, src_url, year) + return conf.check_web_site() def find_new_conference_web_sites( @@ -201,8 +282,7 @@ 
def find_new_conference_web_sites( new += [ {"conference": name, "year": year, "live": today} for year in (this_year, this_year + 1) - if (name, year) not in live_set - and check_conference_web_site(name, src_url, year) + if (name, year) not in live_set and check(name, src_url, year) ] return new From fee54a37e7407e75e8f482b720a109440033ab90 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 11 Jan 2025 16:43:51 +0000 Subject: [PATCH 03/13] Use cloudscraper --- check.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/check.py b/check.py index af14bf4..5d3bb74 100755 --- a/check.py +++ b/check.py @@ -2,6 +2,7 @@ """Check if conference websites are live.""" +import html import os import re import smtplib @@ -11,6 +12,7 @@ from email.mime.text import MIMEText from email.utils import formatdate, make_msgid from urllib.parse import urlparse, urlunparse +import cloudscraper import requests import yaml from requests.adapters import HTTPAdapter @@ -53,8 +55,14 @@ re_title = re.compile("(.*?)", re.DOTALL) AGENT = config["browser"]["User-Agent"] headers = {"User-Agent": AGENT, "Accept": "text/html"} -s = requests.Session() -s.headers.update(headers) +s = cloudscraper.CloudScraper() +s_no_dot = cloudscraper.CloudScraper() + +# s = requests.Session() +# s.headers.update(headers) + +# s_no_dot = requests.Session() +# s_no_dot.headers.update(headers) # Create a session and mount the custom adapter for both HTTP and HTTPS requests. adapter = AbsoluteDNSAdapter() @@ -82,7 +90,7 @@ not_here_list = [ "Ooops! 
Could Not Find It", "OpenStreetMap Authentication Proxy", "Error 404", - "Under Construction", + # "Under Construction", "Page not found", "Error 404: Page not found", "Barcamptools", @@ -95,18 +103,21 @@ not_here_list = [ "This page doesn't currently exist", "ERROR 503 - Service Unavailable", "ERROR 503", + "401 Authorization Required", + "Authorization Required", + "used Cloudflare to restrict access", ] -def find_not_here_message(html: str) -> str | None: +def find_not_here_message(page_html: str) -> str | None: """Find not here message in web page.""" - return next((not_here for not_here in not_here_list if not_here in html), None) + return next((not_here for not_here in not_here_list if not_here in page_html), None) -def get_title(html: str) -> str: +def get_title(page_html: str) -> str: """Title from web page.""" - m = re_title.search(html) - return m.group(1).strip() if m and m.group(1) else "no title" + m = re_title.search(page_html) + return html.unescape(m.group(1).strip()) if m and m.group(1) else "no title" def normalize_url(url: str) -> str: @@ -130,9 +141,10 @@ def check_conference( """Check if conference is live.""" url = src_url.format(year=year) past_url = src_url.format(year=year - 1) + no_dot = {"bsideskbh.dk", "pif.camp"} + session = s if all(hostname not in url for hostname in no_dot) else s_no_dot try: - # SotM Baltics has an invalid TLS certificate, but we don't care - r = s.get(url, verify=False) + r = session.get(url) except requests.exceptions.ConnectionError: return (False, "connection refused", None) From 8c0acb1453ca4491ff220cd83e150c9246dfbe2b Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Fri, 1 Nov 2024 09:23:18 +0000 Subject: [PATCH 04/13] Add more strings to not_here_list. --- check.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/check.py b/check.py index 39043d6..c0827ea 100755 --- a/check.py +++ b/check.py @@ -93,6 +93,8 @@ not_here_list = [ "You are not authorized to access this page", "Attention Required! 
| Cloudflare", "This page doesn't currently exist", + "ERROR 503 - Service Unavailable", + "ERROR 503", ] From d845ec805f9b2a9d22b99a7f24a8ac88671e026c Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 11 Jan 2025 16:43:51 +0000 Subject: [PATCH 05/13] Use cloudscraper --- check.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/check.py b/check.py index c0827ea..5d4c234 100755 --- a/check.py +++ b/check.py @@ -2,6 +2,7 @@ """Check if conference websites are live.""" +import html import os import re import smtplib @@ -11,6 +12,7 @@ from email.mime.text import MIMEText from email.utils import formatdate, make_msgid from urllib.parse import urlparse, urlunparse +import cloudscraper import requests import yaml from requests.adapters import HTTPAdapter @@ -53,8 +55,14 @@ re_title = re.compile("(.*?)", re.DOTALL) AGENT = config["browser"]["User-Agent"] headers = {"User-Agent": AGENT, "Accept": "text/html"} -s = requests.Session() -s.headers.update(headers) +s = cloudscraper.CloudScraper() +s_no_dot = cloudscraper.CloudScraper() + +# s = requests.Session() +# s.headers.update(headers) + +# s_no_dot = requests.Session() +# s_no_dot.headers.update(headers) # Create a session and mount the custom adapter for both HTTP and HTTPS requests. adapter = AbsoluteDNSAdapter() @@ -82,7 +90,7 @@ not_here_list = [ "Ooops! 
Could Not Find It", "OpenStreetMap Authentication Proxy", "Error 404", - "Under Construction", + # "Under Construction", "Page not found", "Error 404: Page not found", "Barcamptools", @@ -95,18 +103,21 @@ not_here_list = [ "This page doesn't currently exist", "ERROR 503 - Service Unavailable", "ERROR 503", + "401 Authorization Required", + "Authorization Required", + "used Cloudflare to restrict access", ] -def find_not_here_message(html: str) -> str | None: +def find_not_here_message(page_html: str) -> str | None: """Find not here message in web page.""" - return next((not_here for not_here in not_here_list if not_here in html), None) + return next((not_here for not_here in not_here_list if not_here in page_html), None) -def get_title(html: str) -> str: +def get_title(page_html: str) -> str: """Title from web page.""" - m = re_title.search(html) - return m.group(1).strip() if m and m.group(1) else "no title" + m = re_title.search(page_html) + return html.unescape(m.group(1).strip()) if m and m.group(1) else "no title" def normalize_url(url: str) -> str: @@ -183,18 +194,17 @@ class Conference: def check(self) -> tuple[bool, str, str | None]: """Check if conference is live.""" + no_dot = {"bsideskbh.dk", "pif.camp"} + session = ( + s if all(hostname not in self.url for hostname in no_dot) else s_no_dot + ) try: - # SotM Baltics has an invalid TLS certificate, but we don't care - r = s.get(self.url, verify=False) + r = session.get(self.url) except requests.exceptions.ConnectionError: return (False, "connection refused", None) self.response = r - filename = url_to_filename(self.url) - with open(os.path.join("sites", filename + ".html"), "w") as out: - out.write(r.text) - not_here = find_not_here_message(r.text) if ( len(r.text) < 2048 From 56645e3ff853ed94a76858d22551d3b9765b9e99 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Tue, 18 Feb 2025 00:48:32 +0000 Subject: [PATCH 06/13] Add '500: Internal Server Error' to not_here_list. 
--- check.py | 1 + 1 file changed, 1 insertion(+) diff --git a/check.py b/check.py index 5d4c234..ae1bd68 100755 --- a/check.py +++ b/check.py @@ -76,6 +76,7 @@ not_here_list = [ "This page does not exist yet", "404 Not Found", "500 Internal Server Error", + "500: Internal Server Error", "Test Page for the Apache HTTP Server", "Site not found · GitHub Pages", "504: Gateway time-out", From befdfd8bb8b0757b638882fe31070e0b3097f433 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Tue, 18 Feb 2025 00:49:05 +0000 Subject: [PATCH 07/13] New build_url() method. --- check.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/check.py b/check.py index ae1bd68..7630216 100755 --- a/check.py +++ b/check.py @@ -186,21 +186,24 @@ class Conference: @property def url(self) -> str: """Conference URL.""" - return self.src_url.format(year=self.year) + return self.build_url(year=self.year) + + def build_url(self, year: int) -> str: + """Build conference URL.""" + return self.src_url.format(year=year, two_digit_year=year % 2000) @property def past_url(self) -> str: """URL for previous year.""" - return self.src_url.format(year=self.year - 1) + return self.build_url(year=self.year - 1) def check(self) -> tuple[bool, str, str | None]: """Check if conference is live.""" no_dot = {"bsideskbh.dk", "pif.camp"} - session = ( - s if all(hostname not in self.url for hostname in no_dot) else s_no_dot - ) + url = self.url + session = s if all(hostname not in url for hostname in no_dot) else s_no_dot try: - r = session.get(self.url) + r = session.get(url) except requests.exceptions.ConnectionError: return (False, "connection refused", None) @@ -229,7 +232,7 @@ class Conference: def check_web_site(self) -> bool: """Check if an individual web site is live.""" - assert "{year}" in self.src_url + assert "{year}" in self.src_url or "{two_digit_year}" in self.src_url live, msg, redirect_to_url = self.check() if not live: return False From 
ee6f594a1942a3cb09363847a68f60ce97de7b46 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sun, 20 Apr 2025 10:47:04 +0100 Subject: [PATCH 08/13] Bug fix no response from website. --- check.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/check.py b/check.py index 7630216..e3a0e28 100755 --- a/check.py +++ b/check.py @@ -227,8 +227,7 @@ class Conference: def og_tags(self) -> dict[str, str]: """Open Graph tags.""" - assert self.response - return parse_opengraph_tags(self.response.text) + return parse_opengraph_tags(self.response.text) if self.response else {} def check_web_site(self) -> bool: """Check if an individual web site is live.""" From 347837f4fef4cc92127a9e3209972f513babb69f Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 4 Aug 2025 19:37:29 +0100 Subject: [PATCH 09/13] Add cloudscraper to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f229360..f7b8e8f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ requests +cloudscraper From bb79b5ac9976f91894a4ebbbdd72f30a8ec0005f Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 4 Aug 2025 19:38:59 +0100 Subject: [PATCH 10/13] Ignore missing types for cloudscraper --- check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check.py b/check.py index e3a0e28..1eb7c4e 100755 --- a/check.py +++ b/check.py @@ -12,7 +12,7 @@ from email.mime.text import MIMEText from email.utils import formatdate, make_msgid from urllib.parse import urlparse, urlunparse -import cloudscraper +import cloudscraper # type: ignore import requests import yaml from requests.adapters import HTTPAdapter From 17bd1ae9bc8a7193e79ec14331dd83e593d27b55 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 4 Aug 2025 19:42:42 +0100 Subject: [PATCH 11/13] Send mail --- check.py | 1 - 1 file changed, 1 deletion(-) diff --git a/check.py b/check.py index 1eb7c4e..16084c2 100755 --- a/check.py +++ b/check.py 
@@ -256,7 +256,6 @@ class Conference: def send_mail(subject: str, body: str) -> None: """Send an e-mail.""" - return mail_from_address = config["mail"]["from_address"] mail_from_name = config["mail"]["from_name"] mail_to_address = config["mail"]["to_address"] From aa22d7669996abd575b39656ade8cf3fc33e2072 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 4 Aug 2025 20:28:54 +0100 Subject: [PATCH 12/13] Bug fix web page title in email --- check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check.py b/check.py index 16084c2..40d6575 100755 --- a/check.py +++ b/check.py @@ -248,7 +248,7 @@ class Conference: else: body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n" - body += "Web page title: {msg}{og}" "" + body += f"Web page title: {msg}{og}" "" send_mail(f"Conference site live: {self.name} - {self.year}", body) return True From ff0d81f32c66fb8d4a719f14f99c0eb7ea5acb47 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 4 Aug 2025 20:44:30 +0100 Subject: [PATCH 13/13] Include URL in live.yaml --- check.py | 39 ++++++++++++++++++++------------------- conference/__init__.py | 1 + 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/check.py b/check.py index 40d6575..b4b6383 100755 --- a/check.py +++ b/check.py @@ -7,6 +7,7 @@ import os import re import smtplib import warnings +from dataclasses import dataclass from datetime import date from email.mime.text import MIMEText from email.utils import formatdate, make_msgid @@ -168,20 +169,15 @@ def parse_opengraph_tags(html: str) -> dict[str, str]: return og_tags +@dataclass class Conference: """Conference.""" name: str src_url: str year: int - response: None | requests.models.Response - - def __init__(self, name: str, src_url: str, year: int): - """Init.""" - self.name = name - self.src_url = src_url - self.year = year - self.response = None + response: requests.models.Response | None = None + redirect_to_url: str | None = None @property def url(self) -> str: @@ -246,6 
+242,7 @@ class Conference: ): body = f"{self.name}\n{self.url}\n" else: + self.redirect_to_url = redirect_to_url body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n" body += f"Web page title: {msg}{og}" "" @@ -277,12 +274,6 @@ def send_mail(subject: str, body: str) -> None: s.quit() -def check(name: str, src_url: str, year: int) -> bool: - """Check to see if conference site is live.""" - conf = Conference(name, src_url, year) - return conf.check_web_site() - - def find_new_conference_web_sites( today: date, live: list[LiveConference] ) -> list[LiveConference]: @@ -293,11 +284,21 @@ def find_new_conference_web_sites( live_set = {(c["conference"], c["year"]) for c in live} for name, src_url in load_yaml("conferences").items(): - new += [ - {"conference": name, "year": year, "live": today} - for year in (this_year, this_year + 1) - if (name, year) not in live_set and check(name, src_url, year) - ] + for year in (this_year, this_year + 1): + if (name, year) in live_set: + continue + conf = Conference(name, src_url, year) + if not conf.check_web_site(): + continue + c: LiveConference = { + "conference": name, + "year": year, + "live": today, + "url": conf.url, + } + if conf.redirect_to_url: + c["redirect_to_url"] = conf.redirect_to_url + new.append(c) return new diff --git a/conference/__init__.py b/conference/__init__.py index 415e3d2..e1d57a5 100644 --- a/conference/__init__.py +++ b/conference/__init__.py @@ -28,6 +28,7 @@ class LiveConference(typing.TypedDict, total=False): year: int live: date url: str | None + redirect_to_url: str | None def load_yaml(name: str) -> typing.Any: