From d845ec805f9b2a9d22b99a7f24a8ac88671e026c Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Sat, 11 Jan 2025 16:43:51 +0000 Subject: [PATCH] Use cloudscraper --- check.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/check.py b/check.py index c0827ea..5d4c234 100755 --- a/check.py +++ b/check.py @@ -2,6 +2,7 @@ """Check if conference websites are live.""" +import html import os import re import smtplib @@ -11,6 +12,7 @@ from email.mime.text import MIMEText from email.utils import formatdate, make_msgid from urllib.parse import urlparse, urlunparse +import cloudscraper import requests import yaml from requests.adapters import HTTPAdapter @@ -53,8 +55,14 @@ re_title = re.compile("(.*?)", re.DOTALL) AGENT = config["browser"]["User-Agent"] headers = {"User-Agent": AGENT, "Accept": "text/html"} -s = requests.Session() -s.headers.update(headers) +s = cloudscraper.CloudScraper() +s_no_dot = cloudscraper.CloudScraper() + +# s = requests.Session() +# s.headers.update(headers) + +# s_no_dot = requests.Session() +# s_no_dot.headers.update(headers) # Create a session and mount the custom adapter for both HTTP and HTTPS requests. adapter = AbsoluteDNSAdapter() @@ -82,7 +90,7 @@ not_here_list = [ "Ooops! Could Not Find It", "OpenStreetMap Authentication Proxy", "Error 404", - "Under Construction", + # "Under Construction", "Page not found", "Error 404: Page not found", "Barcamptools", @@ -95,18 +103,21 @@ not_here_list = [ "This page doesn't currently exist", "ERROR 503 - Service Unavailable", "ERROR 503", + "401 Authorization Required", + "Authorization Required", + "used Cloudflare to restrict access", ] -def find_not_here_message(html: str) -> str | None: +def find_not_here_message(page_html: str) -> str | None: """Find not here message in web page.""" - return next((not_here for not_here in not_here_list if not_here in html), None) + return next((not_here for not_here in not_here_list if not_here in page_html), None) -def get_title(html: str) -> str: +def get_title(page_html: str) -> str: """Title from web page.""" - m = re_title.search(html) - return m.group(1).strip() if m and m.group(1) else "no title" + m = re_title.search(page_html) + return html.unescape(m.group(1).strip()) if m and m.group(1) else "no title" def normalize_url(url: str) -> str: @@ -183,18 +194,17 @@ class Conference: def check(self) -> tuple[bool, str, str | None]: """Check if conference is live.""" + no_dot = {"bsideskbh.dk", "pif.camp"} + session = ( + s if all(hostname not in self.url for hostname in no_dot) else s_no_dot + ) try: - # SotM Baltics has an invalid TLS certificate, but we don't care - r = s.get(self.url, verify=False) + r = session.get(self.url) except requests.exceptions.ConnectionError: return (False, "connection refused", None) self.response = r - filename = url_to_filename(self.url) - with open(os.path.join("sites", filename + ".html"), "w") as out: - out.write(r.text) - not_here = find_not_here_message(r.text) if ( len(r.text) < 2048