Use cloudscraper
This commit is contained in:
parent 8c0acb1453
commit d845ec805f

check.py | 38
@@ -2,6 +2,7 @@
 """Check if conference websites are live."""

+import html
 import os
 import re
 import smtplib
@@ -11,6 +12,7 @@ from email.mime.text import MIMEText
 from email.utils import formatdate, make_msgid
 from urllib.parse import urlparse, urlunparse

+import cloudscraper
 import requests
 import yaml
 from requests.adapters import HTTPAdapter
@@ -53,8 +55,14 @@ re_title = re.compile("<title>(.*?)</title>", re.DOTALL)
 AGENT = config["browser"]["User-Agent"]
 headers = {"User-Agent": AGENT, "Accept": "text/html"}

-s = requests.Session()
-s.headers.update(headers)
+s = cloudscraper.CloudScraper()
+s_no_dot = cloudscraper.CloudScraper()
+
+# s = requests.Session()
+# s.headers.update(headers)
+
+# s_no_dot = requests.Session()
+# s_no_dot.headers.update(headers)

 # Create a session and mount the custom adapter for both HTTP and HTTPS requests.
 adapter = AbsoluteDNSAdapter()
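For context: cloudscraper.CloudScraper subclasses requests.Session, which is why the two new scraper objects can stand in for the old session without changing any call sites. A minimal sketch of using it on its own (the URL is a placeholder, not from this repo):

    import cloudscraper

    # create_scraper() returns a CloudScraper instance; it behaves like a
    # requests.Session but tries to pass Cloudflare's anti-bot challenge first.
    scraper = cloudscraper.create_scraper()
    r = scraper.get("https://example.com/")  # placeholder URL
    print(r.status_code, len(r.text))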
@@ -82,7 +90,7 @@ not_here_list = [
     "Ooops! Could Not Find It",
     "OpenStreetMap Authentication Proxy",
     "Error 404",
-    "Under Construction",
+    # "Under Construction",
     "Page not found",
     "Error 404: Page not found",
     "Barcamptools",
@@ -95,18 +103,21 @@ not_here_list = [
     "This page doesn't currently exist",
     "ERROR 503 - Service Unavailable",
     "ERROR 503",
     "401 Authorization Required",
     "Authorization Required",
+    "used Cloudflare to restrict access",
 ]


-def find_not_here_message(html: str) -> str | None:
+def find_not_here_message(page_html: str) -> str | None:
     """Find not here message in web page."""
-    return next((not_here for not_here in not_here_list if not_here in html), None)
+    return next((not_here for not_here in not_here_list if not_here in page_html), None)


-def get_title(html: str) -> str:
+def get_title(page_html: str) -> str:
     """Title from web page."""
-    m = re_title.search(html)
-    return m.group(1).strip() if m and m.group(1) else "no title"
+    m = re_title.search(page_html)
+    return html.unescape(m.group(1).strip()) if m and m.group(1) else "no title"


 def normalize_url(url: str) -> str:
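For context: renaming the parameter to page_html frees up the name html for the new standard-library import, so get_title can now decode HTML entities in the extracted title. A quick sketch of the effect, using a made-up page string:

    import html
    import re

    re_title = re.compile("<title>(.*?)</title>", re.DOTALL)

    page_html = "<title>State of the Map &amp; Friends</title>"  # made-up example
    m = re_title.search(page_html)
    title = html.unescape(m.group(1).strip()) if m and m.group(1) else "no title"
    print(title)  # -> State of the Map & Friends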
@@ -183,18 +194,17 @@ class Conference:

     def check(self) -> tuple[bool, str, str | None]:
         """Check if conference is live."""
+        no_dot = {"bsideskbh.dk", "pif.camp"}
+        session = (
+            s if all(hostname not in self.url for hostname in no_dot) else s_no_dot
+        )
         try:
-            # SotM Baltics has an invalid TLS certificate, but we don't care
-            r = s.get(self.url, verify=False)
+            r = session.get(self.url)
         except requests.exceptions.ConnectionError:
             return (False, "connection refused", None)

         self.response = r

         filename = url_to_filename(self.url)
         with open(os.path.join("sites", filename + ".html"), "w") as out:
             out.write(r.text)

         not_here = find_not_here_message(r.text)
         if (
             len(r.text) < 2048
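For context: the new no_dot set routes those two hosts to the second scraper, presumably because they don't cope with the absolute-DNS (trailing-dot) adapter mounted on s; every other URL keeps using s. A sketch of the same selection logic as a standalone helper (pick_session and the sample URLs are illustrative; only the two hostnames come from this commit):

    no_dot = {"bsideskbh.dk", "pif.camp"}

    def pick_session(url: str, s, s_no_dot):
        # Fall back to s_no_dot only when the URL contains one of the listed hosts.
        return s if all(hostname not in url for hostname in no_dot) else s_no_dot

    # pick_session("https://pif.camp/", s, s_no_dot) would return s_no_dot,
    # pick_session("https://example.org/", s, s_no_dot) would return s.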