User cloudscraper

Add more strings to not_here_list.
2025-01-11 16:43:51 +00:00 · 2024-11-01 09:23:18 +00:00
3 changed files with 46 additions and 129 deletions
--- a/check.py
+++ b/check.py
@ -7,13 +7,12 @@ import os
 import re
 import smtplib
 import warnings
 from dataclasses import dataclass
 from datetime import date
 from email.mime.text import MIMEText
 from email.utils import formatdate, make_msgid
 from urllib.parse import urlparse, urlunparse
-import cloudscraper  # type: ignore
+import cloudscraper
 import requests
 import yaml
 from requests.adapters import HTTPAdapter
@ -77,7 +76,6 @@ not_here_list = [
    "This page does not exist yet",
    "404 Not Found",
    "500 Internal Server Error",
    "500: Internal Server Error",
    "Test Page for the Apache HTTP Server",
    "Site not found &middot; GitHub Pages",
    "504: Gateway time-out",
@ -137,118 +135,31 @@ def normalize_url(url: str) -> str:
    return urlunparse(parsed_url._replace(netloc=normalized_netloc))
-def url_to_filename(url: str) -> str:
+def check_conference(
-    """
+    name: str, src_url: str, year: int
-    Convert a URL to a valid filename by replacing invalid characters with underscores.
+) -> tuple[bool, str, str | None]:
    """Check if conference is live."""
    url = src_url.format(year=year)
    past_url = src_url.format(year=year - 1)
    no_dot = {"bsideskbh.dk", "pif.camp"}
    session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
    try:
        r = session.get(url)
    except requests.exceptions.ConnectionError:
        return (False, "connection refused", None)
-    Args:
+    not_here = find_not_here_message(r.text)
-        url (str): The URL to be converted.
+    if (
        len(r.text) < 2048
        and 'http-equiv="refresh"' in r.text
        and str(year) not in r.text
    ):
        return (False, "redirect to URL without year", r.url)
-    Returns:
+    if normalize_url(r.url) == normalize_url(past_url):
-        str: A valid filename.
+        return (False, "redirect to previous year", r.url)
    """
    # Replace invalid characters with underscores
    return re.sub(r'[\/:*?"<>|]', "_", url)
-
+    return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url)
 def parse_opengraph_tags(html: str) -> dict[str, str]:
    """
    Extract Open Graph meta tags from an HTML document.

    The returned dictionary is keyed by the part of the property name that
    follows 'og:' (e.g. 'title', 'image', 'url', 'site_name') and maps to the
    tag's content attribute. Only tags written with double-quoted attributes,
    with property before content, are recognised. When a property appears
    more than once, the last occurrence wins.
    """
    og_meta_re = re.compile(
        r'<meta\s+property="og:([^"]+)"\s+content="([^"]+)"\s*/?>', flags=re.IGNORECASE
    )
    # Group 1 is the property suffix, group 2 is the content value.
    return {found.group(1): found.group(2) for found in og_meta_re.finditer(html)}
@dataclass
 class Conference:
    """
    A conference web site to probe for a given year.

    The site URL is produced from a format template, the result of the most
    recent probe is kept on the instance, and a notification mail is sent
    when the site is found to be live.
    """
    # Display name, used in the notification mail subject and body.
    name: str
    # URL template; formatted with `year` and `two_digit_year` in build_url().
    src_url: str
    # Year being checked.
    year: int
    # HTTP response stored by check(); None until a request has succeeded.
    response: requests.models.Response | None = None
    # Set by check_web_site() when the final URL differs from `url`.
    redirect_to_url: str | None = None
    @property
    def url(self) -> str:
        """Conference URL for `self.year`."""
        return self.build_url(year=self.year)
    def build_url(self, year: int) -> str:
        """
        Build the conference URL for the given year.

        The template receives `year` and `two_digit_year` (year modulo 2000).
        """
        return self.src_url.format(year=year, two_digit_year=year % 2000)
    @property
    def past_url(self) -> str:
        """URL for the year before `self.year`."""
        return self.build_url(year=self.year - 1)
    def check(self) -> tuple[bool, str, str | None]:
        """
        Check if conference is live.

        Returns a tuple (live, message, final_url). When live, message is the
        value returned by get_title() for the page; otherwise it is the reason
        the site was rejected. final_url is the response URL, or None when the
        connection failed.
        """
        # Hostnames that are fetched with the alternative session.
        # NOTE(review): how s_no_dot differs from s is defined outside this view.
        no_dot = {"bsideskbh.dk", "pif.camp"}
        url = self.url
        session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
        try:
            r = session.get(url)
        except requests.exceptions.ConnectionError:
            return (False, "connection refused", None)
        # Keep the response so og_tags() can read it later.
        self.response = r
        not_here = find_not_here_message(r.text)
        # A short page with a meta refresh that never mentions the year is
        # reported as a redirect to a URL without the year.
        if (
            len(r.text) < 2048
            and 'http-equiv="refresh"' in r.text
            and str(self.year) not in r.text
        ):
            return (False, "redirect to URL without year", r.url)
        # Ending up on last year's URL means this year's site is not up yet.
        if normalize_url(r.url) == normalize_url(self.past_url):
            return (False, "redirect to previous year", r.url)
        # The redirect checks above take precedence over a "not here" message.
        if not_here:
            return (False, not_here, r.url)
        return (True, get_title(r.text), r.url)
    def og_tags(self) -> dict[str, str]:
        """Open Graph tags from the stored response; empty if there is none."""
        return parse_opengraph_tags(self.response.text) if self.response else {}
    def check_web_site(self) -> bool:
        """
        Check if an individual web site is live and send a mail if it is.

        Returns True when the site is live (after calling send_mail), else False.
        """
        # The template must contain a year placeholder.
        assert "{year}" in self.src_url or "{two_digit_year}" in self.src_url
        live, msg, redirect_to_url = self.check()
        if not live:
            return False
        # One "key: value" line per Open Graph tag, each preceded by a newline.
        og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items())
        if og:
            og = "\n\nOpen Graph\n\n" + og
        if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(
            self.url
        ):
            # Final URL matches the requested one: no redirect to report.
            body = f"{self.name}\n{self.url}\n"
        else:
            self.redirect_to_url = redirect_to_url
            body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n"
        # NOTE(review): the trailing "" is an empty literal concatenated to the
        # f-string; it adds nothing to the result.
        body += f"Web page title: {msg}{og}" ""
        send_mail(f"Conference site live: {self.name} - {self.year}", body)
        return True
 def send_mail(subject: str, body: str) -> None:
@ -274,6 +185,23 @@ def send_mail(subject: str, body: str) -> None:
    s.quit()
 def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
    """
    Probe one conference site and send a notification mail when it is live.

    Returns the live flag reported by check_conference().
    """
    assert "{year}" in src_url
    is_live, title, final_url = check_conference(name, src_url, year)
    expected_url = src_url.format(year=year)
    # Nothing to report unless the site is up.
    if not is_live:
        return is_live
    landed_on_expected = final_url and normalize_url(final_url) == normalize_url(
        expected_url
    )
    if landed_on_expected:
        body = f"{name}\n{expected_url}\nWeb page title: {title}"
    else:
        # The final URL differs from the requested one, so mention the redirect.
        body = f"""{name}
 {expected_url} redirects to {final_url}
 Web page title: {title}"""
    send_mail(f"Conference site live: {name} - {year}", body)
    return is_live
 def find_new_conference_web_sites(
    today: date, live: list[LiveConference]
 ) -> list[LiveConference]:
@ -284,21 +212,12 @@ def find_new_conference_web_sites(
    live_set = {(c["conference"], c["year"]) for c in live}
    for name, src_url in load_yaml("conferences").items():
-        for year in (this_year, this_year + 1):
+        new += [
-            if (name, year) in live_set:
+            {"conference": name, "year": year, "live": today}
-                continue
+            for year in (this_year, this_year + 1)
-            conf = Conference(name, src_url, year)
+            if (name, year) not in live_set
-            if not conf.check_web_site():
+            and check_conference_web_site(name, src_url, year)
-                continue
+        ]
            c: LiveConference = {
                "conference": name,
                "year": year,
                "live": today,
                "url": conf.url,
            }
            if conf.redirect_to_url:
                c["redirect_to_url"] = conf.redirect_to_url
            new.append(c)
    return new
--- a/conference/init.py
+++ b/conference/init.py
@ -28,7 +28,6 @@ class LiveConference(typing.TypedDict, total=False):
    year: int
    live: date
    url: str | None
    redirect_to_url: str | None
 def load_yaml(name: str) -> typing.Any:
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1 @@
 requests
 cloudscraper
Author	SHA1	Message	Date
Edward Betts	fee54a37e7	User cloudscraper	2025-01-11 16:43:51 +00:00
Edward Betts	35c213110d	Add more strings to not_here_list.	2024-11-01 09:23:18 +00:00