2 changed files with 106 additions and 144 deletions
--- a/get.py
+++ b/get.py
@ -1,16 +1,13 @@
 #!/usr/bin/python3
-"""Download shelves from goodreads."""
+import requests
-
+from http.cookiejar import LWPCookieJar
 import os
 import re
-from http.cookiejar import LWPCookieJar
+import lxml.html
 from random import shuffle
 from time import sleep
 import lxml.html
 import requests
 re_recommend = re.compile(
    '  <a class="actionLinkLite " href="(/recommendations/([^/]*?)/([^/]*?))">'
 )
@ -23,11 +20,10 @@ cookie_file = os.path.join(cookie_dir, "goodreads")
 cj = LWPCookieJar(cookie_file)
 if os.path.exists(cookie_file):
    cj.load()
-s.cookies = cj  # type: ignore
+s.cookies = cj
-def login() -> None:
+def login():
    """Login."""
    sign_in_page = "https://www.goodreads.com/user/sign_in"
    page = s.get(sign_in_page).text
    open("sign_in.html", "w").write(page)
@ -37,12 +33,9 @@ def login() -> None:
    re_token = re.compile(
        '<input type="hidden" name="authenticity_token" value="([^"]*?)" />'
    )
-    re_n = re.compile(r"<input name='n' type='hidden' value='(\d+)'>")
+    re_n = re.compile("<input name='n' type='hidden' value='(\d+)'>")
    m_n = re_n.search(page)
    m_token = re_token.search(page)
-    assert m_token and m_n
+    token = re_token.search(page).group(1)
    token = m_token.group(1)
    data = {
        "utf8": "\u2713",
@ -51,7 +44,7 @@ def login() -> None:
        "user[password]": "8V8~9:3~U!Ly",
        "remember_me": 1,
        "next": "Sign in",
-        "n": m_n.group(1),
+        "n": re_n.search(page).group(1),
    }
    print(token)
@ -69,8 +62,7 @@ def login() -> None:
    cj.save(ignore_discard=True)
-def get_index() -> None:
+def get_index():
    """Get index."""
    # url = 'https://www.goodreads.com/recommendations'
    url = "https://www.goodreads.com/recommendations/?recs_current_view=list"
@ -79,7 +71,6 @@ def get_index() -> None:
 def get_individual():
    """Get individual page."""
    for line in open("recommendations.html"):
        if "actionLinkLite" not in line:
            continue
@ -88,25 +79,19 @@ def get_individual():
            yield m.groups()
-def main() -> None:
+# art = 'https://www.goodreads.com/recommendations/genre/art'
-    """Login and download shelves."""
+login()
-    # art = 'https://www.goodreads.com/recommendations/genre/art'
+get_index()
-    login()
+recommend_list = list(get_individual())
-    get_index()
+shuffle(recommend_list)
    recommend_list = list(get_individual())
    shuffle(recommend_list)
-    headers = {"Accept": "text/html"}
+headers = {"Accept": "text/html"}
-    for a, b, c in recommend_list:
+for a, b, c in recommend_list:
-        print((b, c))
+    print((b, c))
-        url = "https://www.goodreads.com" + a
+    url = "https://www.goodreads.com" + a
-        r = s.get(url, headers=headers)
+    r = s.get(url, headers=headers)
-        filename = os.path.join(b, c + ".html")
+    filename = os.path.join(b, c + ".html")
-        open(filename, "w").write(r.text)
+    open(filename, "w").write(r.text)
-        sleep(0.5)
+    sleep(0.5)
 if __name__ == "__main__":
    main()
--- a/parse.py
+++ b/parse.py
@ -1,28 +1,26 @@
 #!/usr/bin/python3
 import json
 import os
 import re
 import sys
 import typing
 from time import sleep
 import jinja2
 import lxml.html
 import jinja2
 import json
 import sys
 import requests
 from time import sleep
 from pprint import pprint
 parser = lxml.html.HTMLParser(encoding="utf-8")
 env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))
 re_book = re.compile(
-    r"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
+    "function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
 )
 re_book = re.compile(
-    r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
+    "function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
 )
-re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
+re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
 dirs = ["shelf", "genre"]
 start = "https://www.goodreads.com/book/show/"
@ -30,44 +28,30 @@ start = "https://www.goodreads.com/book/show/"
 existing = {book["title"] for book in json.load(open("calibre_book_list.json"))}
 def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]:
    """Parse book div."""
    link = div.find(".//a")
    assert link is not None
    rating = div.find('.//span[@class="minirating"]')
    description = div.find('./div[@class="bookDescription"]')
    assert rating is not None and rating[0] is not None
    r = rating[0].tail
    assert r
    div_next = div.getnext()
    assert div_next is not None
    cover_img = div_next.find(".//img")
    assert cover_img is not None
    cover = cover_img.get("src")
    book = {
        "title": link.text,
        "url": link.get("href"),
        "rating": r,
        "r": float(r[:3].strip()),
        "cover": cover,
        "authors": [a.text for a in div.find_class("authorName")],
    }
    if description is not None:
        index = 1 if len(description) == 3 else 0
        book["description"] = description[index].text
    return book
 def iter_books():
    for d in dirs:
        for f in sorted(os.listdir(d)):
            filename = os.path.join(d, f)
            root = lxml.html.parse(filename, parser=parser).getroot()
            for div in root.find_class("bookInformation"):
-                book = parse_book_div(div)
+                link = div.find(".//a")
                rating = div.find('.//span[@class="minirating"]')
                description = div.find('./div[@class="bookDescription"]')
                r = rating[0].tail.strip()
                cover = div.getnext().find(".//img").get("src")
                book = {
                    "title": link.text,
                    "url": link.get("href"),
                    "rating": r,
                    "r": float(r[:3]),
                    "cover": cover,
                    "authors": [a.text for a in div.find_class("authorName")],
                }
                if description is not None:
                    index = 1 if len(description) == 3 else 0
                    book["description"] = description[index].text
                yield d, f, book
            continue
@ -77,7 +61,7 @@ def iter_books():
                    continue
                print(line)
                m = re_tip.search(line)
-                tip = m.group(1).decode("unicode_escape").replace(r"\/", "/")
+                tip = m.group(1).decode("unicode_escape").replace("\/", "/")
                # tip = m.group(1)  # .replace('\/', '/')
                # print(tip)
                if '<ul class="formatting_tips recommendation_tip">' in tip:
@ -90,72 +74,65 @@ def iter_books():
                yield (d, f, lxml.html.fromstring(tip))
-def main() -> None:
+template = env.get_template("books.html")
-    """Download books."""
+seen = set()
-    template = env.get_template("books.html")
+books = []
-    seen = set()
+first_authors = set()
-    books = []
+for d, f, book in sorted(iter_books(), key=lambda i: i[2]["r"], reverse=True):
-    first_authors = set()
+    # pprint(book)
-    for d, f, book in sorted(iter_books(), key=lambda i: i[2]["r"], reverse=True):
+    # print(repr(book.get('description')))
-        # pprint(book)
+    # continue
        # print(repr(book.get('description')))
        # continue
-        # title_link = book.find_class('bookTitle')[0]
+    # title_link = book.find_class('bookTitle')[0]
-        url = book["url"]
+    url = book["url"]
-        url = url[: url.find("?")]
+    url = url[: url.find("?")]
-        if url in seen:
+    if url in seen:
-            continue
+        continue
-        seen.add(url)
+    seen.add(url)
-        title = book["title"]
+    title = book["title"]
-        authors = book["authors"]
+    authors = book["authors"]
-        first_authors.add(authors[0])
+    first_authors.add(authors[0])
-        main_title = title
+    main_title = title
-        # for sep in ['(']:
+    # for sep in ['(']:
-        for sep in ":", " - ", "(", ",":
+    for sep in ":", " - ", "(", ",":
-            if sep in title:
+        if sep in title:
-                main_title = title[: title.find(sep)]
+            main_title = title[: title.find(sep)]
-                break
+            break
-        # print((main_title + ' by ' + u', '.join(authors)).encode('utf-8'))
+    # print((main_title + ' by ' + u', '.join(authors)).encode('utf-8'))
-        if len(main_title) < 10:
+    if len(main_title) < 10:
-            continue
+        continue
-        if main_title in existing:
+    if main_title in existing:
-            continue
+        continue
-        # print(u'{} by {}'.format(main_title, authors[0]).encode('utf-8'))
+    # print(u'{} by {}'.format(main_title, authors[0]).encode('utf-8'))
-        print("{}".format(main_title))
+    print("{}".format(main_title))
-        # print(main_title.encode('utf-8'))
+    # print(main_title.encode('utf-8'))
-        assert url.startswith(start)
+    assert url.startswith(start)
-        filename = "books/" + url[len(start) :] + ".html"
+    filename = "books/" + url[len(start) :] + ".html"
-        # print(filename)
+    # print(filename)
-        if False and not os.path.exists(filename):
+    if False and not os.path.exists(filename):
-            open(filename, "w").write(requests.get(url).content)
+        open(filename, "w").write(requests.get(url).content)
-            sleep(1)
+        sleep(1)
-        books.append(
+    books.append(
-            {
+        {
-                "dir": d,
+            "dir": d,
-                "file": f[:-5],
+            "file": f[:-5],
-                "title": title,
+            "title": title,
-                "main_title": main_title,
+            "main_title": main_title,
-                "authors": authors,
+            "authors": authors,
-                "url": url,
+            "url": url,
-                "rating": book["rating"],
+            "rating": book["rating"],
-                "cover": book["cover"],
+            "cover": book["cover"],
-                "description": book.get("description"),
+            "description": book.get("description"),
-            }
+        }
-        )
+    )
-    page = template.render(books=books)
+page = template.render(books=books)
-    open("book_list.html", "w").write(page)
+open("book_list.html", "w").write(page)
-    sys.exit(0)
+sys.exit(0)
-    for a in sorted(first_authors):
+for a in sorted(first_authors):
-        print(a)
+    print(a)
-    # authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors)
+# authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors) if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
-    # if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
+# print(authors.encode('utf-8'))
    # print(authors.encode('utf-8'))
 if __name__ == "__main__":
    main()