diff --git a/get.py b/get.py
index 150cc35..e654472 100755
--- a/get.py
+++ b/get.py
@@ -1,16 +1,13 @@
 #!/usr/bin/python3
-"""Download shelves from goodreads."""
-
+import requests
+from http.cookiejar import LWPCookieJar
 import os
 import re
-from http.cookiejar import LWPCookieJar
+import lxml.html
 from random import shuffle
 from time import sleep

-import lxml.html
-import requests
-

 re_recommend = re.compile(
     ' '
 )
@@ -23,11 +20,10 @@ cookie_file = os.path.join(cookie_dir, "goodreads")
 cj = LWPCookieJar(cookie_file)
 if os.path.exists(cookie_file):
     cj.load()
-s.cookies = cj  # type: ignore
+s.cookies = cj


-def login() -> None:
-    """Login."""
+def login():
     sign_in_page = "https://www.goodreads.com/user/sign_in"
     page = s.get(sign_in_page).text
     open("sign_in.html", "w").write(page)
@@ -37,12 +33,9 @@ def login() -> None:
     re_token = re.compile(
         ''
     )
-    re_n = re.compile(r"")
-    m_n = re_n.search(page)
-    m_token = re_token.search(page)
+    re_n = re.compile("")

-    assert m_token and m_n
-    token = m_token.group(1)
+    token = re_token.search(page).group(1)

     data = {
         "utf8": "\u2713",
@@ -51,7 +44,7 @@ def login() -> None:
         "user[password]": "8V8~9:3~U!Ly",
         "remember_me": 1,
         "next": "Sign in",
-        "n": m_n.group(1),
+        "n": re_n.search(page).group(1),
     }

     print(token)
@@ -69,8 +62,7 @@ def login() -> None:
     cj.save(ignore_discard=True)


-def get_index() -> None:
-    """Get index."""
+def get_index():
     # url = 'https://www.goodreads.com/recommendations'
     url = "https://www.goodreads.com/recommendations/?recs_current_view=list"

@@ -79,7 +71,6 @@


 def get_individual():
-    """Get individual page."""
     for line in open("recommendations.html"):
         if "actionLinkLite" not in line:
             continue
@@ -88,25 +79,19 @@ def get_individual():
         yield m.groups()


-def main() -> None:
-    """Login and download shelves."""
-    # art = 'https://www.goodreads.com/recommendations/genre/art'
-    login()
-    get_index()
-    recommend_list = list(get_individual())
-    shuffle(recommend_list)
+# art = 'https://www.goodreads.com/recommendations/genre/art'
+login()
+get_index()
+recommend_list = list(get_individual())
+shuffle(recommend_list)

-    headers = {"Accept": "text/html"}
+headers = {"Accept": "text/html"}

-    for a, b, c in recommend_list:
-        print((b, c))
-        url = "https://www.goodreads.com" + a
+for a, b, c in recommend_list:
+    print((b, c))
+    url = "https://www.goodreads.com" + a

-        r = s.get(url, headers=headers)
-        filename = os.path.join(b, c + ".html")
-        open(filename, "w").write(r.text)
-        sleep(0.5)
-
-
-if __name__ == "__main__":
-    main()
+    r = s.get(url, headers=headers)
+    filename = os.path.join(b, c + ".html")
+    open(filename, "w").write(r.text)
+    sleep(0.5)
diff --git a/parse.py b/parse.py
index f2da18b..f0f6294 100755
--- a/parse.py
+++ b/parse.py
@@ -1,28 +1,26 @@
 #!/usr/bin/python3
-
-import json
 import os
 import re
-import sys
-import typing
-from time import sleep
-
-import jinja2
 import lxml.html
+import jinja2
+import json
+import sys
 import requests
+from time import sleep
+from pprint import pprint

 parser = lxml.html.HTMLParser(encoding="utf-8")
 env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))

 re_book = re.compile(
-    r"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
+    "function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL
 )
 re_book = re.compile(
-    r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
+    "function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
 )
-re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
+re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')

 dirs = ["shelf", "genre"]
 start = "https://www.goodreads.com/book/show/"
@@ -30,44 +28,30 @@
 existing = {book["title"] for book in json.load(open("calibre_book_list.json"))}


-def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]:
-    """Parse book div."""
-    link = div.find(".//a")
-    assert link is not None
-    rating = div.find('.//span[@class="minirating"]')
-    description = div.find('./div[@class="bookDescription"]')
-    assert rating is not None and rating[0] is not None
-    r = rating[0].tail
-    assert r
-
-    div_next = div.getnext()
-    assert div_next is not None
-    cover_img = div_next.find(".//img")
-    assert cover_img is not None
-    cover = cover_img.get("src")
-
-    book = {
-        "title": link.text,
-        "url": link.get("href"),
-        "rating": r,
-        "r": float(r[:3].strip()),
-        "cover": cover,
-        "authors": [a.text for a in div.find_class("authorName")],
-    }
-    if description is not None:
-        index = 1 if len(description) == 3 else 0
-        book["description"] = description[index].text
-
-    return book
-
-
 def iter_books():
     for d in dirs:
         for f in sorted(os.listdir(d)):
             filename = os.path.join(d, f)
             root = lxml.html.parse(filename, parser=parser).getroot()
             for div in root.find_class("bookInformation"):
-                book = parse_book_div(div)
+                link = div.find(".//a")
+                rating = div.find('.//span[@class="minirating"]')
+                description = div.find('./div[@class="bookDescription"]')
+                r = rating[0].tail.strip()
+
+                cover = div.getnext().find(".//img").get("src")
+
+                book = {
+                    "title": link.text,
+                    "url": link.get("href"),
+                    "rating": r,
+                    "r": float(r[:3]),
+                    "cover": cover,
+                    "authors": [a.text for a in div.find_class("authorName")],
+                }
+                if description is not None:
+                    index = 1 if len(description) == 3 else 0
+                    book["description"] = description[index].text
                 yield d, f, book
             continue
@@ -77,7 +61,7 @@ def iter_books():
                     continue
                 print(line)
                 m = re_tip.search(line)
-                tip = m.group(1).decode("unicode_escape").replace(r"\/", "/")
+                tip = m.group(1).decode("unicode_escape").replace("\/", "/")
                 # tip = m.group(1)  # .replace('\/', '/')
                 # print(tip)
                 if '