diff --git a/get.py b/get.py index e654472..150cc35 100755 --- a/get.py +++ b/get.py @@ -1,13 +1,16 @@ #!/usr/bin/python3 -import requests -from http.cookiejar import LWPCookieJar +"""Download shelves from goodreads.""" + import os import re -import lxml.html +from http.cookiejar import LWPCookieJar from random import shuffle from time import sleep +import lxml.html +import requests + re_recommend = re.compile( ' ' ) @@ -20,10 +23,11 @@ cookie_file = os.path.join(cookie_dir, "goodreads") cj = LWPCookieJar(cookie_file) if os.path.exists(cookie_file): cj.load() -s.cookies = cj +s.cookies = cj # type: ignore -def login(): +def login() -> None: + """Login.""" sign_in_page = "https://www.goodreads.com/user/sign_in" page = s.get(sign_in_page).text open("sign_in.html", "w").write(page) @@ -33,9 +37,12 @@ def login(): re_token = re.compile( '' ) - re_n = re.compile("") + re_n = re.compile(r"") + m_n = re_n.search(page) + m_token = re_token.search(page) - token = re_token.search(page).group(1) + assert m_token and m_n + token = m_token.group(1) data = { "utf8": "\u2713", @@ -44,7 +51,7 @@ def login(): "user[password]": "8V8~9:3~U!Ly", "remember_me": 1, "next": "Sign in", - "n": re_n.search(page).group(1), + "n": m_n.group(1), } print(token) @@ -62,7 +69,8 @@ def login(): cj.save(ignore_discard=True) -def get_index(): +def get_index() -> None: + """Get index.""" # url = 'https://www.goodreads.com/recommendations' url = "https://www.goodreads.com/recommendations/?recs_current_view=list" @@ -71,6 +79,7 @@ def get_index(): def get_individual(): + """Get individual page.""" for line in open("recommendations.html"): if "actionLinkLite" not in line: continue @@ -79,19 +88,25 @@ def get_individual(): yield m.groups() -# art = 'https://www.goodreads.com/recommendations/genre/art' -login() -get_index() -recommend_list = list(get_individual()) -shuffle(recommend_list) +def main() -> None: + """Login and download shelves.""" + # art = 'https://www.goodreads.com/recommendations/genre/art' + login() + get_index() + recommend_list = list(get_individual()) + shuffle(recommend_list) -headers = {"Accept": "text/html"} + headers = {"Accept": "text/html"} -for a, b, c in recommend_list: - print((b, c)) - url = "https://www.goodreads.com" + a + for a, b, c in recommend_list: + print((b, c)) + url = "https://www.goodreads.com" + a - r = s.get(url, headers=headers) - filename = os.path.join(b, c + ".html") - open(filename, "w").write(r.text) - sleep(0.5) + r = s.get(url, headers=headers) + filename = os.path.join(b, c + ".html") + open(filename, "w").write(r.text) + sleep(0.5) + + +if __name__ == "__main__": + main() diff --git a/parse.py b/parse.py index f0f6294..f2da18b 100755 --- a/parse.py +++ b/parse.py @@ -1,26 +1,28 @@ #!/usr/bin/python3 + +import json import os import re -import lxml.html -import jinja2 -import json import sys -import requests +import typing from time import sleep -from pprint import pprint + +import jinja2 +import lxml.html +import requests parser = lxml.html.HTMLParser(encoding="utf-8") env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates")) re_book = re.compile( - "function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL + r"function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL ) re_book = re.compile( - "function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL + r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL ) -re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') +re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') dirs = ["shelf", "genre"] start = "https://www.goodreads.com/book/show/" @@ -28,30 +30,44 @@ start = "https://www.goodreads.com/book/show/" existing = {book["title"] for book in json.load(open("calibre_book_list.json"))} +def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]: + """Parse book div.""" + link = div.find(".//a") + assert link is not None + rating = div.find('.//span[@class="minirating"]') + description = div.find('./div[@class="bookDescription"]') + assert rating is not None and rating[0] is not None + r = rating[0].tail + assert r + + div_next = div.getnext() + assert div_next is not None + cover_img = div_next.find(".//img") + assert cover_img is not None + cover = cover_img.get("src") + + book = { + "title": link.text, + "url": link.get("href"), + "rating": r, + "r": float(r[:3].strip()), + "cover": cover, + "authors": [a.text for a in div.find_class("authorName")], + } + if description is not None: + index = 1 if len(description) == 3 else 0 + book["description"] = description[index].text + + return book + + def iter_books(): for d in dirs: for f in sorted(os.listdir(d)): filename = os.path.join(d, f) root = lxml.html.parse(filename, parser=parser).getroot() for div in root.find_class("bookInformation"): - link = div.find(".//a") - rating = div.find('.//span[@class="minirating"]') - description = div.find('./div[@class="bookDescription"]') - r = rating[0].tail.strip() - - cover = div.getnext().find(".//img").get("src") - - book = { - "title": link.text, - "url": link.get("href"), - "rating": r, - "r": float(r[:3]), - "cover": cover, - "authors": [a.text for a in div.find_class("authorName")], - } - if description is not None: - index = 1 if len(description) == 3 else 0 - book["description"] = description[index].text + book = parse_book_div(div) yield d, f, book continue @@ -61,7 +77,7 @@ def iter_books(): continue print(line) m = re_tip.search(line) - tip = m.group(1).decode("unicode_escape").replace("\/", "/") + tip = m.group(1).decode("unicode_escape").replace(r"\/", "/") # tip = m.group(1) # .replace('\/', '/') # print(tip) if '