#!/usr/bin/python3
"""Parse shelves and output web page."""

import json
import os
import re
import sys
import typing
from time import sleep

import jinja2
import lxml.html
import requests

# Shared HTML parser; every saved page is decoded as UTF-8.
parser = lxml.html.HTMLParser(encoding="utf-8")
# Templates are looked up in the "templates" directory relative to the
# current working directory.
env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))

# Patterns for a ``refreshGroupBox`` function body and a ``new Tip(...)`` call.
# The literal parentheses must be escaped, otherwise they are treated as regex
# groups and the patterns can never match the text they describe.
# Neither pattern is used by the functions in this file; they are kept as
# module-level names.
re_book = re.compile(
    r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
)
re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')

# Directories (relative to the working directory) holding the saved pages.
dirs = ["shelf", "genre"]
# Every book URL is expected to begin with this prefix.
start = "https://www.goodreads.com/book/show/"

# Titles listed in calibre_book_list.json; books whose main title appears here
# are left out of the generated page.
with open("calibre_book_list.json", encoding="utf-8") as _calibre_file:
    existing = {book["title"] for book in json.load(_calibre_file)}


def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]:
    """Parse book div.

    Args:
        div: A ``bookInformation`` element. The cover image is read from the
            element that immediately follows it in the document.

    Returns:
        A dict with the keys ``title``, ``url``, ``rating`` (the raw rating
        text), ``r`` (the rating as a float), ``cover`` and ``authors``, plus
        ``description`` when the div contains a ``bookDescription`` child.

    Raises:
        AssertionError: If the link, rating, following sibling or cover image
            is missing.
        ValueError: If the start of the rating text is not a number.
    """
    link = div.find(".//a")
    assert link is not None
    rating = div.find('.//span[@class="minirating"]')
    description = div.find('./div[@class="bookDescription"]')
    assert rating is not None and rating[0] is not None
    # The rating text is the tail that follows the first child of the span.
    r = rating[0].tail
    assert r

    div_next = div.getnext()
    assert div_next is not None
    cover_img = div_next.find(".//img")
    assert cover_img is not None
    cover = cover_img.get("src")

    book = {
        "title": link.text,
        "url": link.get("href"),
        "rating": r,
        # Only the first three characters of the rating text are parsed.
        "r": float(r[:3].strip()),
        "cover": cover,
        "authors": [a.text for a in div.find_class("authorName")],
    }

    if description is not None:
        # With exactly three children the text is taken from the second
        # child, otherwise from the first.
        index = 1 if len(description) == 3 else 0
        book["description"] = description[index].text

    return book


def iter_books() -> typing.Iterator[tuple[str, str, dict[str, typing.Any]]]:
    """Yield ``(directory, filename, book)`` for every book in the saved pages.

    Each file in each directory listed in ``dirs`` is parsed as HTML, and one
    item is produced per ``bookInformation`` element found in it. Files are
    visited in sorted name order within each directory.
    """
    for d in dirs:
        for f in sorted(os.listdir(d)):
            filename = os.path.join(d, f)
            root = lxml.html.parse(filename, parser=parser).getroot()
            for div in root.find_class("bookInformation"):
                yield d, f, parse_book_div(div)


def _main_title(title: str) -> str:
    """Return *title* cut at the first separator that occurs in it.

    Separators are tried in priority order; the title is returned unchanged
    when none of them is present.
    """
    for sep in (":", " - ", "(", ","):
        if sep in title:
            # NOTE(review): the result is not stripped, so a title such as
            # "Name (Series)" yields "Name " with a trailing space - confirm
            # whether that is intended when comparing against ``existing``.
            return title.partition(sep)[0]
    return title


def _download_book_page(url: str, filename: str) -> None:
    """Save the response body of *url* to *filename*, then pause one second."""
    response = requests.get(url)
    # ``response.content`` is bytes, so the file must be opened in binary mode.
    with open(filename, "wb") as out:
        out.write(response.content)
    sleep(1)


def main(*, download: bool = False) -> None:
    """Download books.

    Collects every parsed book, highest rated first, drops duplicates, short
    titles and titles already present in ``existing``, prints each remaining
    main title and renders the result to ``book_list.html`` using the
    ``books.html`` template.

    Args:
        download: When true, also save each listed book's page under
            ``books/`` unless the file already exists. Off by default.
    """
    template = env.get_template("books.html")
    seen: set[str] = set()
    books = []

    ranked = sorted(iter_books(), key=lambda item: item[2]["r"], reverse=True)
    for d, f, book in ranked:
        # Drop the query string. ``partition`` leaves a URL without "?"
        # untouched, whereas slicing with ``find`` would cut off its last
        # character.
        url = book["url"].partition("?")[0]
        if url in seen:
            continue
        seen.add(url)

        title = book["title"]
        authors = book["authors"]
        main_title = _main_title(title)

        if len(main_title) < 10:
            continue
        if main_title in existing:
            continue
        print(main_title)

        assert url.startswith(start)
        if download:
            filename = "books/" + url[len(start) :] + ".html"
            if not os.path.exists(filename):
                _download_book_page(url, filename)

        books.append(
            {
                "dir": d,
                # Drops the last five characters of the file name
                # (presumably the ".html" extension).
                "file": f[:-5],
                "title": title,
                "main_title": main_title,
                "authors": authors,
                "url": url,
                "rating": book["rating"],
                "cover": book["cover"],
                "description": book.get("description"),
            }
        )

    page = template.render(books=books)
    with open("book_list.html", "w", encoding="utf-8") as out:
        out.write(page)


if __name__ == "__main__":
    main()