#!/usr/bin/python3
"""Render ``book_list.html`` from saved shelf and genre pages.

Every file in the ``shelf`` and ``genre`` directories is parsed for
``bookInformation`` blocks.  The books found are ordered by rating (highest
first), de-duplicated by URL, filtered against the titles listed in
``calibre_book_list.json`` and rendered with the ``books.html`` template
from the ``templates`` directory.
"""

import json
import os
import re
import sys
from pprint import pprint
from time import sleep

import jinja2
import lxml.html
import requests

parser = lxml.html.HTMLParser(encoding="utf-8")
env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))

# Patterns for JavaScript embedded in the pages.  The parsing path below does
# not reference them; they are kept as module-level names.
# NOTE(review): both patterns reached review with "$" where escaped
# parentheses are expected; the literal "\(" / "\)" were restored -- confirm.
re_book = re.compile(
    r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
)
re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')

dirs = ["shelf", "genre"]
start = "https://www.goodreads.com/book/show/"

# Separators tried, in this order, when shortening a title.
TITLE_SEPARATORS = (":", " - ", "(", ",")

# Optional steps, both switched off.
DOWNLOAD_PAGES = False  # save each listed book's page under books/
LIST_FIRST_AUTHORS = False  # print the first author of every distinct book


def _parse_book(div):
    """Build the book dict for one ``bookInformation`` element.

    The ``description`` key is only present when the element has a direct
    ``bookDescription`` child.
    """
    link = div.find(".//a")
    minirating = div.find('.//span[@class="minirating"]')
    # The rating text follows the first child element of the minirating span.
    rating_text = minirating[0].tail.strip()
    book = {
        "title": link.text,
        "url": link.get("href"),
        "rating": rating_text,
        # Numeric sort key: the first three characters of the rating text.
        "r": float(rating_text[:3]),
        # The cover image is looked up in the element that follows the block.
        "cover": div.getnext().find(".//img").get("src"),
        "authors": [a.text for a in div.find_class("authorName")],
    }
    description = div.find('./div[@class="bookDescription"]')
    if description is not None:
        # With exactly three children the text is taken from the second one,
        # otherwise from the first.
        index = 1 if len(description) == 3 else 0
        book["description"] = description[index].text
    return book


def iter_books():
    """Yield ``(directory, filename, book)`` for every book block on disk.

    Directories are visited in the order given by ``dirs`` and the files of
    each directory in sorted order.
    """
    for d in dirs:
        for f in sorted(os.listdir(d)):
            root = lxml.html.parse(os.path.join(d, f), parser=parser).getroot()
            for div in root.find_class("bookInformation"):
                yield d, f, _parse_book(div)


def _strip_query(url):
    """Return *url* up to, but not including, the first ``?``.

    ``str.partition`` leaves a URL without a query string untouched, whereas
    slicing with ``url.find("?")`` would cut off its last character.
    """
    return url.partition("?")[0]


def _main_title(title):
    """Return *title* cut at the first separator from ``TITLE_SEPARATORS``.

    Separators are tried in priority order, not by position in the title; a
    title containing none of them is returned unchanged.
    """
    for sep in TITLE_SEPARATORS:
        if sep in title:
            return title[: title.find(sep)]
    return title


def main():
    """Collect the books, filter them and write ``book_list.html``."""
    with open("calibre_book_list.json", encoding="utf-8") as fh:
        existing = {book["title"] for book in json.load(fh)}

    template = env.get_template("books.html")

    seen = set()
    books = []
    first_authors = set()

    ranked = sorted(iter_books(), key=lambda item: item[2]["r"], reverse=True)
    for d, f, book in ranked:
        url = _strip_query(book["url"])
        if url in seen:
            continue
        seen.add(url)

        title = book["title"]
        authors = book["authors"]
        first_authors.add(authors[0])

        main_title = _main_title(title)
        if len(main_title) < 10:
            continue
        if main_title in existing:
            continue
        print(main_title)

        assert url.startswith(start)
        filename = "books/" + url[len(start):] + ".html"
        if DOWNLOAD_PAGES and not os.path.exists(filename):
            # Fetch before opening so a failed request leaves no empty file
            # behind; the response body is bytes, hence binary mode.
            content = requests.get(url).content
            with open(filename, "wb") as out:
                out.write(content)
            sleep(1)

        books.append(
            {
                "dir": d,
                # Drop the last five characters of the file name
                # (presumably the ".html" suffix).
                "file": f[:-5],
                "title": title,
                "main_title": main_title,
                "authors": authors,
                "url": url,
                "rating": book["rating"],
                "cover": book["cover"],
                "description": book.get("description"),
            }
        )

    page = template.render(books=books)
    # Explicit encoding so the output does not depend on the locale.
    with open("book_list.html", "w", encoding="utf-8") as out:
        out.write(page)

    if LIST_FIRST_AUTHORS:
        for author in sorted(first_authors):
            print(author)


if __name__ == "__main__":
    sys.exit(main())