diff --git a/get.py b/get.py
new file mode 100755
--- /dev/null
+++ b/get.py
@@ -0,0 +1,138 @@
+#!/usr/bin/python3
+"""Download the signed-in user's Goodreads recommendation pages.
+
+Signs in (re-using cookies saved on disk when they are still valid), saves
+the recommendations index to recommendations.html, then saves every page
+linked from it as <category>/<name>.html under the current directory.
+
+The account password is read from the GOODREADS_PASSWORD environment
+variable; GOODREADS_EMAIL optionally overrides the account e-mail address.
+"""
+
+import requests
+from http.cookiejar import LWPCookieJar
+import os
+import re
+import lxml.html
+from random import shuffle
+from time import sleep
+
+# Captures (path, category, name) from a recommendation link.  Whitespace is
+# matched loosely so the indentation of the saved page does not matter.
+re_recommend = re.compile(
+    r'\s*<a class="actionLinkLite\s*"\s+'
+    r'href="(/recommendations/([^/]*?)/([^/]*?))">')
+
+s = requests.Session()
+
+cookie_dir = '/home/edward/lib/cookies'
+cookie_file = os.path.join(cookie_dir, 'goodreads')
+
+cj = LWPCookieJar(cookie_file)
+if os.path.exists(cookie_file):
+    cj.load()
+s.cookies = cj
+
+
+def write_text(filename, text):
+    """Write text to filename as UTF-8 and close the file."""
+    with open(filename, 'w', encoding='utf-8') as out:
+        out.write(text)
+
+
+def login():
+    """Sign in to Goodreads unless the saved cookies are still valid.
+
+    Saves the sign-in page and the response to the sign-in request to disk,
+    prints any flash message found in the response and writes the session
+    cookies to the cookie jar file.
+
+    Raises:
+        KeyError: a sign-in is needed and GOODREADS_PASSWORD is not set.
+        RuntimeError: the sign-in form fields are missing from the page.
+    """
+    sign_in_page = 'https://www.goodreads.com/user/sign_in'
+    page = s.get(sign_in_page).text
+    write_text('sign_in.html', page)
+    if '"name":"Edward Betts"' in page:
+        return  # already signed in
+
+    re_token = re.compile('<input type="hidden" name="authenticity_token" value="([^"]*?)" />')
+    re_n = re.compile(r"<input name='n' type='hidden' value='(\d+)'>")
+
+    m_token = re_token.search(page)
+    m_n = re_n.search(page)
+    if m_token is None or m_n is None:
+        raise RuntimeError('sign-in form fields not found, see sign_in.html')
+    token = m_token.group(1)
+
+    data = {
+        'utf8': u'\u2713',
+        'authenticity_token': token,
+        'user[email]': os.environ.get('GOODREADS_EMAIL', 'edward@4angle.com'),
+        # Never keep the password in the source; take it from the environment.
+        'user[password]': os.environ['GOODREADS_PASSWORD'],
+        'remember_me': 1,
+        'next': 'Sign in',
+        'n': m_n.group(1),
+    }
+
+    print(token)
+    print(data['n'])
+
+    r = s.post(sign_in_page, data=data, headers={'referer': sign_in_page})
+
+    write_text('signed_in.html', r.text)
+
+    root = lxml.html.fromstring(r.content)
+    flash = root.find_class('flash')
+    if flash:
+        print('flash:', flash[0].text)
+
+    cj.save(ignore_discard=True)
+
+
+def get_index():
+    """Save the list view of the recommendations index to disk."""
+    # url = 'https://www.goodreads.com/recommendations'
+    url = 'https://www.goodreads.com/recommendations/?recs_current_view=list'
+
+    r = s.get(url)
+    write_text('recommendations.html', r.text)
+
+
+def get_individual():
+    """Yield (path, category, name) for each link in recommendations.html."""
+    with open('recommendations.html', encoding='utf-8') as index:
+        for line in index:
+            if 'actionLinkLite' not in line:
+                continue
+            m = re_recommend.match(line)
+            if m:
+                yield m.groups()
+
+
+def main():
+    """Sign in, fetch the index, then fetch each linked page in random order."""
+    # art = 'https://www.goodreads.com/recommendations/genre/art'
+    login()
+    get_index()
+    recommend_list = list(get_individual())
+    shuffle(recommend_list)
+
+    headers = {'Accept': 'text/html'}
+
+    for a, b, c in recommend_list:
+        print((b, c))
+        url = 'https://www.goodreads.com' + a
+
+        r = s.get(url, headers=headers)
+        if b:
+            # The category directory (e.g. genre/) may not exist yet.
+            os.makedirs(b, exist_ok=True)
+        write_text(os.path.join(b, c + '.html'), r.text)
+        sleep(0.5)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/parse.py b/parse.py
new file mode 100755
--- /dev/null
+++ b/parse.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python3
+"""Build book_list.html from saved Goodreads recommendation pages.
+
+Reads the pages stored under shelf/ and genre/, skips duplicate books, short
+titles and titles already present in calibre_book_list.json, and renders the
+rest, highest rated first, with templates/books.html.
+"""
+import os
+import re
+import lxml.html
+import jinja2
+import json
+import sys
+import requests
+from time import sleep
+from pprint import pprint
+
+parser = lxml.html.HTMLParser(encoding='utf-8')
+
+# Book data is scraped from the web, so escape it when it is rendered as HTML.
+env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates'),
+                         autoescape=True)
+
+re_book = re.compile(r'function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}', re.DOTALL)
+
+re_tip = re.compile(br'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')
+
+dirs = ['shelf', 'genre']
+start = 'https://www.goodreads.com/book/show/'
+
+with open('calibre_book_list.json') as calibre_list:
+    existing = {book['title'] for book in json.load(calibre_list)}
+
+
+def iter_books():
+    """Yield (directory, filename, book) for each book on the saved pages.
+
+    book is a dict with the keys title, url, rating (the display text), r (the
+    number the caller sorts by), cover, authors and, when the page has one,
+    description.
+    """
+    for d in dirs:
+        for f in sorted(os.listdir(d)):
+            filename = os.path.join(d, f)
+            root = lxml.html.parse(filename, parser=parser).getroot()
+            for div in root.find_class('bookInformation'):
+                link = div.find('.//a')
+                rating = div.find('.//span[@class="minirating"]')
+                description = div.find('./div[@class="bookDescription"]')
+                r = rating[0].tail.strip()
+
+                cover = div.getnext().find('.//img').get('src')
+
+                book = {
+                    'title': link.text,
+                    'url': link.get('href'),
+                    'rating': r,
+                    'r': float(r[:3]),
+                    'cover': cover,
+                    'authors': [a.text for a in div.find_class('authorName')],
+                }
+                if description is not None:
+                    index = 1 if len(description) == 3 else 0
+                    book['description'] = description[index].text
+                yield d, f, book
+            # NOTE(review): this continue appears to deliberately disable the
+            # older tooltip parser below; it is kept unreachable as before.
+            continue
+
+            # print(filename)
+            for line in open(filename, 'rb'):
+                if b'var newTip' not in line:
+                    continue
+                print(line)
+                m = re_tip.search(line)
+                tip = m.group(1).decode('unicode_escape').replace('\\/', '/')
+                # tip = m.group(1) # .replace('\/', '/')
+                # print(tip)
+                if '<ul class="formatting_tips recommendation_tip">' in tip:
+                    continue
+                if 'Recommendations are disabled for that shelf.' in tip:
+                    continue
+                if 'Customize by selecting your' in tip:
+                    continue
+                print(tip)
+                yield (d, f, lxml.html.fromstring(tip))
+
+
+template = env.get_template('books.html')
+seen = set()
+books = []
+first_authors = set()
+for d, f, book in sorted(iter_books(), key=lambda i: i[2]['r'], reverse=True):
+    # pprint(book)
+    # print(repr(book.get('description')))
+    # continue
+
+    # title_link = book.find_class('bookTitle')[0]
+    url = book['url']
+    # Drop the query string. partition() leaves a URL without '?' intact,
+    # whereas slicing at find('?') == -1 would cut off its last character.
+    url = url.partition('?')[0]
+    if url in seen:
+        continue
+    seen.add(url)
+    title = book['title']
+    authors = book['authors']
+    first_authors.add(authors[0])
+    main_title = title
+    # for sep in ['(']:
+    for sep in ':', ' - ', '(', ',':
+        if sep in title:
+            main_title = title[:title.find(sep)]
+            break
+    # print((main_title + ' by ' + u', '.join(authors)).encode('utf-8'))
+    if len(main_title) < 10:
+        continue
+    if main_title in existing:
+        continue
+    # print(u'{} by {}'.format(main_title, authors[0]).encode('utf-8'))
+    print(u'{}'.format(main_title))
+    # print(main_title.encode('utf-8'))
+    assert url.startswith(start)
+
+    filename = 'books/' + url[len(start):] + '.html'
+    # print(filename)
+    if False and not os.path.exists(filename):
+        # Disabled download; the response body is bytes, hence binary mode.
+        with open(filename, 'wb') as out:
+            out.write(requests.get(url).content)
+        sleep(1)
+    books.append({
+        'dir': d,
+        'file': f[:-5],
+        'title': title,
+        'main_title': main_title,
+        'authors': authors,
+        'url': url,
+        'rating': book['rating'],
+        'cover': book['cover'],
+        'description': book.get('description'),
+    })
+
+page = template.render(books=books)
+# The template declares charset utf-8, so write the page in that encoding.
+with open('book_list.html', 'w', encoding='utf-8') as out:
+    out.write(page)
+sys.exit(0)
+
+for a in sorted(first_authors):
+    print(a)
+
+# authors = u' OR '.join(u'"{}"'.format(a) for a in sorted(first_authors) if a not in {'Hugh Howey', 'Elizabeth Moon', 'Max Hastings'})
+# print(authors.encode('utf-8'))
diff --git a/templates/books.html b/templates/books.html
new file mode 100644
--- /dev/null
+++ b/templates/books.html
@@ -0,0 +1,41 @@
+{# Renders one three-row table entry for each dict in books. #}
+<html>
+<meta charset="utf-8">
+<table>
+{% for book in books %}
+{% set url_title = ('"' + book.main_title.replace('.', ' ') + '"') | urlencode %}
+{% set title_and_author = ('"' + book.main_title + '" ' + book.authors.0) | urlencode %}
+<tr>
+<td rowspan="3"><img src="{{ book.cover }}" /></td>
+<td nowrap="nowrap">
+<a href="http://1337x.to/search/{{- title_and_author -}}/1/">1337x</a>
+{# | <a href="https://www.google.co.uk/search?q=site%3Aforum.mobilism.org+{{- title_and_author -}}">mobilism</a> #}
+| <a href="https://forum.mobilism.org/search.php?sr=topics&keywords={{- title_and_author -}}">mobilism</a>
+| <a href="http://en.booksee.org/s/?q={{- title_and_author -}}">booksee</a>
+| <a href="http://gen.lib.rus.ec/search.php?req={{- title_and_author -}}">Library Genesis</a>
+| <a href="http://libgen.io/foreignfiction/index.php?s={{- book.authors.0 | urlencode -}}">Library Genesis Fiction</a>
+</td>
+<td nowrap="nowrap">
+{{ book.dir }} / {{ book.file }}
+</td>
+<td nowrap="nowrap">
+{{ book.rating }}
+</td>
+</tr>
+<tr>
+<td colspan="3"><a href="{{ book.url }}">{{ book.title }}</a>
+by {% for a in book.authors %}{{ a -}}{% if not loop.last %}, {% endif %}{% endfor %}<br>
+</td>
+</tr>
+  <tr>
+    <td colspan="3">
+{% if book.description %}
+      {% for p in book.description.split('\xa0') %}
+        <p>{{ p }}</p>
+      {% endfor %}
+{% endif %}
+    </td>
+  </tr>
+{% endfor %}
+</table>
+</html>