# ===========================================================================
# get.py  (new file, mode 100755)
#
# Signs in to Goodreads, saves the recommendations index page, then downloads
# every page matched by ``re_recommend`` in that index.
# ===========================================================================
#!/usr/bin/python3

import requests
from http.cookiejar import LWPCookieJar
import os
import re
import lxml.html
from random import shuffle
from time import sleep

# NOTE(review): in the visible source this pattern is a single space with no
# capture groups, yet the main loop below unpacks three groups per match
# (a URL path, a directory and a file stem).  The real pattern appears to
# have been lost; it is kept as-is rather than guessed at.
re_recommend = re.compile(' ')

s = requests.Session()

cookie_dir = '/home/edward/lib/cookies'
cookie_file = os.path.join(cookie_dir, 'goodreads')

# Reuse cookies from a previous run so a still-valid login is picked up.
cj = LWPCookieJar(cookie_file)
if os.path.exists(cookie_file):
    # login() saves with ignore_discard=True; load with the same flag,
    # otherwise the session cookies written to disk are skipped on reload.
    cj.load(ignore_discard=True)
s.cookies = cj


def login():
    """Sign the shared session ``s`` in to Goodreads unless it already is.

    The sign-in page and the response to the sign-in POST are dumped to
    ``sign_in.html`` and ``signed_in.html`` in the current directory, and the
    cookie jar is written back to ``cookie_file`` after the POST.

    The password is read from the ``GOODREADS_PASSWORD`` environment variable
    so that it is not stored in the source tree.

    Raises:
        SystemExit: if a sign-in is needed and ``GOODREADS_PASSWORD`` is unset.
    """
    sign_in_page = 'https://www.goodreads.com/user/sign_in'
    page = s.get(sign_in_page).text
    with open('sign_in.html', 'w', encoding='utf-8') as out:
        out.write(page)
    if '"name":"Edward Betts"' in page:
        return  # already signed in

    # Only demand the password once we know a sign-in is really required.
    password = os.environ.get('GOODREADS_PASSWORD')
    if not password:
        raise SystemExit('GOODREADS_PASSWORD must be set to sign in to Goodreads')

    # NOTE(review): both patterns are empty in the visible source, so
    # ``group(1)`` below cannot succeed as written.  They presumably matched
    # hidden form fields on the sign-in page -- restore them from the
    # original file before running.
    re_token = re.compile('')
    re_n = re.compile("")

    token = re_token.search(page).group(1)

    data = {
        'utf8': '\u2713',
        'authenticity_token': token,
        'user[email]': 'edward@4angle.com',
        'user[password]': password,
        'remember_me': 1,
        'next': 'Sign in',
        'n': re_n.search(page).group(1),
    }

    print(token)
    print(data['n'])

    r = s.post(sign_in_page, data=data, headers={'referer': sign_in_page})

    with open('signed_in.html', 'w', encoding='utf-8') as out:
        out.write(r.text)

    # Print the text of the first element with class "flash", if any.
    root = lxml.html.fromstring(r.content)
    flash = root.find_class('flash')
    if flash:
        print('flash:', flash[0].text)

    cj.save(ignore_discard=True)


def get_index():
    """Fetch the recommendations list view and save it as ``recommendations.html``."""
    # url = 'https://www.goodreads.com/recommendations'
    url = 'https://www.goodreads.com/recommendations/?recs_current_view=list'

    r = s.get(url)
    with open('recommendations.html', 'w', encoding='utf-8') as out:
        out.write(r.text)


def get_individual():
    """Yield the ``re_recommend`` groups for each matching index line.

    Reads ``recommendations.html`` (as written by ``get_index``) and only
    considers lines that contain ``actionLinkLite``.
    """
    with open('recommendations.html', encoding='utf-8') as index:
        for line in index:
            if 'actionLinkLite' not in line:
                continue
            m = re_recommend.match(line)
            if m:
                yield m.groups()


# art = 'https://www.goodreads.com/recommendations/genre/art'
login()
get_index()
recommend_list = list(get_individual())
shuffle(recommend_list)

headers = {'Accept': 'text/html'}

# Each entry is unpacked as (URL path, output directory, output file stem).
for a, b, c in recommend_list:
    print((b, c))
    url = 'https://www.goodreads.com' + a

    r = s.get(url, headers=headers)
    filename = os.path.join(b, c + '.html')
    # parse.py parses these files as UTF-8, so write them as UTF-8.
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(r.text)
    sleep(0.5)  # pause between requests


# ===========================================================================
# parse.py  (new file, mode 100755)
#
# Parses the pages saved by get.py.  Only the beginning of this file is
# visible; see the note at the end of iter_books.
# ===========================================================================
#!/usr/bin/python3
import os
import re
import lxml.html
import jinja2
import json
import sys
import requests
from time import sleep
from pprint import pprint

parser = lxml.html.HTMLParser(encoding='utf-8')

env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates'))

# Captures the body of the refreshGroupBox(group_id, book_id) function.
# (An earlier assignment with unescaped parentheses was immediately
# overwritten by this one and has been dropped.)
re_book = re.compile(r'function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}', re.DOTALL)

re_tip = re.compile(br'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')

dirs = ['shelf', 'genre']
start = 'https://www.goodreads.com/book/show/'

# Titles listed in calibre_book_list.json.
with open('calibre_book_list.json') as _book_list:
    existing = {book['title'] for book in json.load(_book_list)}


def iter_books():
    """Yield ``(directory, filename, book)`` for each book found in ``dirs``.

    Every file in each directory of ``dirs`` is parsed as HTML, and one
    ``book`` dict is produced per element with class ``bookInformation``.

    NOTE(review): the source reached review with its indentation stripped and
    its tail cut off.  The nesting below -- in particular the ``continue``
    sitting at file-loop level, which makes the ``newTip`` scan unreachable --
    is the most plausible reading, not a certainty.  Confirm against the
    original file.
    """
    for d in dirs:
        for f in sorted(os.listdir(d)):
            filename = os.path.join(d, f)
            root = lxml.html.parse(filename, parser=parser).getroot()
            for div in root.find_class('bookInformation'):
                link = div.find('.//a')
                rating = div.find('.//span[@class="minirating"]')
                description = div.find('./div[@class="bookDescription"]')
                # The rating text is the tail of the first child of the span.
                r = rating[0].tail.strip()

                # The cover image is looked up in the element that follows
                # this one.
                cover = div.getnext().find('.//img').get('src')

                book = {
                    'title': link.text,
                    'url': link.get('href'),
                    'rating': r,
                    'r': float(r[:3]),  # first three characters as a number
                    'cover': cover,
                    'authors': [a.text for a in div.find_class('authorName')],
                }
                if description is not None:
                    # Use the second child when there are exactly three
                    # children, otherwise the first.
                    index = 1 if len(description) == 3 else 0
                    book['description'] = description[index].text
                yield d, f, book
            continue

            # print(filename)
            for line in open(filename, 'rb'):
                if b'var newTip' not in line:
                    continue
                print(line)
                m = re_tip.search(line)
                tip = m.group(1).decode('unicode_escape').replace('\\/', '/')
                # tip = m.group(1) # .replace('\/', '/')
                # print(tip)
                # NOTE(review): the visible source stops here, in the middle
                # of a statement beginning `if '`.  The remainder of
                # iter_books and of parse.py is not shown and has
                # deliberately not been reconstructed.