From 23cdbc92924a4ec01a26669e2645dd010e7712f2 Mon Sep 17 00:00:00 2001
From: Tanner Collin
Date: Wed, 28 Aug 2019 04:13:02 +0000
Subject: [PATCH] Render reddit markdown, poll tildes better, add utils

---
 apiserver/feed.py             | 18 ++++++++++++++----
 apiserver/feeds/hackernews.py | 12 +++++++-----
 apiserver/feeds/reddit.py     | 15 +++++++++++----
 apiserver/feeds/tildes.py     | 35 ++++++++++++++++++++++++-----------
 apiserver/server.py           |  9 +++------
 apiserver/utils.py            | 17 +++++++++++++++++
 6 files changed, 76 insertions(+), 30 deletions(-)
 create mode 100644 apiserver/utils.py

diff --git a/apiserver/feed.py b/apiserver/feed.py
index 03a193b..7a42ac9 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -1,7 +1,7 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.INFO)
+        level=logging.DEBUG)
 
 import requests
 import time
@@ -13,9 +13,9 @@ READ_API = 'http://127.0.0.1:33843'
 
 def list():
     feed = []
-    feed += [(x, 'hackernews') for x in hackernews.feed()]
-    feed += [(x, 'reddit') for x in reddit.feed()]
-    feed += [(x, 'tildes') for x in tildes.feed()]
+    feed += [(x, 'hackernews') for x in hackernews.feed()[:10]]
+    feed += [(x, 'reddit') for x in reddit.feed()[:5]]
+    feed += [(x, 'tildes') for x in tildes.feed()[:5]]
     return feed
 
 def get_article(url):
@@ -69,3 +69,13 @@ def update_story(story):
         story['text'] = get_article(story['url'])
     else:
         story['text'] = '<p>Unsupported article type.</p>'
+
+if __name__ == '__main__':
+    test_news_cache = {}
+    nid = 'jean'
+    ref = 20802050
+    source = 'hackernews'
+    test_news_cache[nid] = dict(id=nid, ref=ref, source=source)
+    news_story = test_news_cache[nid]
+    update_story(news_story)
+    print('done')
diff --git a/apiserver/feeds/hackernews.py b/apiserver/feeds/hackernews.py
index 712f940..74b9cb5 100644
--- a/apiserver/feeds/hackernews.py
+++ b/apiserver/feeds/hackernews.py
@@ -1,7 +1,7 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.INFO)
+        level=logging.DEBUG)
 
 import requests
 
@@ -15,14 +15,14 @@ def api(route, ref=None):
     try:
         r = requests.get(route(ref), timeout=5)
         if r.status_code != 200:
-            raise
+            raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
     except BaseException as e:
         logging.error('Problem hitting hackernews API: {}'.format(str(e)))
         return False
 
 def feed():
-    return api(API_TOPSTORIES)[:30] or []
+    return api(API_TOPSTORIES) or []
 
 def comment(i):
     c = {}
@@ -62,6 +62,8 @@ def story(ref):
 
     return s
 
+# scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
-    print(feed())
-    print(story(20763961))
+    #print(feed())
+    #print(story(20763961))
+    print(story(20802050))
diff --git a/apiserver/feeds/reddit.py b/apiserver/feeds/reddit.py
index 1a5ba5e..29f8aaf 100644
--- a/apiserver/feeds/reddit.py
+++ b/apiserver/feeds/reddit.py
@@ -1,11 +1,17 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.INFO)
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
 
 import praw
 from praw.models import MoreComments
 
+from utils import render_md
+
 SUBREDDITS = 'Economics+Foodforthought+Futurology+TrueReddit+business+science+technology'
 
 SITE_LINK = lambda x : 'https://old.reddit.com/{}'.format(x)
@@ -14,7 +20,7 @@ SITE_AUTHOR_LINK = lambda x : 'https://old.reddit.com/u/{}'.format(x)
 reddit = praw.Reddit('bot')
 
 def feed():
-    return [x.id for x in reddit.subreddit(SUBREDDITS).hot(limit=30)]
+    return [x.id for x in reddit.subreddit(SUBREDDITS).hot()]
 
 def good_comment(c):
     if isinstance(c, MoreComments):
@@ -30,7 +36,7 @@ def comment(i):
     c['author'] = i.author.name if i.author else '[Deleted]'
     c['score'] = i.score
     c['date'] = i.created_utc
-    c['text'] = i.body.replace('\n', '<br />')
+    c['text'] = render_md(i.body)
     c['comments'] = [comment(j) for j in i.replies if good_comment(j)]
     return c
 
@@ -50,10 +56,11 @@ def story(ref):
     s['num_comments'] = r.num_comments
 
     if r.selftext:
-        s['text'] = r.selftext
+        s['text'] = render_md(r.selftext)
 
     return s
 
+# scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print(feed())
     print(reddit.submission(feed()[0]).permalink)
diff --git a/apiserver/feeds/tildes.py b/apiserver/feeds/tildes.py
index c0c8b51..8154dbe 100644
--- a/apiserver/feeds/tildes.py
+++ b/apiserver/feeds/tildes.py
@@ -1,35 +1,40 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.INFO)
+        level=logging.DEBUG)
 
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
 
+# cache the topic groups to prevent redirects
+group_lookup = {}
+
 USER_AGENT = 'qotnews scraper (github:tannercollin)'
 
-API_TOPSTORIES = lambda x: 'https://tildes.net'
-API_ITEM = lambda x : 'https://tildes.net/~qotnews/{}/'.format(x)
+API_TOPSTORIES = lambda : 'https://tildes.net'
+API_ITEM = lambda x : 'https://tildes.net/shortener/{}'.format(x)
 
-SITE_LINK = lambda x : 'https://tildes.net/~qotnews/{}/'.format(x)
+SITE_LINK = lambda group, ref : 'https://tildes.net/{}/{}'.format(group, ref)
 SITE_AUTHOR_LINK = lambda x : 'https://tildes.net/user/{}'.format(x)
 
-def api(route, ref=None):
+def api(route):
     try:
         headers = {'User-Agent': USER_AGENT}
-        r = requests.get(route(ref), headers=headers, timeout=5)
+        r = requests.get(route, headers=headers, timeout=5)
         if r.status_code != 200:
-            raise
+            raise Exception('Bad response code ' + str(r.status_code))
         return r.text
     except BaseException as e:
         logging.error('Problem hitting tildes website: {}'.format(str(e)))
         return False
 
 def feed():
-    soup = BeautifulSoup(api(API_TOPSTORIES), features='html.parser')
+    html = api(API_TOPSTORIES())
+    if not html: return []
+    soup = BeautifulSoup(html, features='html.parser')
     articles = soup.find('ol', class_='topic-listing').findAll('article')
-    return [x['id'].split('-')[1] for x in articles][:30] or []
+    return [x['id'].split('-')[1] for x in articles] or []
 
 def unix(date_str):
     return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())
@@ -46,9 +51,14 @@ def comment(i):
     return c
 
 def story(ref):
-    html = api(API_ITEM, ref)
+    if ref in group_lookup:
+        html = api(SITE_LINK(group_lookup[ref], ref))
+    else:
+        html = api(API_ITEM(ref))
     if not html: return False
 
+    if 'Topic deleted by author' in html: return False
+
     soup = BeautifulSoup(html, features='html.parser')
     a = soup.find('article', class_='topic-full')
     h = a.find('header')
@@ -59,7 +69,9 @@ def story(ref):
     s['score'] = int(h.find('span', class_='topic-voting-votes').string)
     s['date'] = unix(h.find('time')['datetime'])
     s['title'] = str(h.h1.string)
-    s['link'] = SITE_LINK(ref)
+    s['group'] = str(soup.find('a', class_='site-header-context').string)
+    group_lookup[ref] = s['group']
+    s['link'] = SITE_LINK(s['group'], ref)
     ud = a.find('div', class_='topic-full-link')
     s['url'] = ud.a['href'] if ud else s['link']
     sc = a.find('ol', id='comments')
@@ -73,6 +85,7 @@ def story(ref):
 
     return s
 
+# scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print(feed())
     normal = story('gxt')
diff --git a/apiserver/server.py b/apiserver/server.py
index 7fc9141..7acb63b 100644
--- a/apiserver/server.py
+++ b/apiserver/server.py
@@ -6,11 +6,10 @@ logging.basicConfig(
 import copy
 import threading
 import time
-import random
 import shelve
-import string
 
 import feed
+from utils import gen_rand_id
 
 from flask import abort, Flask, request
 from flask_cors import CORS
@@ -26,6 +25,7 @@ with shelve.open(DATA_FILE) as db:
     news_ref_to_id = db.get('news_ref_to_id', {})
     news_cache = db.get('news_cache', {})
 
+
 flask_app = Flask(__name__)
 cors = CORS(flask_app)
 
@@ -54,9 +54,6 @@ web_thread = threading.Thread(target=flask_app.run, kwargs={'port': 33842})
 web_thread.setDaemon(True)
 web_thread.start()
 
-def gen_rand_id():
-    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
-
 def new_id():
     nid = gen_rand_id()
     with shelve.open(DATA_FILE) as db:
@@ -92,7 +89,7 @@ try:
         news_story = news_cache[update_id]
         feed.update_story(news_story)
 
-        time.sleep(1)
+        time.sleep(3)
 
         news_index += 1
         if news_index == CACHE_LENGTH: news_index = 0
diff --git a/apiserver/utils.py b/apiserver/utils.py
new file mode 100644
index 0000000..ac1cf59
--- /dev/null
+++ b/apiserver/utils.py
@@ -0,0 +1,17 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import commonmark
+import random
+import string
+
+def gen_rand_id():
+    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
+
+def render_md(md):
+    if md:
+        return commonmark.commonmark(md)
+    else:
+        return ''