Render reddit markdown, poll tildes better, add utils
apiserver/feed.py
@@ -1,7 +1,7 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.INFO)
+        level=logging.DEBUG)
 
 import requests
 import time
@@ -13,9 +13,9 @@ READ_API = 'http://127.0.0.1:33843'
 
 def list():
     feed = []
-    feed += [(x, 'hackernews') for x in hackernews.feed()]
-    feed += [(x, 'reddit') for x in reddit.feed()]
-    feed += [(x, 'tildes') for x in tildes.feed()]
+    feed += [(x, 'hackernews') for x in hackernews.feed()[:10]]
+    feed += [(x, 'reddit') for x in reddit.feed()[:5]]
+    feed += [(x, 'tildes') for x in tildes.feed()[:5]]
     return feed
 
 def get_article(url):
@@ -69,3 +69,13 @@ def update_story(story):
             story['text'] = get_article(story['url'])
         else:
             story['text'] = '<p>Unsupported article type.</p>'
+
+if __name__ == '__main__':
+    test_news_cache = {}
+    nid = 'jean'
+    ref = 20802050
+    source = 'hackernews'
+    test_news_cache[nid] = dict(id=nid, ref=ref, source=source)
+    news_story = test_news_cache[nid]
+    update_story(news_story)
+    print('done')

apiserver/hackernews.py
@@ -1,7 +1,7 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.INFO)
+        level=logging.DEBUG)
 
 import requests
 
@@ -15,14 +15,14 @@ def api(route, ref=None):
     try:
         r = requests.get(route(ref), timeout=5)
         if r.status_code != 200:
-            raise
+            raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
     except BaseException as e:
         logging.error('Problem hitting hackernews API: {}'.format(str(e)))
         return False
 
 def feed():
-    return api(API_TOPSTORIES)[:30] or []
+    return api(API_TOPSTORIES) or []
 
 def comment(i):
     c = {}
@@ -62,6 +62,8 @@ def story(ref):
 
     return s
 
+# scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
-    print(feed())
-    print(story(20763961))
+    #print(feed())
+    #print(story(20763961))
+    print(story(20802050))

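One detail worth calling out in the hunk above: a bare raise with no active exception doesn't re-raise anything, it triggers "RuntimeError: No active exception to reraise", so the old log line never mentioned the actual response. The new explicit Exception carries the status code. A minimal sketch of the before/after logging (the status code here is made up):

import logging

def old_check(status_code):
    try:
        if status_code != 200:
            raise  # no active exception at this point
    except BaseException as e:
        logging.error('Problem hitting hackernews API: {}'.format(str(e)))
        # logs: 'No active exception to reraise' -- says nothing about the response

def new_check(status_code):
    try:
        if status_code != 200:
            raise Exception('Bad response code ' + str(status_code))
    except BaseException as e:
        logging.error('Problem hitting hackernews API: {}'.format(str(e)))
        # logs: 'Bad response code 503' -- the actual failure is visible

old_check(503)
new_check(503)
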
apiserver/reddit.py
@@ -1,11 +1,17 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.INFO)
+        level=logging.DEBUG)
 
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
 import praw
 from praw.models import MoreComments
 
+from utils import render_md
+
 SUBREDDITS = 'Economics+Foodforthought+Futurology+TrueReddit+business+science+technology'
 
 SITE_LINK = lambda x : 'https://old.reddit.com/{}'.format(x)
@@ -14,7 +20,7 @@ SITE_AUTHOR_LINK = lambda x : 'https://old.reddit.com/u/{}'.format(x)
 reddit = praw.Reddit('bot')
 
 def feed():
-    return [x.id for x in reddit.subreddit(SUBREDDITS).hot(limit=30)]
+    return [x.id for x in reddit.subreddit(SUBREDDITS).hot()]
 
 def good_comment(c):
     if isinstance(c, MoreComments):
@@ -30,7 +36,7 @@ def comment(i):
     c['author'] = i.author.name if i.author else '[Deleted]'
     c['score'] = i.score
     c['date'] = i.created_utc
-    c['text'] = i.body.replace('\n', '<br />')
+    c['text'] = render_md(i.body)
     c['comments'] = [comment(j) for j in i.replies if good_comment(j)]
     return c
 
@@ -50,10 +56,11 @@ def story(ref):
     s['num_comments'] = r.num_comments
 
     if r.selftext:
-        s['text'] = r.selftext
+        s['text'] = render_md(r.selftext)
 
     return s
 
+# scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print(feed())
     print(reddit.submission(feed()[0]).permalink)

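A quick note on the "render reddit markdown" part: the old code only turned newlines into <br /> tags and left Markdown syntax untouched, while render_md produces full HTML. A minimal sketch of the difference, using the commonmark package that utils.py imports (the sample string is made up):

import commonmark

body = 'Some *emphasis* and a [link](https://example.com)\n\nSecond paragraph.'

# Old behaviour: newlines become <br />, Markdown syntax is left as-is.
print(body.replace('\n', '<br />'))

# New behaviour: full CommonMark rendering to HTML.
print(commonmark.commonmark(body))
# <p>Some <em>emphasis</em> and a <a href="https://example.com">link</a></p>
# <p>Second paragraph.</p>
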
apiserver/tildes.py
@@ -1,35 +1,40 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.INFO)
+        level=logging.DEBUG)
 
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
 
+# cache the topic groups to prevent redirects
+group_lookup = {}
+
 USER_AGENT = 'qotnews scraper (github:tannercollin)'
 
-API_TOPSTORIES = lambda x: 'https://tildes.net'
-API_ITEM = lambda x : 'https://tildes.net/~qotnews/{}/'.format(x)
+API_TOPSTORIES = lambda : 'https://tildes.net'
+API_ITEM = lambda x : 'https://tildes.net/shortener/{}'.format(x)
 
-SITE_LINK = lambda x : 'https://tildes.net/~qotnews/{}/'.format(x)
+SITE_LINK = lambda group, ref : 'https://tildes.net/{}/{}'.format(group, ref)
 SITE_AUTHOR_LINK = lambda x : 'https://tildes.net/user/{}'.format(x)
 
-def api(route, ref=None):
+def api(route):
     try:
         headers = {'User-Agent': USER_AGENT}
-        r = requests.get(route(ref), headers=headers, timeout=5)
+        r = requests.get(route, headers=headers, timeout=5)
         if r.status_code != 200:
-            raise
+            raise Exception('Bad response code ' + str(r.status_code))
         return r.text
     except BaseException as e:
         logging.error('Problem hitting tildes website: {}'.format(str(e)))
         return False
 
 def feed():
-    soup = BeautifulSoup(api(API_TOPSTORIES), features='html.parser')
+    html = api(API_TOPSTORIES())
+    if not html: return []
+    soup = BeautifulSoup(html, features='html.parser')
     articles = soup.find('ol', class_='topic-listing').findAll('article')
-    return [x['id'].split('-')[1] for x in articles][:30] or []
+    return [x['id'].split('-')[1] for x in articles] or []
 
 def unix(date_str):
     return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())
@@ -46,9 +51,14 @@ def comment(i):
     return c
 
 def story(ref):
-    html = api(API_ITEM, ref)
+    if ref in group_lookup:
+        html = api(SITE_LINK(group_lookup[ref], ref))
+    else:
+        html = api(API_ITEM(ref))
     if not html: return False
 
+    if 'Topic deleted by author' in html: return False
+
     soup = BeautifulSoup(html, features='html.parser')
     a = soup.find('article', class_='topic-full')
     h = a.find('header')
@@ -59,7 +69,9 @@ def story(ref):
     s['score'] = int(h.find('span', class_='topic-voting-votes').string)
     s['date'] = unix(h.find('time')['datetime'])
     s['title'] = str(h.h1.string)
-    s['link'] = SITE_LINK(ref)
+    s['group'] = str(soup.find('a', class_='site-header-context').string)
+    group_lookup[ref] = s['group']
+    s['link'] = SITE_LINK(s['group'], ref)
     ud = a.find('div', class_='topic-full-link')
     s['url'] = ud.a['href'] if ud else s['link']
     sc = a.find('ol', id='comments')
@@ -73,6 +85,7 @@ def story(ref):
 
     return s
 
+# scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print(feed())
     normal = story('gxt')

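On the "poll tildes better" part: a Tildes topic lives at https://tildes.net/<group>/<ref>, and the shortener URL answers with a redirect to that canonical page, which is why the new group_lookup cache avoids a round trip on repeat polls. A rough illustration of the redirect being skipped (the ref, the ~comp group, and the exact redirect status code are assumptions):

import requests

headers = {'User-Agent': 'qotnews scraper (github:tannercollin)'}

# First fetch: only the ref is known, so go through the shortener redirect.
r = requests.get('https://tildes.net/shortener/gxt',
                 headers=headers, timeout=5, allow_redirects=False)
print(r.status_code, r.headers.get('Location'))  # e.g. 302 /~comp/gxt/...

# Later fetches: with the group cached, hit the canonical URL directly.
group_lookup = {'gxt': '~comp'}
r = requests.get('https://tildes.net/{}/{}'.format(group_lookup['gxt'], 'gxt'),
                 headers=headers, timeout=5)
print(r.status_code)  # 200, no redirect hop
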
apiserver/server.py
@@ -6,11 +6,10 @@ logging.basicConfig(
 import copy
 import threading
 import time
-import random
 import shelve
-import string
 
 import feed
+from utils import gen_rand_id
 
 from flask import abort, Flask, request
 from flask_cors import CORS
@@ -26,6 +25,7 @@ with shelve.open(DATA_FILE) as db:
     news_ref_to_id = db.get('news_ref_to_id', {})
     news_cache = db.get('news_cache', {})
 
+
 flask_app = Flask(__name__)
 cors = CORS(flask_app)
 
@@ -54,9 +54,6 @@ web_thread = threading.Thread(target=flask_app.run, kwargs={'port': 33842})
 web_thread.setDaemon(True)
 web_thread.start()
 
-def gen_rand_id():
-    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
-
 def new_id():
     nid = gen_rand_id()
     with shelve.open(DATA_FILE) as db:
@@ -92,7 +89,7 @@ try:
            news_story = news_cache[update_id]
            feed.update_story(news_story)
 
-        time.sleep(1)
+        time.sleep(3)
 
         news_index += 1
         if news_index == CACHE_LENGTH: news_index = 0

apiserver/utils.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import commonmark
+import random
+import string
+
+def gen_rand_id():
+    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
+
+def render_md(md):
+    if md:
+        return commonmark.commonmark(md)
+    else:
+        return ''

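For reference, a quick sketch of how the two new helpers behave, assuming it's run from the apiserver directory so utils is importable (the four-letter ID is random, so the example value is illustrative):

from utils import gen_rand_id, render_md

print(gen_rand_id())           # e.g. 'KQZP' -- four random uppercase letters
print(render_md('**bold**'))   # '<p><strong>bold</strong></p>\n'
print(repr(render_md(None)))   # '' -- falsy input renders to an empty string
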