diff --git a/apiserver/feed.py b/apiserver/feed.py
index 60db037..780d826 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -7,19 +7,20 @@ import requests
 import time
 from bs4 import BeautifulSoup
 
-from feeds import hackernews, reddit, tildes, manual
+from feeds import hackernews, reddit, tildes, webworm, manual
 
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 READ_API = 'http://127.0.0.1:33843'
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
-TWO_DAYS = 60*60*24*2
+TWO_DAYS = 60*60*24*10
 
 def list():
     feed = []
-    feed += [(x, 'hackernews') for x in hackernews.feed()[:15]]
-    feed += [(x, 'reddit') for x in reddit.feed()[:10]]
-    feed += [(x, 'tildes') for x in tildes.feed()[:5]]
+    feed += [(x, 'reddit') for x in reddit.feed()[:15]]
+    feed += [(x, 'webworm') for x in webworm.feed()[:15]]
+    feed += [(x, 'tildes') for x in tildes.feed()[:10]]
+    feed += [(x, 'hackernews') for x in hackernews.feed()[:10]]
     return feed
 
 def get_article(url):
@@ -77,6 +78,8 @@ def update_story(story, is_manual=False):
         res = reddit.story(story['ref'])
     elif story['source'] == 'tildes':
         res = tildes.story(story['ref'])
+    elif story['source'] == 'webworm':
+        res = webworm.story(story['ref'])
     elif story['source'] == 'manual':
         res = manual.story(story['ref'])
 
diff --git a/apiserver/feeds/reddit.py b/apiserver/feeds/reddit.py
index 9b754cd..d7baf0b 100644
--- a/apiserver/feeds/reddit.py
+++ b/apiserver/feeds/reddit.py
@@ -14,7 +14,7 @@ from prawcore.exceptions import PrawcoreException
 
 from utils import render_md, clean
 
-SUBREDDITS = 'Economics+AcademicPhilosophy+DepthHub+Foodforthought+HistoryofIdeas+LaymanJournals+PhilosophyofScience+PoliticsPDFs+Scholar+StateOfTheUnion+TheAgora+TrueFilm+TrueReddit+UniversityofReddit+culturalstudies+hardscience+indepthsports+indepthstories+ludology+neurophilosophy+resilientcommunities+worldevents'
+SUBREDDITS = 'newzealand'
 
 SITE_LINK = lambda x : 'https://old.reddit.com{}'.format(x)
 SITE_AUTHOR_LINK = lambda x : 'https://old.reddit.com/u/{}'.format(x)
diff --git a/apiserver/feeds/webworm.py b/apiserver/feeds/webworm.py
new file mode 100644
index 0000000..51db9b0
--- /dev/null
+++ b/apiserver/feeds/webworm.py
@@ -0,0 +1,114 @@
+import logging
+logging.basicConfig(
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+import requests
+
+from utils import clean
+
+WEBWORM_DOMAIN = "https://www.webworm.co"
+
+API_STORIES = lambda x: f'{WEBWORM_DOMAIN}/api/v1/archive?sort=new&search=&offset=0&limit=100'
+#API_ITEM = lambda x : f'https://hn.algolia.com/api/v1/items/{x}'
+API_ITEM_COMMENTS = lambda x: f"{WEBWORM_DOMAIN}/api/v1/post/{x}/comments?all_comments=true&sort=best_first"
+
+SITE_LINK = lambda x: f'{WEBWORM_DOMAIN}/p/{x}'
+SITE_AUTHOR_LINK = lambda x : f'{WEBWORM_DOMAIN}/people/{x}'
+
+def api(route, ref=None):
+    try:
+        r = requests.get(route(ref), timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting Substack API: {}, trying again'.format(str(e)))
+
+    try:
+        r = requests.get(route(ref), timeout=15)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting Substack API: {}'.format(str(e)))
+        return False
+
+def feed():
+    # api() returns False on failure, so guard before filtering
+    stories = api(API_STORIES) or []
+    stories = list(filter(None, [None if i.get("audience") == "only_paid" else i for i in stories]))
+    return [str(i.get("id")) for i in stories]
+
+def bylines(b):
+    if 'id' not in b:
+        return None
+    a = {}
+    a['name'] = b.get('name')
+    a['link'] = SITE_AUTHOR_LINK(b.get('id'))
+    return a
+
+def comment(i):
+    if 'body' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('name', '')
+    c['score'] = i.get('reactions').get('❤')
+    c['date'] = i.get('created_at_i', 0)
+    c['text'] = clean(i.get('body', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    alive = 1 if i['author'] else 0
+    return sum([comment_count(c) for c in i['comments']]) + alive
+
+def story(ref):
+    stories = api(API_STORIES) or []
+    stories = list(filter(None, [None if i.get("audience") == "only_paid" else i for i in stories]))
+    stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
+
+    if len(stories) == 0:
+        print("no items")
+        return False
+
+    r = stories[0]
+    if not r:
+        print("not r")
+        return False
+
+    s = {}
+    authors = list(filter(None, [bylines(byline) for byline in r.get('publishedBylines')]))
+    s['author'] = ''
+    s['author_link'] = ''
+    if len(authors):
+        s['author'] = authors[0].get('name')
+        s['author_link'] = authors[0].get('link')
+    s['score'] = r.get('reactions').get('❤')
+    s['date'] = r.get('post_date', 0)
+    s['title'] = r.get('title', '')
+    s['link'] = r.get('canonical_url', '')
+    s['url'] = r.get('canonical_url', '')
+    s['comments'] = [comment(i) for i in api(API_ITEM_COMMENTS, r.get('id')) or []]
+    s['comments'] = list(filter(bool, s['comments']))
+    s['num_comments'] = r.get('comment_count', 0)
+
+    if 'text' in r and r['text']:
+        s['text'] = clean(r['text'] or '')
+
+    return s
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    stories = feed()
+    print(stories)
+    print(story(stories[0]))
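
For reference, a rough sketch of the /api/v1/archive item shape the new parser assumes, inferred only from the keys that feed(), bylines(), and story() read above. The key names come from the code; the values are invented placeholders, and the real Substack payload carries many more fields than shown here.

    # Hypothetical archive item, reconstructed from the keys webworm.py accesses.
    # Values are placeholders, not captured API output.
    example_archive_item = {
        'id': 123456,                    # stringified and used as the story ref
        'audience': 'everyone',          # 'only_paid' posts are filtered out of the feed
        'title': 'Example post',
        'post_date': '2020-12-01T00:00:00.000Z',
        'canonical_url': 'https://www.webworm.co/p/example-post',
        'comment_count': 4,
        'reactions': {'❤': 10},          # the heart count becomes the story score
        'publishedBylines': [            # passed through bylines() for author info
            {'id': 98765, 'name': 'Example Author'},
        ],
    }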