Merge remote-tracking branch 'tanner/master'

This commit is contained in:
Jason Schwarzenberger 2020-12-16 11:31:01 +13:00
commit a6e1644ddf
8 changed files with 136 additions and 11 deletions

View File

@ -109,4 +109,5 @@ settings.py
data.db data.db
data.db.bak data.db.bak
data/archive/* data/archive/*
data/backup/*
qotnews.sqlite qotnews.sqlite

View File

@ -5,7 +5,7 @@ from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError from sqlalchemy.exc import IntegrityError
from sqlalchemy.types import JSON from sqlalchemy.types import JSON
engine = create_engine('sqlite:///data/qotnews.sqlite') engine = create_engine('sqlite:///data/qotnews.sqlite', connect_args={'timeout': 120})
Session = sessionmaker(bind=engine) Session = sessionmaker(bind=engine)
Base = declarative_base() Base = declarative_base()

View File

@ -9,13 +9,13 @@ from bs4 import BeautifulSoup
import itertools import itertools
import settings import settings
from feeds import hackernews, reddit, tildes, substack, manual from feeds import hackernews, reddit, tildes, substack, manual, lobsters
from feeds.sitemap import Sitemap from feeds.sitemap import Sitemap
from feeds.category import Category from feeds.category import Category
from scrapers import outline from scrapers import outline
from scrapers.declutter import declutter, headless, simple from scrapers.declutter import declutter, headless, simple
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
substacks = {} substacks = {}
for key, value in settings.SUBSTACK.items(): for key, value in settings.SUBSTACK.items():
@ -33,6 +33,9 @@ def get_list():
if settings.NUM_HACKERNEWS: if settings.NUM_HACKERNEWS:
feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]] feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_LOBSTERS:
feed += [(x, 'lobsters', x) for x in lobsters.feed()[:settings.NUM_LOBSTERS]]
if settings.NUM_REDDIT: if settings.NUM_REDDIT:
feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]] feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]
@ -107,6 +110,8 @@ def update_story(story, is_manual=False, urlref=None):
if story['source'] == 'hackernews': if story['source'] == 'hackernews':
res = hackernews.story(story['ref']) res = hackernews.story(story['ref'])
elif story['source'] == 'lobsters':
res = lobsters.story(story['ref'])
elif story['source'] == 'reddit': elif story['source'] == 'reddit':
res = reddit.story(story['ref']) res = reddit.story(story['ref'])
elif story['source'] == 'tildes': elif story['source'] == 'tildes':

113
apiserver/feeds/lobsters.py Normal file
View File

@ -0,0 +1,113 @@
import logging

# Root logger is configured at import time: DEBUG level, timestamped records.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

if __name__ == '__main__':
    # When run directly as a script, put the current directory first on the
    # import path before the `utils` import below is attempted.
    import sys
    sys.path.insert(0, '.')

import requests
from datetime import datetime

from utils import clean
def API_HOTTEST(x):
    """Return the URL of the hottest-stories JSON listing (x is ignored)."""
    return 'https://lobste.rs/hottest.json'


def API_ITEM(x):
    """Return the JSON API URL for the story whose short id is x."""
    return 'https://lobste.rs/s/{}.json'.format(x)


def SITE_LINK(x):
    """Return the site URL for the story whose short id is x."""
    return 'https://lobste.rs/s/{}'.format(x)


def SITE_AUTHOR_LINK(x):
    """Return the site profile URL for the user named x."""
    return 'https://lobste.rs/u/{}'.format(x)

def api(route, ref=None):
    """Fetch and decode a lobste.rs JSON endpoint, retrying once on failure.

    route is a callable that is given ref and returns the URL to request.
    The first attempt uses a 5 second timeout; if it fails for any reason
    (network error, non-200 status, undecodable body) a second attempt is
    made with a 15 second timeout.

    Returns the decoded JSON body on success, or False if both attempts fail.
    """
    # (timeout in seconds, log message template) for each attempt, in order.
    attempts = (
        (5, 'Problem hitting lobsters API: {}, trying again'),
        (15, 'Problem hitting lobsters API: {}'),
    )

    for timeout, message in attempts:
        try:
            r = requests.get(route(ref), timeout=timeout)
            if r.status_code != 200:
                raise Exception('Bad response code ' + str(r.status_code))
            return r.json()
        except Exception as e:
            # Catch Exception rather than BaseException so KeyboardInterrupt
            # and SystemExit propagate instead of being logged and swallowed.
            logging.error(message.format(str(e)))

    return False

def feed():
    """Return the short ids of the stories in the hottest listing.

    Yields an empty list when the API call fails or returns nothing.
    """
    hottest = api(API_HOTTEST)
    if not hottest:
        return []
    return [entry['short_id'] for entry in hottest]

def unix(date_str):
    """Convert a timestamp string to whole seconds since the Unix epoch.

    date_str must match '%Y-%m-%dT%H:%M:%S.%f%z' (fractional seconds and a
    UTC offset are required); the fractional part is truncated.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%f%z'
    moment = datetime.strptime(date_str, timestamp_format)
    epoch_seconds = moment.timestamp()
    return int(epoch_seconds)

def make_comment(i):
    """Build a normalised comment dict from one raw API comment.

    The result has the keys author, score, date, text and comments (in that
    order); comments starts out as an empty list for replies to be added to.
    Missing author or creation time fall back to '' and 0 respectively.
    """
    try:
        author = i['commenting_user']['username']
    except KeyError:
        author = ''

    try:
        posted = unix(i['created_at'])
    except KeyError:
        posted = 0

    # A null/absent comment body is cleaned as the empty string.
    body = i.get('comment', '') or ''

    return {
        'author': author,
        'score': i.get('score', 0),
        'date': posted,
        'text': clean(body),
        'comments': [],
    }

def iter_comments(flat_comments):
    """Turn a flat, ordered comment list into a nested reply tree.

    Each raw comment carries an 'indent_level'; level 1 marks a top-level
    comment and deeper levels are attached to the most recent comment one
    level above. Returns the list of top-level comment dicts.
    """
    roots = []
    # ancestors[k] is the latest converted comment seen at level k + 1.
    ancestors = []

    for raw in flat_comments:
        node = make_comment(raw)
        level = raw['indent_level']

        if level != 1:
            # Drop anything at this level or deeper, then hang the node off
            # the nearest remaining ancestor.
            ancestors = ancestors[:level - 1]
            ancestors[-1]['comments'].append(node)
            ancestors.append(node)
            continue

        roots.append(node)
        ancestors = [node]

    return roots

def story(ref):
    """Fetch one story by short id and return it as a normalised dict.

    Returns False when the API call fails. Otherwise the dict holds author,
    author_link, score, date, title, link, url, comments, num_comments and,
    only when the story has a non-empty description, text.
    """
    r = api(API_ITEM, ref)
    if not r:
        return False

    try:
        author = r['submitter_user']['username']
        author_link = SITE_AUTHOR_LINK(author)
    except KeyError:
        author = ''
        author_link = ''

    try:
        submitted = unix(r['created_at'])
    except KeyError:
        submitted = 0

    s = {
        'author': author,
        'author_link': author_link,
        'score': r.get('score', 0),
        'date': submitted,
        'title': r.get('title', ''),
        'link': SITE_LINK(ref),
        'url': r.get('url', ''),
        'comments': iter_comments(r['comments']),
        'num_comments': r['comment_count'],
    }

    # The text key is only present for stories with a non-empty description.
    description = r.get('description')
    if description:
        s['text'] = clean(description)

    return s

# Developer scratchpad: run this module directly to dump one parsed story
# as JSON while working on the parser.
if __name__ == '__main__':
    import json

    #print(feed())
    sample = story('fzvd1v')
    print(json.dumps(sample))
    #print(story(20802050))

View File

@ -72,6 +72,9 @@ def submit():
elif 'tildes.net' in parse.hostname and '~' in url: elif 'tildes.net' in parse.hostname and '~' in url:
source = 'tildes' source = 'tildes'
ref = parse.path.split('/')[2] ref = parse.path.split('/')[2]
elif 'lobste.rs' in parse.hostname and '/s/' in url:
source = 'lobsters'
ref = parse.path.split('/')[2]
elif 'reddit.com' in parse.hostname and 'comments' in url: elif 'reddit.com' in parse.hostname and 'comments' in url:
source = 'reddit' source = 'reddit'
ref = parse.path.split('/')[4] ref = parse.path.split('/')[4]
@ -120,7 +123,7 @@ def index():
return render_template('index.html', return render_template('index.html',
title='Feed', title='Feed',
url=settings.HOSTNAME, url=settings.HOSTNAME,
description='Reddit, Hacker News, and Tildes combined, then pre-rendered in reader mode') description='Hacker News, Reddit, Lobsters, and Tildes articles rendered in reader mode')
@flask_app.route('/<sid>', strict_slashes=False) @flask_app.route('/<sid>', strict_slashes=False)
@flask_app.route('/<sid>/c', strict_slashes=False) @flask_app.route('/<sid>/c', strict_slashes=False)

View File

@ -13,6 +13,7 @@ HEADLESS_READER_PORT = 33843
# Number of top items from each site to pull # Number of top items from each site to pull
# set to 0 to disable that site # set to 0 to disable that site
NUM_HACKERNEWS = 15 NUM_HACKERNEWS = 15
NUM_LOBSTERS = 10
NUM_REDDIT = 10 NUM_REDDIT = 10
NUM_TILDES = 5 NUM_TILDES = 5
NUM_SUBSTACK = 10 NUM_SUBSTACK = 10
@ -74,8 +75,6 @@ SUBREDDITS = [
'HistoryofIdeas', 'HistoryofIdeas',
'LaymanJournals', 'LaymanJournals',
'PhilosophyofScience', 'PhilosophyofScience',
'PoliticsPDFs',
'Scholar',
'StateOfTheUnion', 'StateOfTheUnion',
'TheAgora', 'TheAgora',
'TrueFilm', 'TrueFilm',
@ -89,4 +88,7 @@ SUBREDDITS = [
'neurophilosophy', 'neurophilosophy',
'resilientcommunities', 'resilientcommunities',
'worldevents', 'worldevents',
'StallmanWasRight',
'DarkFuturology',
'EverythingScience',
] ]

View File

@ -66,7 +66,7 @@ class App extends React.Component {
<Link to='/'>QotNews - Feed</Link> <Link to='/'>QotNews - Feed</Link>
<span className='theme'>Theme: <a href='#' onClick={() => this.light()}>Light</a> - <a href='#' onClick={() => this.dark()}>Dark</a></span> <span className='theme'>Theme: <a href='#' onClick={() => this.light()}>Light</a> - <a href='#' onClick={() => this.dark()}>Dark</a></span>
<br /> <br />
<span className='slogan'>Reddit, Hacker News, and Tildes combined, then pre-rendered in reader mode.</span> <span className='slogan'>Hacker News, Reddit, Lobsters, and Tildes articles rendered in reader mode.</span>
</p> </p>
<Route path='/(|search)' component={Search} /> <Route path='/(|search)' component={Search} />
<Route path='/(|search)' component={Submit} /> <Route path='/(|search)' component={Submit} />

File diff suppressed because one or more lines are too long