Compare commits

...

5 Commits

Author SHA1 Message Date
Jason Schwarzenberger 9cee370a25 tvnz icon 4 years ago
Jason Schwarzenberger 5efc6ef2d3 add related stories (in api only) 4 years ago
Jason Schwarzenberger 4ec50e20cb feed thread loop. 4 years ago
Jason Schwarzenberger c1b7877f4b remove limit. 4 years ago
Jason Schwarzenberger 7b8cbfc9b9 try to make feed only determined by the max age. 4 years ago
  1. 21
      apiserver/database.py
  2. 30
      apiserver/feed.py
  3. 2
      apiserver/feeds/news.py
  4. 80
      apiserver/server.py
  5. 2
      apiserver/settings.py.example
  6. 1
      webclient/src/utils.js

@ -1,5 +1,4 @@
import json from datetime import datetime, timedelta
from sqlalchemy import create_engine, Column, String, ForeignKey, Integer from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
@ -66,18 +65,26 @@ def get_story_by_ref(ref):
session = Session() session = Session()
return session.query(Story).filter(Story.ref==ref).first() return session.query(Story).filter(Story.ref==ref).first()
def get_reflist(amount): def get_stories_by_url(url):
session = Session()
return session.query(Story).\
filter(Story.title != None).\
filter(Story.meta['url'].as_string() == url).\
order_by(Story.meta['date'].desc())
def get_reflist():
session = Session() session = Session()
q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount) q = session.query(Reflist).order_by(Reflist.rid.desc())
return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()] return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
def get_stories(amount): def get_stories(maxage=60*60*24*2):
time = datetime.now().timestamp() - maxage
session = Session() session = Session()
q = session.query(Reflist, Story.meta).\ q = session.query(Reflist, Story.meta).\
join(Story).\ join(Story).\
filter(Story.title != None).\ filter(Story.title != None).\
order_by(Story.meta['date'].desc()).\ filter(Story.meta['date'] > time).\
limit(amount) order_by(Story.meta['date'].desc())
return [x[1] for x in q] return [x[1] for x in q]
def put_ref(ref, sid, source): def put_ref(ref, sid, source):

@ -6,16 +6,13 @@ logging.basicConfig(
import requests import requests
import time import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import itertools
import settings import settings
from feeds import hackernews, reddit, tildes, substack, manual, news from feeds import hackernews, reddit, tildes, substack, manual, news
from scrapers import outline, declutter, local from scrapers import outline, declutter, local
ONE_HOUR = 60*60
ONE_DAY = 24*ONE_HOUR
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
MAX_AGE_IN_DAYS = 3*ONE_DAY
substacks = {} substacks = {}
for key, value in settings.SUBSTACK.items(): for key, value in settings.SUBSTACK.items():
@ -27,36 +24,39 @@ sitemaps = {}
for key, value in settings.SITEMAP.items(): for key, value in settings.SITEMAP.items():
sitemaps[key] = news.Sitemap(value['url'], value.get('tz')) sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
def list(): def get_list():
feed = [] feeds = {}
if settings.NUM_HACKERNEWS: if settings.NUM_HACKERNEWS:
feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]] feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_REDDIT: if settings.NUM_REDDIT:
feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]] feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
if settings.NUM_TILDES: if settings.NUM_TILDES:
feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]] feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK: if settings.NUM_SUBSTACK:
feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]] feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items(): for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count'] count = settings.SUBSTACK[key]['count']
feed += [(x, key) for x in publication.feed()[:count]] feeds[key] = [(x, key) for x in publication.feed()[:count]]
for key, sites in categories.items(): for key, sites in categories.items():
count = settings.CATEGORY[key].get('count') or 0 count = settings.CATEGORY[key].get('count') or 0
excludes = settings.CATEGORY[key].get('excludes') excludes = settings.CATEGORY[key].get('excludes')
tz = settings.CATEGORY[key].get('tz') tz = settings.CATEGORY[key].get('tz')
feed += [(x, key) for x in sites.feed(excludes)[:count]] feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
for key, sites in sitemaps.items(): for key, sites in sitemaps.items():
count = settings.SITEMAP[key].get('count') or 0 count = settings.SITEMAP[key].get('count') or 0
excludes = settings.SITEMAP[key].get('excludes') excludes = settings.SITEMAP[key].get('excludes')
feed += [(x, key) for x in sites.feed(excludes)[:count]] feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
values = feeds.values()
feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
feed = list(filter(None, feed))
return feed return feed
def get_article(url): def get_article(url):
@ -124,7 +124,7 @@ def update_story(story, is_manual=False):
logging.info('Story not ready yet') logging.info('Story not ready yet')
return False return False
if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time(): if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
logging.info('Story too old, removing') logging.info('Story too old, removing')
return False return False

@ -163,6 +163,8 @@ def get_sitemap_date(a):
return a.find('lastmod').text return a.find('lastmod').text
if a.find('news:publication_date'): if a.find('news:publication_date'):
return a.find('news:publication_date').text return a.find('news:publication_date').text
if a.find('ns2:publication_date'):
return a.find('ns2:publication_date').text
return '' return ''
class Sitemap(_Base): class Sitemap(_Base):

@ -15,6 +15,7 @@ import traceback
import time import time
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
import settings
import database import database
import search import search
import feed import feed
@ -27,9 +28,6 @@ from flask_cors import CORS
database.init() database.init()
search.init() search.init()
FEED_LENGTH = 75
news_index = 0
def new_id(): def new_id():
nid = gen_rand_id() nid = gen_rand_id()
while database.get_story(nid): while database.get_story(nid):
@ -42,7 +40,7 @@ cors = CORS(flask_app)
@flask_app.route('/api') @flask_app.route('/api')
def api(): def api():
stories = database.get_stories(FEED_LENGTH) stories = database.get_stories(settings.MAX_STORY_AGE)
res = Response(json.dumps({"stories": stories})) res = Response(json.dumps({"stories": stories}))
res.headers['content-type'] = 'application/json' res.headers['content-type'] = 'application/json'
return res return res
@ -101,7 +99,9 @@ def submit():
def story(sid): def story(sid):
story = database.get_story(sid) story = database.get_story(sid)
if story: if story:
res = Response(json.dumps({"story": story.data})) related = database.get_stories_by_url(story.meta['url'])
related = [r.meta for r in related]
res = Response(json.dumps({"story": story.data, "related": related}))
res.headers['content-type'] = 'application/json' res.headers['content-type'] = 'application/json'
return res return res
else: else:
@ -144,51 +144,49 @@ def static_story(sid):
http_server = WSGIServer(('', 33842), flask_app) http_server = WSGIServer(('', 33842), flask_app)
def feed_thread(): def _add_new_refs():
global news_index for ref, source in feed.get_list():
if database.get_story_by_ref(ref):
continue
try:
nid = new_id()
database.put_ref(ref, nid, source)
logging.info('Added ref ' + ref)
except database.IntegrityError:
continue
def _update_current_story(item):
    """Refresh a single ref-list entry.

    ``item`` is a mapping that provides the 'sid', 'ref' and 'source' keys.
    The story is handed to ``feed.update_story``; when that reports the
    story as valid it is written to the database and the search index,
    otherwise the ref is deleted.
    """
    try:
        story = database.get_story(item['sid']).data
    except AttributeError:
        # No usable stored story for this sid (presumably get_story returned
        # None -- confirm against database.get_story); start from a stub.
        story = {'id': item['sid'], 'ref': item['ref'], 'source': item['source']}

    logging.info('Updating story: {}'.format(str(story['ref'])))

    if not feed.update_story(story):
        # update_story rejected the story, so stop tracking its ref.
        database.del_ref(item['ref'])
        logging.info('Removed ref {}'.format(item['ref']))
        return

    database.put_story(story)
    search.put_story(story)
def feed_thread():
ref_list = []
try: try:
while True: while True:
# onboard new stories # onboard new stories
if news_index == 0: if not len(ref_list):
for ref, source in feed.list(): _add_new_refs()
if database.get_story_by_ref(ref): ref_list = database.get_reflist()
continue
try:
nid = new_id()
database.put_ref(ref, nid, source)
logging.info('Added ref ' + ref)
except database.IntegrityError:
continue
ref_list = database.get_reflist(FEED_LENGTH)
# update current stories # update current stories
if news_index < len(ref_list): if len(ref_list):
item = ref_list[news_index] item = ref_list.pop(0)
_update_current_story(item)
try:
story = database.get_story(item['sid']).data
except AttributeError:
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
valid = feed.update_story(story)
if valid:
database.put_story(story)
search.put_story(story)
else:
database.del_ref(item['ref'])
logging.info('Removed ref {}'.format(item['ref']))
else:
logging.info('Skipping index: ' + str(news_index))
gevent.sleep(6) gevent.sleep(6)
news_index += 1
if news_index == FEED_LENGTH: news_index = 0
except KeyboardInterrupt: except KeyboardInterrupt:
logging.info('Ending feed thread...') logging.info('Ending feed thread...')
except ValueError as e: except ValueError as e:

@ -1,6 +1,8 @@
# QotNews settings # QotNews settings
# edit this file and save it as settings.py # edit this file and save it as settings.py
MAX_STORY_AGE = 3*24*60*60
# Feed Lengths # Feed Lengths
# Number of top items from each site to pull # Number of top items from each site to pull
# set to 0 to disable that site # set to 0 to disable that site

@ -72,5 +72,6 @@ export const logos = {
stuff: "", stuff: "",
substack: "", substack: "",
"the bulletin": "", "the bulletin": "",
tvnz: "",
webworm: "", webworm: "",
}; };

Loading…
Cancel
Save