From 7b8cbfc9b958314d5a25b075dea032984b7fdd29 Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger
Date: Mon, 9 Nov 2020 17:50:58 +1300
Subject: [PATCH] try to make feed only determined by the max age.

---
Review notes (commentary placed after the "---" marker is not part of the
commit message):

 * database.get_stories() no longer ends in .limit(amount). This patch
   replaces the `amount` parameter with `maxage`, so the old tail referenced
   an undefined name and would raise NameError whenever the function ran
   (server.api() calls it). The result set is now bounded only by the
   maxage filter.
 * Hunk line counts and the diffstat below were recomputed for that change.
   The blob ids on the "index" lines still describe the pre-fix blobs.
 * The line structure and indentation of this patch were re-created by hand
   (4-space Python indentation assumed). If context does not match the
   target tree, try `git apply --ignore-whitespace`.

 apiserver/database.py         | 12 +++++++-----
 apiserver/feed.py             | 24 ++++++++++++++----------
 apiserver/server.py           | 14 ++++++++------
 apiserver/settings.py.example |  2 ++
 4 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/apiserver/database.py b/apiserver/database.py
index cada255..bdbd81d 100644
--- a/apiserver/database.py
+++ b/apiserver/database.py
@@ -1,4 +1,5 @@
 import json
+from datetime import datetime, timedelta
 
 from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
 from sqlalchemy.ext.declarative import declarative_base
@@ -66,16 +67,17 @@ def get_story_by_ref(ref):
     session = Session()
     return session.query(Story).filter(Story.ref==ref).first()
 
-def get_reflist(amount):
+def get_reflist():
     session = Session()
-    q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount)
+    q = session.query(Reflist).order_by(Reflist.rid.desc())
     return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
 
-def get_stories(amount):
+def get_stories(maxage=60*60*24*2):
+    time = datetime.now().timestamp() - maxage
     session = Session()
     q = session.query(Reflist, Story.meta).\
             join(Story).\
             filter(Story.title != None).\
-            order_by(Story.meta['date'].desc()).\
-            limit(amount)
+            filter(Story.meta['date'] > time).\
+            order_by(Story.meta['date'].desc())
     return [x[1] for x in q]
diff --git a/apiserver/feed.py b/apiserver/feed.py
index 09a29d6..dd4bc2c 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -6,6 +6,7 @@ logging.basicConfig(
 import requests
 import time
 from bs4 import BeautifulSoup
+import itertools
 
 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
@@ -27,36 +28,39 @@ sitemaps = {}
 for key, value in settings.SITEMAP.items():
     sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
 
-def list():
-    feed = []
+def get_list():
+    feeds = {}
+
     if settings.NUM_HACKERNEWS:
-        feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
+        feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
 
     if settings.NUM_REDDIT:
-        feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
+        feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
 
     if settings.NUM_TILDES:
-        feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
+        feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
 
     if settings.NUM_SUBSTACK:
-        feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
+        feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
 
     for key, publication in substacks.items():
         count = settings.SUBSTACK[key]['count']
-        feed += [(x, key) for x in publication.feed()[:count]]
+        feeds[key] = [(x, key) for x in publication.feed()[:count]]
 
     for key, sites in categories.items():
         count = settings.CATEGORY[key].get('count') or 0
         excludes = settings.CATEGORY[key].get('excludes')
         tz = settings.CATEGORY[key].get('tz')
-        feed += [(x, key) for x in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
 
     for key, sites in sitemaps.items():
         count = settings.SITEMAP[key].get('count') or 0
         excludes = settings.SITEMAP[key].get('excludes')
-        feed += [(x, key) for x in sites.feed(excludes)[:count]]
-
+        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
 
+    values = feeds.values()
+    feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
+    feed = list(filter(None, feed))
     return feed
 
 def get_article(url):
diff --git a/apiserver/server.py b/apiserver/server.py
index 9a7a8a5..dfb4b26 100644
--- a/apiserver/server.py
+++ b/apiserver/server.py
@@ -15,6 +15,7 @@ import traceback
 import time
 from urllib.parse import urlparse, parse_qs
 
+import settings
 import database
 import search
 import feed
@@ -27,7 +28,7 @@ from flask_cors import CORS
 database.init()
 search.init()
 
-FEED_LENGTH = 75
+news_length = 0
 news_index = 0
 
 def new_id():
@@ -42,7 +43,7 @@ cors = CORS(flask_app)
 
 @flask_app.route('/api')
 def api():
-    stories = database.get_stories(FEED_LENGTH)
+    stories = database.get_stories(settings.MAX_STORY_AGE)
     res = Response(json.dumps({"stories": stories}))
     res.headers['content-type'] = 'application/json'
     return res
@@ -145,13 +146,13 @@ def static_story(sid):
 http_server = WSGIServer(('', 33842), flask_app)
 
 def feed_thread():
-    global news_index
+    global news_index, news_length
 
     try:
         while True:
             # onboard new stories
             if news_index == 0:
-                for ref, source in feed.list():
+                for ref, source in feed.get_list():
                     if database.get_story_by_ref(ref):
                         continue
                     try:
@@ -161,7 +162,8 @@ def feed_thread():
                     except database.IntegrityError:
                         continue
 
-            ref_list = database.get_reflist(FEED_LENGTH)
+            ref_list = database.get_reflist()
+            news_length = len(ref_list)
 
             # update current stories
             if news_index < len(ref_list):
@@ -187,7 +189,7 @@ def feed_thread():
             gevent.sleep(6)
 
             news_index += 1
-            if news_index == FEED_LENGTH: news_index = 0
+            if news_index >= news_length: news_index = 0
 
     except KeyboardInterrupt:
         logging.info('Ending feed thread...')
diff --git a/apiserver/settings.py.example b/apiserver/settings.py.example
index abb0b19..e571688 100644
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@@ -1,6 +1,8 @@
 # QotNews settings
 # edit this file and save it as settings.py
 
+MAX_STORY_AGE = 3*24*60*60
+
 # Feed Lengths
 # Number of top items from each site to pull
 # set to 0 to disable that site