Compare commits

...

5 Commits

Author SHA1 Message Date
Jason Schwarzenberger
9cee370a25 tvnz icon 2020-11-10 14:10:02 +13:00
Jason Schwarzenberger
5efc6ef2d3 add related stories (in api only) 2020-11-10 14:09:56 +13:00
Jason Schwarzenberger
4ec50e20cb feed thread loop. 2020-11-10 10:10:38 +13:00
Jason Schwarzenberger
c1b7877f4b remove limit. 2020-11-09 17:54:50 +13:00
Jason Schwarzenberger
7b8cbfc9b9 try to make feed only determined by the max age. 2020-11-09 17:50:58 +13:00
6 changed files with 73 additions and 63 deletions

View File

@@ -1,5 +1,4 @@
import json from datetime import datetime, timedelta
from sqlalchemy import create_engine, Column, String, ForeignKey, Integer from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
@@ -66,18 +65,26 @@ def get_story_by_ref(ref):
session = Session() session = Session()
return session.query(Story).filter(Story.ref==ref).first() return session.query(Story).filter(Story.ref==ref).first()
def get_reflist(amount): def get_stories_by_url(url):
session = Session() session = Session()
q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount) return session.query(Story).\
filter(Story.title != None).\
filter(Story.meta['url'].as_string() == url).\
order_by(Story.meta['date'].desc())
def get_reflist():
session = Session()
q = session.query(Reflist).order_by(Reflist.rid.desc())
return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()] return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
def get_stories(amount): def get_stories(maxage=60*60*24*2):
time = datetime.now().timestamp() - maxage
session = Session() session = Session()
q = session.query(Reflist, Story.meta).\ q = session.query(Reflist, Story.meta).\
join(Story).\ join(Story).\
filter(Story.title != None).\ filter(Story.title != None).\
order_by(Story.meta['date'].desc()).\ filter(Story.meta['date'] > time).\
limit(amount) order_by(Story.meta['date'].desc())
return [x[1] for x in q] return [x[1] for x in q]
def put_ref(ref, sid, source): def put_ref(ref, sid, source):

View File

@@ -6,16 +6,13 @@ logging.basicConfig(
import requests import requests
import time import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import itertools
import settings import settings
from feeds import hackernews, reddit, tildes, substack, manual, news from feeds import hackernews, reddit, tildes, substack, manual, news
from scrapers import outline, declutter, local from scrapers import outline, declutter, local
ONE_HOUR = 60*60
ONE_DAY = 24*ONE_HOUR
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
MAX_AGE_IN_DAYS = 3*ONE_DAY
substacks = {} substacks = {}
for key, value in settings.SUBSTACK.items(): for key, value in settings.SUBSTACK.items():
@@ -27,36 +24,39 @@ sitemaps = {}
for key, value in settings.SITEMAP.items(): for key, value in settings.SITEMAP.items():
sitemaps[key] = news.Sitemap(value['url'], value.get('tz')) sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
def list(): def get_list():
feed = [] feeds = {}
if settings.NUM_HACKERNEWS: if settings.NUM_HACKERNEWS:
feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]] feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_REDDIT: if settings.NUM_REDDIT:
feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]] feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
if settings.NUM_TILDES: if settings.NUM_TILDES:
feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]] feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK: if settings.NUM_SUBSTACK:
feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]] feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items(): for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count'] count = settings.SUBSTACK[key]['count']
feed += [(x, key) for x in publication.feed()[:count]] feeds[key] = [(x, key) for x in publication.feed()[:count]]
for key, sites in categories.items(): for key, sites in categories.items():
count = settings.CATEGORY[key].get('count') or 0 count = settings.CATEGORY[key].get('count') or 0
excludes = settings.CATEGORY[key].get('excludes') excludes = settings.CATEGORY[key].get('excludes')
tz = settings.CATEGORY[key].get('tz') tz = settings.CATEGORY[key].get('tz')
feed += [(x, key) for x in sites.feed(excludes)[:count]] feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
for key, sites in sitemaps.items(): for key, sites in sitemaps.items():
count = settings.SITEMAP[key].get('count') or 0 count = settings.SITEMAP[key].get('count') or 0
excludes = settings.SITEMAP[key].get('excludes') excludes = settings.SITEMAP[key].get('excludes')
feed += [(x, key) for x in sites.feed(excludes)[:count]] feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
values = feeds.values()
feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
feed = list(filter(None, feed))
return feed return feed
def get_article(url): def get_article(url):
@@ -124,7 +124,7 @@ def update_story(story, is_manual=False):
logging.info('Story not ready yet') logging.info('Story not ready yet')
return False return False
if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time(): if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
logging.info('Story too old, removing') logging.info('Story too old, removing')
return False return False

View File

@@ -163,6 +163,8 @@ def get_sitemap_date(a):
return a.find('lastmod').text return a.find('lastmod').text
if a.find('news:publication_date'): if a.find('news:publication_date'):
return a.find('news:publication_date').text return a.find('news:publication_date').text
if a.find('ns2:publication_date'):
return a.find('ns2:publication_date').text
return '' return ''
class Sitemap(_Base): class Sitemap(_Base):

View File

@@ -15,6 +15,7 @@ import traceback
import time import time
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
import settings
import database import database
import search import search
import feed import feed
@@ -27,9 +28,6 @@ from flask_cors import CORS
database.init() database.init()
search.init() search.init()
FEED_LENGTH = 75
news_index = 0
def new_id(): def new_id():
nid = gen_rand_id() nid = gen_rand_id()
while database.get_story(nid): while database.get_story(nid):
@@ -42,7 +40,7 @@ cors = CORS(flask_app)
@flask_app.route('/api') @flask_app.route('/api')
def api(): def api():
stories = database.get_stories(FEED_LENGTH) stories = database.get_stories(settings.MAX_STORY_AGE)
res = Response(json.dumps({"stories": stories})) res = Response(json.dumps({"stories": stories}))
res.headers['content-type'] = 'application/json' res.headers['content-type'] = 'application/json'
return res return res
@@ -101,7 +99,9 @@ def submit():
def story(sid): def story(sid):
story = database.get_story(sid) story = database.get_story(sid)
if story: if story:
res = Response(json.dumps({"story": story.data})) related = database.get_stories_by_url(story.meta['url'])
related = [r.meta for r in related]
res = Response(json.dumps({"story": story.data, "related": related}))
res.headers['content-type'] = 'application/json' res.headers['content-type'] = 'application/json'
return res return res
else: else:
@@ -144,14 +144,8 @@ def static_story(sid):
http_server = WSGIServer(('', 33842), flask_app) http_server = WSGIServer(('', 33842), flask_app)
def feed_thread(): def _add_new_refs():
global news_index for ref, source in feed.get_list():
try:
while True:
# onboard new stories
if news_index == 0:
for ref, source in feed.list():
if database.get_story_by_ref(ref): if database.get_story_by_ref(ref):
continue continue
try: try:
@@ -161,18 +155,13 @@ def feed_thread():
except database.IntegrityError: except database.IntegrityError:
continue continue
ref_list = database.get_reflist(FEED_LENGTH) def _update_current_story(item):
# update current stories
if news_index < len(ref_list):
item = ref_list[news_index]
try: try:
story = database.get_story(item['sid']).data story = database.get_story(item['sid']).data
except AttributeError: except AttributeError:
story = dict(id=item['sid'], ref=item['ref'], source=item['source']) story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index)) logging.info('Updating story: {}'.format(str(story['ref'])))
valid = feed.update_story(story) valid = feed.update_story(story)
if valid: if valid:
@@ -181,14 +170,23 @@ def feed_thread():
else: else:
database.del_ref(item['ref']) database.del_ref(item['ref'])
logging.info('Removed ref {}'.format(item['ref'])) logging.info('Removed ref {}'.format(item['ref']))
else:
logging.info('Skipping index: ' + str(news_index)) def feed_thread():
ref_list = []
try:
while True:
# onboard new stories
if not len(ref_list):
_add_new_refs()
ref_list = database.get_reflist()
# update current stories
if len(ref_list):
item = ref_list.pop(0)
_update_current_story(item)
gevent.sleep(6) gevent.sleep(6)
news_index += 1
if news_index == FEED_LENGTH: news_index = 0
except KeyboardInterrupt: except KeyboardInterrupt:
logging.info('Ending feed thread...') logging.info('Ending feed thread...')
except ValueError as e: except ValueError as e:

View File

@@ -1,6 +1,8 @@
# QotNews settings # QotNews settings
# edit this file and save it as settings.py # edit this file and save it as settings.py
MAX_STORY_AGE = 3*24*60*60
# Feed Lengths # Feed Lengths
# Number of top items from each site to pull # Number of top items from each site to pull
# set to 0 to disable that site # set to 0 to disable that site

View File

@@ -72,5 +72,6 @@ export const logos = {
stuff: "", stuff: "",
substack: "", substack: "",
"the bulletin": "", "the bulletin": "",
tvnz: "",
webworm: "", webworm: "",
}; };