Compare commits

...

5 Commits

Author SHA1 Message Date
Jason Schwarzenberger
9cee370a25 tvnz icon 2020-11-10 14:10:02 +13:00
Jason Schwarzenberger
5efc6ef2d3 add related stories (in api only) 2020-11-10 14:09:56 +13:00
Jason Schwarzenberger
4ec50e20cb feed thread loop. 2020-11-10 10:10:38 +13:00
Jason Schwarzenberger
c1b7877f4b remove limit. 2020-11-09 17:54:50 +13:00
Jason Schwarzenberger
7b8cbfc9b9 try to make feed only determined by the max age. 2020-11-09 17:50:58 +13:00
6 changed files with 73 additions and 63 deletions

View File

@ -1,5 +1,4 @@
import json
from datetime import datetime, timedelta
from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
@ -66,18 +65,26 @@ def get_story_by_ref(ref):
session = Session()
return session.query(Story).filter(Story.ref==ref).first()
def get_reflist(amount):
def get_stories_by_url(url):
session = Session()
q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount)
return session.query(Story).\
filter(Story.title != None).\
filter(Story.meta['url'].as_string() == url).\
order_by(Story.meta['date'].desc())
def get_reflist():
    """Return every Reflist row as a list of plain dicts.

    Rows are ordered by descending ``rid``. Each entry holds the ``ref``,
    ``sid`` and ``source`` values copied out of the ORM objects, so the
    result does not depend on the session once this function returns.
    """
    session = Session()
    try:
        q = session.query(Reflist).order_by(Reflist.rid.desc())
        return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
    finally:
        # The rows are already copied into dicts, so the session can be
        # closed here instead of being left open until garbage collection.
        session.close()
def get_stories(amount):
def get_stories(maxage=60*60*24*2):
time = datetime.now().timestamp() - maxage
session = Session()
q = session.query(Reflist, Story.meta).\
join(Story).\
filter(Story.title != None).\
order_by(Story.meta['date'].desc()).\
limit(amount)
filter(Story.meta['date'] > time).\
order_by(Story.meta['date'].desc())
return [x[1] for x in q]
def put_ref(ref, sid, source):

View File

@ -6,16 +6,13 @@ logging.basicConfig(
import requests
import time
from bs4 import BeautifulSoup
import itertools
import settings
from feeds import hackernews, reddit, tildes, substack, manual, news
from scrapers import outline, declutter, local
ONE_HOUR = 60*60
ONE_DAY = 24*ONE_HOUR
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
MAX_AGE_IN_DAYS = 3*ONE_DAY
substacks = {}
for key, value in settings.SUBSTACK.items():
@ -27,36 +24,39 @@ sitemaps = {}
for key, value in settings.SITEMAP.items():
sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
def list():
feed = []
def get_list():
feeds = {}
if settings.NUM_HACKERNEWS:
feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_REDDIT:
feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
if settings.NUM_TILDES:
feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK:
feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count']
feed += [(x, key) for x in publication.feed()[:count]]
feeds[key] = [(x, key) for x in publication.feed()[:count]]
for key, sites in categories.items():
count = settings.CATEGORY[key].get('count') or 0
excludes = settings.CATEGORY[key].get('excludes')
tz = settings.CATEGORY[key].get('tz')
feed += [(x, key) for x in sites.feed(excludes)[:count]]
feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
for key, sites in sitemaps.items():
count = settings.SITEMAP[key].get('count') or 0
excludes = settings.SITEMAP[key].get('excludes')
feed += [(x, key) for x in sites.feed(excludes)[:count]]
feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
values = feeds.values()
feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
feed = list(filter(None, feed))
return feed
def get_article(url):
@ -124,7 +124,7 @@ def update_story(story, is_manual=False):
logging.info('Story not ready yet')
return False
if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time():
if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
logging.info('Story too old, removing')
return False

View File

@ -163,6 +163,8 @@ def get_sitemap_date(a):
return a.find('lastmod').text
if a.find('news:publication_date'):
return a.find('news:publication_date').text
if a.find('ns2:publication_date'):
return a.find('ns2:publication_date').text
return ''
class Sitemap(_Base):

View File

@ -15,6 +15,7 @@ import traceback
import time
from urllib.parse import urlparse, parse_qs
import settings
import database
import search
import feed
@ -27,9 +28,6 @@ from flask_cors import CORS
database.init()
search.init()
FEED_LENGTH = 75
news_index = 0
def new_id():
nid = gen_rand_id()
while database.get_story(nid):
@ -42,7 +40,7 @@ cors = CORS(flask_app)
@flask_app.route('/api')
def api():
stories = database.get_stories(FEED_LENGTH)
stories = database.get_stories(settings.MAX_STORY_AGE)
res = Response(json.dumps({"stories": stories}))
res.headers['content-type'] = 'application/json'
return res
@ -101,7 +99,9 @@ def submit():
def story(sid):
story = database.get_story(sid)
if story:
res = Response(json.dumps({"story": story.data}))
related = database.get_stories_by_url(story.meta['url'])
related = [r.meta for r in related]
res = Response(json.dumps({"story": story.data, "related": related}))
res.headers['content-type'] = 'application/json'
return res
else:
@ -144,51 +144,49 @@ def static_story(sid):
http_server = WSGIServer(('', 33842), flask_app)
def feed_thread():
global news_index
def _add_new_refs():
    """Store a new ref for each feed entry that has no story yet.

    Iterates the ``(ref, source)`` pairs from ``feed.get_list()``. Entries
    for which ``database.get_story_by_ref`` already finds a story are
    skipped; the rest get a fresh id from ``new_id()`` and are saved with
    ``database.put_ref``.
    """
    for ref, source in feed.get_list():
        existing = database.get_story_by_ref(ref)
        if not existing:
            try:
                story_id = new_id()
                database.put_ref(ref, story_id, source)
            except database.IntegrityError:
                # The database rejected the insert; move on to the next ref.
                continue
            else:
                logging.info('Added ref ' + ref)
def _update_current_story(item):
    """Refresh the story behind one reflist entry, or drop the entry.

    ``item`` is a mapping with ``sid``, ``ref`` and ``source`` keys. The
    stored story data is loaded when available; otherwise a minimal story
    dict is built from ``item``. The story is then passed to
    ``feed.update_story``: on success it is written to the database and the
    search index, otherwise the ref is deleted.
    """
    try:
        record = database.get_story(item['sid'])
        story = record.data
    except AttributeError:
        # No usable stored story (e.g. the lookup returned nothing with a
        # ``data`` attribute): start from the bare reflist fields.
        story = {'id': item['sid'], 'ref': item['ref'], 'source': item['source']}

    logging.info('Updating story: {}'.format(str(story['ref'])))

    if not feed.update_story(story):
        # The update reported the story as invalid, so forget its ref.
        database.del_ref(item['ref'])
        logging.info('Removed ref {}'.format(item['ref']))
        return

    database.put_story(story)
    search.put_story(story)
def feed_thread():
ref_list = []
try:
while True:
# onboard new stories
if news_index == 0:
for ref, source in feed.list():
if database.get_story_by_ref(ref):
continue
try:
nid = new_id()
database.put_ref(ref, nid, source)
logging.info('Added ref ' + ref)
except database.IntegrityError:
continue
ref_list = database.get_reflist(FEED_LENGTH)
if not len(ref_list):
_add_new_refs()
ref_list = database.get_reflist()
# update current stories
if news_index < len(ref_list):
item = ref_list[news_index]
try:
story = database.get_story(item['sid']).data
except AttributeError:
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
valid = feed.update_story(story)
if valid:
database.put_story(story)
search.put_story(story)
else:
database.del_ref(item['ref'])
logging.info('Removed ref {}'.format(item['ref']))
else:
logging.info('Skipping index: ' + str(news_index))
if len(ref_list):
item = ref_list.pop(0)
_update_current_story(item)
gevent.sleep(6)
news_index += 1
if news_index == FEED_LENGTH: news_index = 0
except KeyboardInterrupt:
logging.info('Ending feed thread...')
except ValueError as e:

View File

@ -1,6 +1,8 @@
# QotNews settings
# edit this file and save it as settings.py
# Maximum story age in seconds (3 days). It is compared against Unix
# timestamps to decide whether a story is too old to keep or list.
MAX_STORY_AGE = 3*24*60*60
# Feed Lengths
# Number of top items from each site to pull
# set to 0 to disable that site

View File

@ -72,5 +72,6 @@ export const logos = {
stuff: "",
substack: "",
"the bulletin": "",
tvnz: "",
webworm: "",
};