Compare commits

..

No commits in common. "9cee370a25f9b906134c5ca547912983304d3d42" and "bfa4108a8e532e704c445e2684ae7cbc3003c362" have entirely different histories.

6 changed files with 63 additions and 73 deletions

View File

@ -1,4 +1,5 @@
from datetime import datetime, timedelta import json
from sqlalchemy import create_engine, Column, String, ForeignKey, Integer from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
@ -65,26 +66,18 @@ def get_story_by_ref(ref):
session = Session() session = Session()
return session.query(Story).filter(Story.ref==ref).first() return session.query(Story).filter(Story.ref==ref).first()
def get_stories_by_url(url): def get_reflist(amount):
session = Session() session = Session()
return session.query(Story).\ q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount)
filter(Story.title != None).\
filter(Story.meta['url'].as_string() == url).\
order_by(Story.meta['date'].desc())
def get_reflist():
session = Session()
q = session.query(Reflist).order_by(Reflist.rid.desc())
return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()] return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
def get_stories(maxage=60*60*24*2): def get_stories(amount):
time = datetime.now().timestamp() - maxage
session = Session() session = Session()
q = session.query(Reflist, Story.meta).\ q = session.query(Reflist, Story.meta).\
join(Story).\ join(Story).\
filter(Story.title != None).\ filter(Story.title != None).\
filter(Story.meta['date'] > time).\ order_by(Story.meta['date'].desc()).\
order_by(Story.meta['date'].desc()) limit(amount)
return [x[1] for x in q] return [x[1] for x in q]
def put_ref(ref, sid, source): def put_ref(ref, sid, source):

View File

@ -6,13 +6,16 @@ logging.basicConfig(
import requests import requests
import time import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import itertools
import settings import settings
from feeds import hackernews, reddit, tildes, substack, manual, news from feeds import hackernews, reddit, tildes, substack, manual, news
from scrapers import outline, declutter, local from scrapers import outline, declutter, local
ONE_HOUR = 60*60
ONE_DAY = 24*ONE_HOUR
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
MAX_AGE_IN_DAYS = 3*ONE_DAY
substacks = {} substacks = {}
for key, value in settings.SUBSTACK.items(): for key, value in settings.SUBSTACK.items():
@ -24,39 +27,36 @@ sitemaps = {}
for key, value in settings.SITEMAP.items(): for key, value in settings.SITEMAP.items():
sitemaps[key] = news.Sitemap(value['url'], value.get('tz')) sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
def get_list(): def list():
feeds = {} feed = []
if settings.NUM_HACKERNEWS: if settings.NUM_HACKERNEWS:
feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]] feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_REDDIT: if settings.NUM_REDDIT:
feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]] feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
if settings.NUM_TILDES: if settings.NUM_TILDES:
feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]] feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK: if settings.NUM_SUBSTACK:
feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]] feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items(): for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count'] count = settings.SUBSTACK[key]['count']
feeds[key] = [(x, key) for x in publication.feed()[:count]] feed += [(x, key) for x in publication.feed()[:count]]
for key, sites in categories.items(): for key, sites in categories.items():
count = settings.CATEGORY[key].get('count') or 0 count = settings.CATEGORY[key].get('count') or 0
excludes = settings.CATEGORY[key].get('excludes') excludes = settings.CATEGORY[key].get('excludes')
tz = settings.CATEGORY[key].get('tz') tz = settings.CATEGORY[key].get('tz')
feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]] feed += [(x, key) for x in sites.feed(excludes)[:count]]
for key, sites in sitemaps.items(): for key, sites in sitemaps.items():
count = settings.SITEMAP[key].get('count') or 0 count = settings.SITEMAP[key].get('count') or 0
excludes = settings.SITEMAP[key].get('excludes') excludes = settings.SITEMAP[key].get('excludes')
feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]] feed += [(x, key) for x in sites.feed(excludes)[:count]]
values = feeds.values()
feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
feed = list(filter(None, feed))
return feed return feed
def get_article(url): def get_article(url):
@ -124,7 +124,7 @@ def update_story(story, is_manual=False):
logging.info('Story not ready yet') logging.info('Story not ready yet')
return False return False
if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time(): if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time():
logging.info('Story too old, removing') logging.info('Story too old, removing')
return False return False

View File

@ -163,8 +163,6 @@ def get_sitemap_date(a):
return a.find('lastmod').text return a.find('lastmod').text
if a.find('news:publication_date'): if a.find('news:publication_date'):
return a.find('news:publication_date').text return a.find('news:publication_date').text
if a.find('ns2:publication_date'):
return a.find('ns2:publication_date').text
return '' return ''
class Sitemap(_Base): class Sitemap(_Base):

View File

@ -15,7 +15,6 @@ import traceback
import time import time
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
import settings
import database import database
import search import search
import feed import feed
@ -28,6 +27,9 @@ from flask_cors import CORS
database.init() database.init()
search.init() search.init()
FEED_LENGTH = 75
news_index = 0
def new_id(): def new_id():
nid = gen_rand_id() nid = gen_rand_id()
while database.get_story(nid): while database.get_story(nid):
@ -40,7 +42,7 @@ cors = CORS(flask_app)
@flask_app.route('/api') @flask_app.route('/api')
def api(): def api():
stories = database.get_stories(settings.MAX_STORY_AGE) stories = database.get_stories(FEED_LENGTH)
res = Response(json.dumps({"stories": stories})) res = Response(json.dumps({"stories": stories}))
res.headers['content-type'] = 'application/json' res.headers['content-type'] = 'application/json'
return res return res
@ -99,9 +101,7 @@ def submit():
def story(sid): def story(sid):
story = database.get_story(sid) story = database.get_story(sid)
if story: if story:
related = database.get_stories_by_url(story.meta['url']) res = Response(json.dumps({"story": story.data}))
related = [r.meta for r in related]
res = Response(json.dumps({"story": story.data, "related": related}))
res.headers['content-type'] = 'application/json' res.headers['content-type'] = 'application/json'
return res return res
else: else:
@ -144,8 +144,14 @@ def static_story(sid):
http_server = WSGIServer(('', 33842), flask_app) http_server = WSGIServer(('', 33842), flask_app)
def _add_new_refs(): def feed_thread():
for ref, source in feed.get_list(): global news_index
try:
while True:
# onboard new stories
if news_index == 0:
for ref, source in feed.list():
if database.get_story_by_ref(ref): if database.get_story_by_ref(ref):
continue continue
try: try:
@ -155,13 +161,18 @@ def _add_new_refs():
except database.IntegrityError: except database.IntegrityError:
continue continue
def _update_current_story(item): ref_list = database.get_reflist(FEED_LENGTH)
# update current stories
if news_index < len(ref_list):
item = ref_list[news_index]
try: try:
story = database.get_story(item['sid']).data story = database.get_story(item['sid']).data
except AttributeError: except AttributeError:
story = dict(id=item['sid'], ref=item['ref'], source=item['source']) story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
logging.info('Updating story: {}'.format(str(story['ref']))) logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
valid = feed.update_story(story) valid = feed.update_story(story)
if valid: if valid:
@ -170,23 +181,14 @@ def _update_current_story(item):
else: else:
database.del_ref(item['ref']) database.del_ref(item['ref'])
logging.info('Removed ref {}'.format(item['ref'])) logging.info('Removed ref {}'.format(item['ref']))
else:
def feed_thread(): logging.info('Skipping index: ' + str(news_index))
ref_list = []
try:
while True:
# onboard new stories
if not len(ref_list):
_add_new_refs()
ref_list = database.get_reflist()
# update current stories
if len(ref_list):
item = ref_list.pop(0)
_update_current_story(item)
gevent.sleep(6) gevent.sleep(6)
news_index += 1
if news_index == FEED_LENGTH: news_index = 0
except KeyboardInterrupt: except KeyboardInterrupt:
logging.info('Ending feed thread...') logging.info('Ending feed thread...')
except ValueError as e: except ValueError as e:

View File

@ -1,8 +1,6 @@
# QotNews settings # QotNews settings
# edit this file and save it as settings.py # edit this file and save it as settings.py
MAX_STORY_AGE = 3*24*60*60
# Feed Lengths # Feed Lengths
# Number of top items from each site to pull # Number of top items from each site to pull
# set to 0 to disable that site # set to 0 to disable that site

View File

@ -72,6 +72,5 @@ export const logos = {
stuff: "", stuff: "",
substack: "", substack: "",
"the bulletin": "", "the bulletin": "",
tvnz: "",
webworm: "", webworm: "",
}; };