Compare commits

..

No commits in common. "9cee370a25f9b906134c5ca547912983304d3d42" and "bfa4108a8e532e704c445e2684ae7cbc3003c362" have entirely different histories.

6 changed files with 63 additions and 73 deletions

View File

@ -1,4 +1,5 @@
from datetime import datetime, timedelta import json
from sqlalchemy import create_engine, Column, String, ForeignKey, Integer from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
@ -65,26 +66,18 @@ def get_story_by_ref(ref):
session = Session() session = Session()
return session.query(Story).filter(Story.ref==ref).first() return session.query(Story).filter(Story.ref==ref).first()
def get_stories_by_url(url): def get_reflist(amount):
session = Session() session = Session()
return session.query(Story).\ q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount)
filter(Story.title != None).\
filter(Story.meta['url'].as_string() == url).\
order_by(Story.meta['date'].desc())
def get_reflist():
session = Session()
q = session.query(Reflist).order_by(Reflist.rid.desc())
return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()] return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
def get_stories(maxage=60*60*24*2): def get_stories(amount):
time = datetime.now().timestamp() - maxage
session = Session() session = Session()
q = session.query(Reflist, Story.meta).\ q = session.query(Reflist, Story.meta).\
join(Story).\ join(Story).\
filter(Story.title != None).\ filter(Story.title != None).\
filter(Story.meta['date'] > time).\ order_by(Story.meta['date'].desc()).\
order_by(Story.meta['date'].desc()) limit(amount)
return [x[1] for x in q] return [x[1] for x in q]
def put_ref(ref, sid, source): def put_ref(ref, sid, source):

View File

@ -6,13 +6,16 @@ logging.basicConfig(
import requests import requests
import time import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import itertools
import settings import settings
from feeds import hackernews, reddit, tildes, substack, manual, news from feeds import hackernews, reddit, tildes, substack, manual, news
from scrapers import outline, declutter, local from scrapers import outline, declutter, local
ONE_HOUR = 60*60
ONE_DAY = 24*ONE_HOUR
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
MAX_AGE_IN_DAYS = 3*ONE_DAY
substacks = {} substacks = {}
for key, value in settings.SUBSTACK.items(): for key, value in settings.SUBSTACK.items():
@ -24,39 +27,36 @@ sitemaps = {}
for key, value in settings.SITEMAP.items(): for key, value in settings.SITEMAP.items():
sitemaps[key] = news.Sitemap(value['url'], value.get('tz')) sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
def get_list(): def list():
feeds = {} feed = []
if settings.NUM_HACKERNEWS: if settings.NUM_HACKERNEWS:
feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]] feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_REDDIT: if settings.NUM_REDDIT:
feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]] feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
if settings.NUM_TILDES: if settings.NUM_TILDES:
feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]] feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK: if settings.NUM_SUBSTACK:
feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]] feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items(): for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count'] count = settings.SUBSTACK[key]['count']
feeds[key] = [(x, key) for x in publication.feed()[:count]] feed += [(x, key) for x in publication.feed()[:count]]
for key, sites in categories.items(): for key, sites in categories.items():
count = settings.CATEGORY[key].get('count') or 0 count = settings.CATEGORY[key].get('count') or 0
excludes = settings.CATEGORY[key].get('excludes') excludes = settings.CATEGORY[key].get('excludes')
tz = settings.CATEGORY[key].get('tz') tz = settings.CATEGORY[key].get('tz')
feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]] feed += [(x, key) for x in sites.feed(excludes)[:count]]
for key, sites in sitemaps.items(): for key, sites in sitemaps.items():
count = settings.SITEMAP[key].get('count') or 0 count = settings.SITEMAP[key].get('count') or 0
excludes = settings.SITEMAP[key].get('excludes') excludes = settings.SITEMAP[key].get('excludes')
feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]] feed += [(x, key) for x in sites.feed(excludes)[:count]]
values = feeds.values()
feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
feed = list(filter(None, feed))
return feed return feed
def get_article(url): def get_article(url):
@ -124,7 +124,7 @@ def update_story(story, is_manual=False):
logging.info('Story not ready yet') logging.info('Story not ready yet')
return False return False
if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time(): if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time():
logging.info('Story too old, removing') logging.info('Story too old, removing')
return False return False

View File

@ -163,8 +163,6 @@ def get_sitemap_date(a):
return a.find('lastmod').text return a.find('lastmod').text
if a.find('news:publication_date'): if a.find('news:publication_date'):
return a.find('news:publication_date').text return a.find('news:publication_date').text
if a.find('ns2:publication_date'):
return a.find('ns2:publication_date').text
return '' return ''
class Sitemap(_Base): class Sitemap(_Base):

View File

@ -15,7 +15,6 @@ import traceback
import time import time
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
import settings
import database import database
import search import search
import feed import feed
@ -28,6 +27,9 @@ from flask_cors import CORS
database.init() database.init()
search.init() search.init()
FEED_LENGTH = 75
news_index = 0
def new_id(): def new_id():
nid = gen_rand_id() nid = gen_rand_id()
while database.get_story(nid): while database.get_story(nid):
@ -40,7 +42,7 @@ cors = CORS(flask_app)
@flask_app.route('/api') @flask_app.route('/api')
def api(): def api():
stories = database.get_stories(settings.MAX_STORY_AGE) stories = database.get_stories(FEED_LENGTH)
res = Response(json.dumps({"stories": stories})) res = Response(json.dumps({"stories": stories}))
res.headers['content-type'] = 'application/json' res.headers['content-type'] = 'application/json'
return res return res
@ -99,9 +101,7 @@ def submit():
def story(sid): def story(sid):
story = database.get_story(sid) story = database.get_story(sid)
if story: if story:
related = database.get_stories_by_url(story.meta['url']) res = Response(json.dumps({"story": story.data}))
related = [r.meta for r in related]
res = Response(json.dumps({"story": story.data, "related": related}))
res.headers['content-type'] = 'application/json' res.headers['content-type'] = 'application/json'
return res return res
else: else:
@ -144,49 +144,51 @@ def static_story(sid):
http_server = WSGIServer(('', 33842), flask_app) http_server = WSGIServer(('', 33842), flask_app)
def _add_new_refs():
for ref, source in feed.get_list():
if database.get_story_by_ref(ref):
continue
try:
nid = new_id()
database.put_ref(ref, nid, source)
logging.info('Added ref ' + ref)
except database.IntegrityError:
continue
def _update_current_story(item):
try:
story = database.get_story(item['sid']).data
except AttributeError:
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
logging.info('Updating story: {}'.format(str(story['ref'])))
valid = feed.update_story(story)
if valid:
database.put_story(story)
search.put_story(story)
else:
database.del_ref(item['ref'])
logging.info('Removed ref {}'.format(item['ref']))
def feed_thread(): def feed_thread():
ref_list = [] global news_index
try: try:
while True: while True:
# onboard new stories # onboard new stories
if not len(ref_list): if news_index == 0:
_add_new_refs() for ref, source in feed.list():
ref_list = database.get_reflist() if database.get_story_by_ref(ref):
continue
try:
nid = new_id()
database.put_ref(ref, nid, source)
logging.info('Added ref ' + ref)
except database.IntegrityError:
continue
ref_list = database.get_reflist(FEED_LENGTH)
# update current stories # update current stories
if len(ref_list): if news_index < len(ref_list):
item = ref_list.pop(0) item = ref_list[news_index]
_update_current_story(item)
try:
story = database.get_story(item['sid']).data
except AttributeError:
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
valid = feed.update_story(story)
if valid:
database.put_story(story)
search.put_story(story)
else:
database.del_ref(item['ref'])
logging.info('Removed ref {}'.format(item['ref']))
else:
logging.info('Skipping index: ' + str(news_index))
gevent.sleep(6) gevent.sleep(6)
news_index += 1
if news_index == FEED_LENGTH: news_index = 0
except KeyboardInterrupt: except KeyboardInterrupt:
logging.info('Ending feed thread...') logging.info('Ending feed thread...')
except ValueError as e: except ValueError as e:

View File

@ -1,8 +1,6 @@
# QotNews settings # QotNews settings
# edit this file and save it as settings.py # edit this file and save it as settings.py
MAX_STORY_AGE = 3*24*60*60
# Feed Lengths # Feed Lengths
# Number of top items from each site to pull # Number of top items from each site to pull
# set to 0 to disable that site # set to 0 to disable that site

View File

@ -72,6 +72,5 @@ export const logos = {
stuff: "", stuff: "",
substack: "", substack: "",
"the bulletin": "", "the bulletin": "",
tvnz: "",
webworm: "", webworm: "",
}; };