Compare commits

...

4 Commits

  1. 32
      apiserver/feed.py
  2. 6
      apiserver/feeds/hackernews.py
  3. 4
      apiserver/feeds/lobsters.py
  4. 4
      apiserver/feeds/manual.py
  5. 5
      apiserver/feeds/reddit.py
  6. 10
      apiserver/feeds/tildes.py
  7. 10
      apiserver/search.py
  8. 9
      apiserver/server.py
  9. 15
      apiserver/settings.py.example

@ -10,9 +10,6 @@ from bs4 import BeautifulSoup
import settings import settings
from feeds import hackernews, reddit, tildes, manual, lobsters from feeds import hackernews, reddit, tildes, manual, lobsters
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
READ_API = 'http://127.0.0.1:33843'
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
TWO_DAYS = 60*60*24*2 TWO_DAYS = 60*60*24*2
@ -33,29 +30,16 @@ def list():
return feed return feed
def get_article(url): def get_article(url):
try: if not settings.READER_URL:
params = {'source_url': url} logging.info('Readerserver not configured, aborting.')
headers = {'Referer': 'https://outline.com/'} return ''
r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
if r.status_code == 429:
logging.info('Rate limited by outline, sleeping 30s and skipping...')
time.sleep(30)
return ''
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
html = r.json()['data']['html']
if 'URL is not supported by Outline' in html:
raise Exception('URL not supported by Outline')
return html
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem outlining article: {}'.format(str(e)))
logging.info('Trying our server instead...') if url.startswith('https://twitter.com'):
logging.info('Replacing twitter.com url with nitter.net')
url = url.replace('twitter.com', 'nitter.net')
try: try:
r = requests.post(READ_API, data=dict(url=url), timeout=20) r = requests.post(settings.READER_URL, data=dict(url=url), timeout=20)
if r.status_code != 200: if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code)) raise Exception('Bad response code ' + str(r.status_code))
return r.text return r.text
@ -102,7 +86,7 @@ def update_story(story, is_manual=False):
return False return False
if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time(): if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
logging.info('Story too old, removing') logging.info('Story too old, removing. Date: {}'.format(story['date']))
return False return False
if story.get('url', '') and not story.get('text', ''): if story.get('url', '') and not story.get('text', ''):

@ -61,11 +61,15 @@ def comment_count(i):
def story(ref): def story(ref):
r = api(API_ITEM, ref) r = api(API_ITEM, ref)
if not r: return False if not r:
logging.info('Bad Hackernews API response.')
return False
if 'deleted' in r: if 'deleted' in r:
logging.info('Story was deleted.')
return False return False
elif r.get('type', '') != 'story': elif r.get('type', '') != 'story':
logging.info('Type "{}" is not "story".'.format(r.get('type', '')))
return False return False
s = {} s = {}

@ -81,7 +81,9 @@ def iter_comments(flat_comments):
def story(ref): def story(ref):
r = api(API_ITEM, ref) r = api(API_ITEM, ref)
if not r: return False if not r:
logging.info('Bad Lobsters API response.')
return False
s = {} s = {}
try: try:

@ -27,7 +27,9 @@ def api(route):
def story(ref): def story(ref):
html = api(ref) html = api(ref)
if not html: return False if not html:
logging.info('Bad http GET response.')
return False
soup = BeautifulSoup(html, features='html.parser') soup = BeautifulSoup(html, features='html.parser')

@ -59,7 +59,9 @@ def comment(i):
def story(ref): def story(ref):
try: try:
r = reddit.submission(ref) r = reddit.submission(ref)
if not r: return False if not r:
logging.info('Bad Reddit API response.')
return False
s = {} s = {}
s['author'] = r.author.name if r.author else '[Deleted]' s['author'] = r.author.name if r.author else '[Deleted]'
@ -74,6 +76,7 @@ def story(ref):
s['num_comments'] = r.num_comments s['num_comments'] = r.num_comments
if s['score'] < 25 and s['num_comments'] < 10: if s['score'] < 25 and s['num_comments'] < 10:
logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
return False return False
if r.selftext: if r.selftext:

@ -71,11 +71,15 @@ def story(ref):
html = api(SITE_LINK(group_lookup[ref], ref)) html = api(SITE_LINK(group_lookup[ref], ref))
else: else:
html = api(API_ITEM(ref)) html = api(API_ITEM(ref))
if not html: return False if not html:
logging.info('Bad Tildes API response.')
return False
soup = BeautifulSoup(html, features='html.parser') soup = BeautifulSoup(html, features='html.parser')
a = soup.find('article', class_='topic-full') a = soup.find('article', class_='topic-full')
if a is None: return False if a is None:
logging.info('Tildes <article> element not found.')
return False
h = a.find('header') h = a.find('header')
lu = h.find('a', class_='link-user') lu = h.find('a', class_='link-user')
@ -83,6 +87,7 @@ def story(ref):
error = a.find('div', class_='text-error') error = a.find('div', class_='text-error')
if error: if error:
if 'deleted' in error.string or 'removed' in error.string: if 'deleted' in error.string or 'removed' in error.string:
logging.info('Article was deleted or removed.')
return False return False
s = {} s = {}
@ -103,6 +108,7 @@ def story(ref):
s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0 s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0
if s['score'] < 8 and s['num_comments'] < 6: if s['score'] < 8 and s['num_comments'] < 6:
logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
return False return False
td = a.find('div', class_='topic-full-text') td = a.find('div', class_='topic-full-text')

@ -4,12 +4,13 @@ logging.basicConfig(
level=logging.DEBUG) level=logging.DEBUG)
import requests import requests
import settings
MEILI_URL = 'http://127.0.0.1:7700/' SEARCH_ENABLED = bool(settings.MEILI_URL)
def meili_api(method, route, json=None, params=None): def meili_api(method, route, json=None, params=None):
try: try:
r = method(MEILI_URL + route, json=json, params=params, timeout=4) r = method(settings.MEILI_URL + route, json=json, params=params, timeout=4)
if r.status_code > 299: if r.status_code > 299:
raise Exception('Bad response code ' + str(r.status_code)) raise Exception('Bad response code ' + str(r.status_code))
return r.json() return r.json()
@ -35,15 +36,20 @@ def update_attributes():
return r return r
def init(): def init():
if not SEARCH_ENABLED:
logging.info('Search is not enabled, skipping init.')
return
print(create_index()) print(create_index())
update_rankings() update_rankings()
update_attributes() update_attributes()
def put_story(story): def put_story(story):
if not SEARCH_ENABLED: return
to_add = dict(title=story['title'], id=story['id'], date=story['date']) to_add = dict(title=story['title'], id=story['id'], date=story['date'])
return meili_api(requests.post, 'indexes/qotnews/documents', [to_add]) return meili_api(requests.post, 'indexes/qotnews/documents', [to_add])
def search(q): def search(q):
if not SEARCH_ENABLED: return []
params = dict(q=q, limit=250) params = dict(q=q, limit=250)
r = meili_api(requests.get, 'indexes/qotnews/search', params=params) r = meili_api(requests.get, 'indexes/qotnews/search', params=params)
return r['hits'] return r['hits']

@ -169,9 +169,10 @@ def feed_thread():
continue continue
try: try:
nid = new_id() nid = new_id()
logging.info('Adding ref: {}, id: {}, source: {}'.format(ref, nid, source))
database.put_ref(ref, nid, source) database.put_ref(ref, nid, source)
logging.info('Added ref ' + ref)
except database.IntegrityError: except database.IntegrityError:
logging.info('Already have ID / ref, skipping.')
continue continue
ref_list = database.get_reflist(FEED_LENGTH) ref_list = database.get_reflist(FEED_LENGTH)
@ -186,7 +187,7 @@ def feed_thread():
except AttributeError: except AttributeError:
story = dict(id=item['sid'], ref=item['ref'], source=item['source']) story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index)) logging.info('Updating {} story: {}, index: {}'.format(story['source'], story['ref'], news_index))
valid = feed.update_story(story) valid = feed.update_story(story)
if valid: if valid:
@ -209,10 +210,10 @@ def feed_thread():
logging.critical('feed_thread error: {} {}'.format(e.__class__.__name__, e)) logging.critical('feed_thread error: {} {}'.format(e.__class__.__name__, e))
http_server.stop() http_server.stop()
print('Starting Feed thread...') logging.info('Starting Feed thread...')
gevent.spawn(feed_thread) gevent.spawn(feed_thread)
print('Starting HTTP thread...') logging.info('Starting HTTP thread...')
try: try:
http_server.serve_forever() http_server.serve_forever()
except KeyboardInterrupt: except KeyboardInterrupt:

@ -6,9 +6,18 @@
# set to 0 to disable that site # set to 0 to disable that site
NUM_HACKERNEWS = 15 NUM_HACKERNEWS = 15
NUM_LOBSTERS = 10 NUM_LOBSTERS = 10
NUM_REDDIT = 10 NUM_REDDIT = 15
NUM_TILDES = 5 NUM_TILDES = 5
# Meilisearch server URL
# Leave blank if not using search
#MEILI_URL = 'http://127.0.0.1:7700/'
MEILI_URL = ''
# Readerserver URL
# Leave blank to disable article fetching (get_article() then logs a notice and returns an empty string)
READER_URL = 'http://127.0.0.1:33843/'
# Reddit account info # Reddit account info
# leave blank if not using Reddit # leave blank if not using Reddit
REDDIT_CLIENT_ID = '' REDDIT_CLIENT_ID = ''
@ -25,9 +34,7 @@ SUBREDDITS = [
'PhilosophyofScience', 'PhilosophyofScience',
'StateOfTheUnion', 'StateOfTheUnion',
'TheAgora', 'TheAgora',
'TrueFilm',
'TrueReddit', 'TrueReddit',
'UniversityofReddit',
'culturalstudies', 'culturalstudies',
'hardscience', 'hardscience',
'indepthsports', 'indepthsports',
@ -37,6 +44,6 @@ SUBREDDITS = [
'resilientcommunities', 'resilientcommunities',
'worldevents', 'worldevents',
'StallmanWasRight', 'StallmanWasRight',
'DarkFuturology',
'EverythingScience', 'EverythingScience',
'longevity',
] ]

Loading…
Cancel
Save