Compare commits
4 Commits
55c282ee69
...
a25457254f
Author | SHA1 | Date | |
---|---|---|---|
a25457254f | |||
a693ea5342 | |||
7386e1d8b0 | |||
f8e8597e3a |
|
@ -10,9 +10,6 @@ from bs4 import BeautifulSoup
|
||||||
import settings
|
import settings
|
||||||
from feeds import hackernews, reddit, tildes, manual, lobsters
|
from feeds import hackernews, reddit, tildes, manual, lobsters
|
||||||
|
|
||||||
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
|
|
||||||
READ_API = 'http://127.0.0.1:33843'
|
|
||||||
|
|
||||||
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
|
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
|
||||||
TWO_DAYS = 60*60*24*2
|
TWO_DAYS = 60*60*24*2
|
||||||
|
|
||||||
|
@ -33,29 +30,16 @@ def list():
|
||||||
return feed
|
return feed
|
||||||
|
|
||||||
def get_article(url):
|
def get_article(url):
|
||||||
try:
|
if not settings.READER_URL:
|
||||||
params = {'source_url': url}
|
logging.info('Readerserver not configured, aborting.')
|
||||||
headers = {'Referer': 'https://outline.com/'}
|
|
||||||
r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
|
|
||||||
if r.status_code == 429:
|
|
||||||
logging.info('Rate limited by outline, sleeping 30s and skipping...')
|
|
||||||
time.sleep(30)
|
|
||||||
return ''
|
return ''
|
||||||
if r.status_code != 200:
|
|
||||||
raise Exception('Bad response code ' + str(r.status_code))
|
|
||||||
html = r.json()['data']['html']
|
|
||||||
if 'URL is not supported by Outline' in html:
|
|
||||||
raise Exception('URL not supported by Outline')
|
|
||||||
return html
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
raise
|
|
||||||
except BaseException as e:
|
|
||||||
logging.error('Problem outlining article: {}'.format(str(e)))
|
|
||||||
|
|
||||||
logging.info('Trying our server instead...')
|
if url.startswith('https://twitter.com'):
|
||||||
|
logging.info('Replacing twitter.com url with nitter.net')
|
||||||
|
url = url.replace('twitter.com', 'nitter.net')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = requests.post(READ_API, data=dict(url=url), timeout=20)
|
r = requests.post(settings.READER_URL, data=dict(url=url), timeout=20)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise Exception('Bad response code ' + str(r.status_code))
|
raise Exception('Bad response code ' + str(r.status_code))
|
||||||
return r.text
|
return r.text
|
||||||
|
@ -102,7 +86,7 @@ def update_story(story, is_manual=False):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
|
if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
|
||||||
logging.info('Story too old, removing')
|
logging.info('Story too old, removing. Date: {}'.format(story['date']))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if story.get('url', '') and not story.get('text', ''):
|
if story.get('url', '') and not story.get('text', ''):
|
||||||
|
|
|
@ -61,11 +61,15 @@ def comment_count(i):
|
||||||
|
|
||||||
def story(ref):
|
def story(ref):
|
||||||
r = api(API_ITEM, ref)
|
r = api(API_ITEM, ref)
|
||||||
if not r: return False
|
if not r:
|
||||||
|
logging.info('Bad Hackernews API response.')
|
||||||
|
return False
|
||||||
|
|
||||||
if 'deleted' in r:
|
if 'deleted' in r:
|
||||||
|
logging.info('Story was deleted.')
|
||||||
return False
|
return False
|
||||||
elif r.get('type', '') != 'story':
|
elif r.get('type', '') != 'story':
|
||||||
|
logging.info('Type "{}" is not "story".'.format(r.get('type', '')))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
s = {}
|
s = {}
|
||||||
|
|
|
@ -81,7 +81,9 @@ def iter_comments(flat_comments):
|
||||||
|
|
||||||
def story(ref):
|
def story(ref):
|
||||||
r = api(API_ITEM, ref)
|
r = api(API_ITEM, ref)
|
||||||
if not r: return False
|
if not r:
|
||||||
|
logging.info('Bad Lobsters API response.')
|
||||||
|
return False
|
||||||
|
|
||||||
s = {}
|
s = {}
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -27,7 +27,9 @@ def api(route):
|
||||||
|
|
||||||
def story(ref):
|
def story(ref):
|
||||||
html = api(ref)
|
html = api(ref)
|
||||||
if not html: return False
|
if not html:
|
||||||
|
logging.info('Bad http GET response.')
|
||||||
|
return False
|
||||||
|
|
||||||
soup = BeautifulSoup(html, features='html.parser')
|
soup = BeautifulSoup(html, features='html.parser')
|
||||||
|
|
||||||
|
|
|
@ -59,7 +59,9 @@ def comment(i):
|
||||||
def story(ref):
|
def story(ref):
|
||||||
try:
|
try:
|
||||||
r = reddit.submission(ref)
|
r = reddit.submission(ref)
|
||||||
if not r: return False
|
if not r:
|
||||||
|
logging.info('Bad Reddit API response.')
|
||||||
|
return False
|
||||||
|
|
||||||
s = {}
|
s = {}
|
||||||
s['author'] = r.author.name if r.author else '[Deleted]'
|
s['author'] = r.author.name if r.author else '[Deleted]'
|
||||||
|
@ -74,6 +76,7 @@ def story(ref):
|
||||||
s['num_comments'] = r.num_comments
|
s['num_comments'] = r.num_comments
|
||||||
|
|
||||||
if s['score'] < 25 and s['num_comments'] < 10:
|
if s['score'] < 25 and s['num_comments'] < 10:
|
||||||
|
logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if r.selftext:
|
if r.selftext:
|
||||||
|
|
|
@ -71,11 +71,15 @@ def story(ref):
|
||||||
html = api(SITE_LINK(group_lookup[ref], ref))
|
html = api(SITE_LINK(group_lookup[ref], ref))
|
||||||
else:
|
else:
|
||||||
html = api(API_ITEM(ref))
|
html = api(API_ITEM(ref))
|
||||||
if not html: return False
|
if not html:
|
||||||
|
logging.info('Bad Tildes API response.')
|
||||||
|
return False
|
||||||
|
|
||||||
soup = BeautifulSoup(html, features='html.parser')
|
soup = BeautifulSoup(html, features='html.parser')
|
||||||
a = soup.find('article', class_='topic-full')
|
a = soup.find('article', class_='topic-full')
|
||||||
if a is None: return False
|
if a is None:
|
||||||
|
logging.info('Tildes <article> element not found.')
|
||||||
|
return False
|
||||||
|
|
||||||
h = a.find('header')
|
h = a.find('header')
|
||||||
lu = h.find('a', class_='link-user')
|
lu = h.find('a', class_='link-user')
|
||||||
|
@ -83,6 +87,7 @@ def story(ref):
|
||||||
error = a.find('div', class_='text-error')
|
error = a.find('div', class_='text-error')
|
||||||
if error:
|
if error:
|
||||||
if 'deleted' in error.string or 'removed' in error.string:
|
if 'deleted' in error.string or 'removed' in error.string:
|
||||||
|
logging.info('Article was deleted or removed.')
|
||||||
return False
|
return False
|
||||||
|
|
||||||
s = {}
|
s = {}
|
||||||
|
@ -103,6 +108,7 @@ def story(ref):
|
||||||
s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0
|
s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0
|
||||||
|
|
||||||
if s['score'] < 8 and s['num_comments'] < 6:
|
if s['score'] < 8 and s['num_comments'] < 6:
|
||||||
|
logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
td = a.find('div', class_='topic-full-text')
|
td = a.find('div', class_='topic-full-text')
|
||||||
|
|
|
@ -4,12 +4,13 @@ logging.basicConfig(
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
import settings
|
||||||
|
|
||||||
MEILI_URL = 'http://127.0.0.1:7700/'
|
SEARCH_ENABLED = bool(settings.MEILI_URL)
|
||||||
|
|
||||||
def meili_api(method, route, json=None, params=None):
|
def meili_api(method, route, json=None, params=None):
|
||||||
try:
|
try:
|
||||||
r = method(MEILI_URL + route, json=json, params=params, timeout=4)
|
r = method(settings.MEILI_URL + route, json=json, params=params, timeout=4)
|
||||||
if r.status_code > 299:
|
if r.status_code > 299:
|
||||||
raise Exception('Bad response code ' + str(r.status_code))
|
raise Exception('Bad response code ' + str(r.status_code))
|
||||||
return r.json()
|
return r.json()
|
||||||
|
@ -35,15 +36,20 @@ def update_attributes():
|
||||||
return r
|
return r
|
||||||
|
|
||||||
def init():
|
def init():
|
||||||
|
if not SEARCH_ENABLED:
|
||||||
|
logging.info('Search is not enabled, skipping init.')
|
||||||
|
return
|
||||||
print(create_index())
|
print(create_index())
|
||||||
update_rankings()
|
update_rankings()
|
||||||
update_attributes()
|
update_attributes()
|
||||||
|
|
||||||
def put_story(story):
|
def put_story(story):
|
||||||
|
if not SEARCH_ENABLED: return
|
||||||
to_add = dict(title=story['title'], id=story['id'], date=story['date'])
|
to_add = dict(title=story['title'], id=story['id'], date=story['date'])
|
||||||
return meili_api(requests.post, 'indexes/qotnews/documents', [to_add])
|
return meili_api(requests.post, 'indexes/qotnews/documents', [to_add])
|
||||||
|
|
||||||
def search(q):
|
def search(q):
|
||||||
|
if not SEARCH_ENABLED: return []
|
||||||
params = dict(q=q, limit=250)
|
params = dict(q=q, limit=250)
|
||||||
r = meili_api(requests.get, 'indexes/qotnews/search', params=params)
|
r = meili_api(requests.get, 'indexes/qotnews/search', params=params)
|
||||||
return r['hits']
|
return r['hits']
|
||||||
|
|
|
@ -169,9 +169,10 @@ def feed_thread():
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
nid = new_id()
|
nid = new_id()
|
||||||
|
logging.info('Adding ref: {}, id: {}, source: {}'.format(ref, nid, source))
|
||||||
database.put_ref(ref, nid, source)
|
database.put_ref(ref, nid, source)
|
||||||
logging.info('Added ref ' + ref)
|
|
||||||
except database.IntegrityError:
|
except database.IntegrityError:
|
||||||
|
logging.info('Already have ID / ref, skipping.')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
ref_list = database.get_reflist(FEED_LENGTH)
|
ref_list = database.get_reflist(FEED_LENGTH)
|
||||||
|
@ -186,7 +187,7 @@ def feed_thread():
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
|
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
|
||||||
|
|
||||||
logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
|
logging.info('Updating {} story: {}, index: {}'.format(story['source'], story['ref'], news_index))
|
||||||
|
|
||||||
valid = feed.update_story(story)
|
valid = feed.update_story(story)
|
||||||
if valid:
|
if valid:
|
||||||
|
@ -209,10 +210,10 @@ def feed_thread():
|
||||||
logging.critical('feed_thread error: {} {}'.format(e.__class__.__name__, e))
|
logging.critical('feed_thread error: {} {}'.format(e.__class__.__name__, e))
|
||||||
http_server.stop()
|
http_server.stop()
|
||||||
|
|
||||||
print('Starting Feed thread...')
|
logging.info('Starting Feed thread...')
|
||||||
gevent.spawn(feed_thread)
|
gevent.spawn(feed_thread)
|
||||||
|
|
||||||
print('Starting HTTP thread...')
|
logging.info('Starting HTTP thread...')
|
||||||
try:
|
try:
|
||||||
http_server.serve_forever()
|
http_server.serve_forever()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
|
|
@ -6,9 +6,18 @@
|
||||||
# set to 0 to disable that site
|
# set to 0 to disable that site
|
||||||
NUM_HACKERNEWS = 15
|
NUM_HACKERNEWS = 15
|
||||||
NUM_LOBSTERS = 10
|
NUM_LOBSTERS = 10
|
||||||
NUM_REDDIT = 10
|
NUM_REDDIT = 15
|
||||||
NUM_TILDES = 5
|
NUM_TILDES = 5
|
||||||
|
|
||||||
|
# Meilisearch server URL
|
||||||
|
# Leave blank if not using search
|
||||||
|
#MEILI_URL = 'http://127.0.0.1:7700/'
|
||||||
|
MEILI_URL = ''
|
||||||
|
|
||||||
|
# Readerserver URL
|
||||||
|
# Leave blank if not using, but that defeats the whole point
|
||||||
|
READER_URL = 'http://127.0.0.1:33843/'
|
||||||
|
|
||||||
# Reddit account info
|
# Reddit account info
|
||||||
# leave blank if not using Reddit
|
# leave blank if not using Reddit
|
||||||
REDDIT_CLIENT_ID = ''
|
REDDIT_CLIENT_ID = ''
|
||||||
|
@ -25,9 +34,7 @@ SUBREDDITS = [
|
||||||
'PhilosophyofScience',
|
'PhilosophyofScience',
|
||||||
'StateOfTheUnion',
|
'StateOfTheUnion',
|
||||||
'TheAgora',
|
'TheAgora',
|
||||||
'TrueFilm',
|
|
||||||
'TrueReddit',
|
'TrueReddit',
|
||||||
'UniversityofReddit',
|
|
||||||
'culturalstudies',
|
'culturalstudies',
|
||||||
'hardscience',
|
'hardscience',
|
||||||
'indepthsports',
|
'indepthsports',
|
||||||
|
@ -37,6 +44,6 @@ SUBREDDITS = [
|
||||||
'resilientcommunities',
|
'resilientcommunities',
|
||||||
'worldevents',
|
'worldevents',
|
||||||
'StallmanWasRight',
|
'StallmanWasRight',
|
||||||
'DarkFuturology',
|
|
||||||
'EverythingScience',
|
'EverythingScience',
|
||||||
|
'longevity',
|
||||||
]
|
]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user