Improve logging, send tweets to nitter.net

Remove outline API
Include option to disable readerserver
Include option to disable search
2022-03-05 23:48:46 +00:00 · 2022-03-05 22:05:29 +00:00 · 2022-03-05 22:04:25 +00:00 · 2022-03-05 21:58:35 +00:00
9 changed files with 55 additions and 40 deletions
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -10,9 +10,6 @@ from bs4 import BeautifulSoup
 import settings
 from feeds import hackernews, reddit, tildes, manual, lobsters

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-READ_API = 'http://127.0.0.1:33843'
-
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
 TWO_DAYS = 60*60*24*2

@@ -33,29 +30,16 @@ def list():
    return feed

 def get_article(url):
-    try:
-        params = {'source_url': url}
-        headers = {'Referer': 'https://outline.com/'}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return ''
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        html = r.json()['data']['html']
-        if 'URL is not supported by Outline' in html:
-            raise Exception('URL not supported by Outline')
-        return html
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
+    if not settings.READER_URL:
+        logging.info('Readerserver not configured, aborting.')
+        return ''

-    logging.info('Trying our server instead...')
+    if url.startswith('https://twitter.com'):
+        logging.info('Replacing twitter.com url with nitter.net')
+        url = url.replace('twitter.com', 'nitter.net')

    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=20)
+        r = requests.post(settings.READER_URL, data=dict(url=url), timeout=20)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
@@ -102,7 +86,7 @@ def update_story(story, is_manual=False):
        return False

    if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
-        logging.info('Story too old, removing')
+        logging.info('Story too old, removing. Date: {}'.format(story['date']))
        return False

    if story.get('url', '') and not story.get('text', ''):
--- a/apiserver/feeds/hackernews.py
+++ b/apiserver/feeds/hackernews.py
@@ -61,11 +61,15 @@ def comment_count(i):

 def story(ref):
    r = api(API_ITEM, ref)
-    if not r: return False
+    if not r:
+        logging.info('Bad Hackernews API response.')
+        return False

    if 'deleted' in r:
+        logging.info('Story was deleted.')
        return False
    elif r.get('type', '') != 'story':
+        logging.info('Type "{}" is not "story".'.format(r.get('type', '')))
        return False

    s = {}
--- a/apiserver/feeds/lobsters.py
+++ b/apiserver/feeds/lobsters.py
@@ -81,7 +81,9 @@ def iter_comments(flat_comments):

 def story(ref):
    r = api(API_ITEM, ref)
-    if not r: return False
+    if not r:
+        logging.info('Bad Lobsters API response.')
+        return False

    s = {}
    try:
--- a/apiserver/feeds/manual.py
+++ b/apiserver/feeds/manual.py
@@ -27,7 +27,9 @@ def api(route):

 def story(ref):
    html = api(ref)
-    if not html: return False
+    if not html:
+        logging.info('Bad http GET response.')
+        return False

    soup = BeautifulSoup(html, features='html.parser')

--- a/apiserver/feeds/reddit.py
+++ b/apiserver/feeds/reddit.py
@@ -59,7 +59,9 @@ def comment(i):
 def story(ref):
    try:
        r = reddit.submission(ref)
-        if not r: return False
+        if not r:
+            logging.info('Bad Reddit API response.')
+            return False

        s = {}
        s['author'] = r.author.name if r.author else '[Deleted]'
@@ -74,6 +76,7 @@ def story(ref):
        s['num_comments'] = r.num_comments

        if s['score'] < 25 and s['num_comments'] < 10:
+            logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
            return False

        if r.selftext:
--- a/apiserver/feeds/tildes.py
+++ b/apiserver/feeds/tildes.py
@@ -71,11 +71,15 @@ def story(ref):
        html = api(SITE_LINK(group_lookup[ref], ref))
    else:
        html = api(API_ITEM(ref))
-    if not html: return False
+    if not html:
+        logging.info('Bad Tildes API response.')
+        return False

    soup = BeautifulSoup(html, features='html.parser')
    a = soup.find('article', class_='topic-full')
-    if a is None: return False
+    if a is None:
+        logging.info('Tildes <article> element not found.')
+        return False

    h = a.find('header')
    lu = h.find('a', class_='link-user')
@@ -83,6 +87,7 @@ def story(ref):
    error = a.find('div', class_='text-error')
    if error:
        if 'deleted' in error.string or 'removed' in error.string:
+            logging.info('Article was deleted or removed.')
            return False

    s = {}
@@ -103,6 +108,7 @@ def story(ref):
    s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0

    if s['score'] < 8 and s['num_comments'] < 6:
+        logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
        return False

    td = a.find('div', class_='topic-full-text')
--- a/apiserver/search.py
+++ b/apiserver/search.py
@@ -4,12 +4,13 @@ logging.basicConfig(
        level=logging.DEBUG)

 import requests
+import settings

-MEILI_URL = 'http://127.0.0.1:7700/'
+SEARCH_ENABLED = bool(settings.MEILI_URL)

 def meili_api(method, route, json=None, params=None):
    try:
-        r = method(MEILI_URL + route, json=json, params=params, timeout=4)
+        r = method(settings.MEILI_URL + route, json=json, params=params, timeout=4)
        if r.status_code > 299:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
@@ -35,15 +36,20 @@ def update_attributes():
    return r

 def init():
+    if not SEARCH_ENABLED:
+        logging.info('Search is not enabled, skipping init.')
+        return
    print(create_index())
    update_rankings()
    update_attributes()

 def put_story(story):
+    if not SEARCH_ENABLED: return
    to_add = dict(title=story['title'], id=story['id'], date=story['date'])
    return meili_api(requests.post, 'indexes/qotnews/documents', [to_add])

 def search(q):
+    if not SEARCH_ENABLED: return []
    params = dict(q=q, limit=250)
    r = meili_api(requests.get, 'indexes/qotnews/search', params=params)
    return r['hits']
--- a/apiserver/server.py
+++ b/apiserver/server.py
@@ -169,9 +169,10 @@ def feed_thread():
                        continue
                    try:
                        nid = new_id()
+                        logging.info('Adding ref: {}, id: {}, source: {}'.format(ref, nid, source))
                        database.put_ref(ref, nid, source)
-                        logging.info('Added ref ' + ref)
                    except database.IntegrityError:
+                        logging.info('Already have ID / ref, skipping.')
                        continue

            ref_list = database.get_reflist(FEED_LENGTH)
@@ -186,7 +187,7 @@ def feed_thread():
                except AttributeError:
                    story = dict(id=item['sid'], ref=item['ref'], source=item['source'])

-                logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
+                logging.info('Updating {} story: {}, index: {}'.format(story['source'], story['ref'], news_index))

                valid = feed.update_story(story)
                if valid:
@@ -209,10 +210,10 @@ def feed_thread():
        logging.critical('feed_thread error: {} {}'.format(e.__class__.__name__, e))
        http_server.stop()

-print('Starting Feed thread...')
+logging.info('Starting Feed thread...')
 gevent.spawn(feed_thread)

-print('Starting HTTP thread...')
+logging.info('Starting HTTP thread...')
 try:
    http_server.serve_forever()
 except KeyboardInterrupt:
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@@ -6,9 +6,18 @@
 # set to 0 to disable that site
 NUM_HACKERNEWS = 15
 NUM_LOBSTERS = 10
-NUM_REDDIT = 10
+NUM_REDDIT = 15
 NUM_TILDES = 5

+# Meilisearch server URL
+# Leave blank if not using search
+#MEILI_URL = 'http://127.0.0.1:7700/'
+MEILI_URL = ''
+
+# Readerserver URL
+# Leave blank if not using, but that defeats the whole point
+READER_URL = 'http://127.0.0.1:33843/'
+
 # Reddit account info
 # leave blank if not using Reddit
 REDDIT_CLIENT_ID = ''
@@ -25,9 +34,7 @@ SUBREDDITS = [
    'PhilosophyofScience',
    'StateOfTheUnion',
    'TheAgora',
-    'TrueFilm',
    'TrueReddit',
-    'UniversityofReddit',
    'culturalstudies',
    'hardscience',
    'indepthsports',
@@ -37,6 +44,6 @@ SUBREDDITS = [
    'resilientcommunities',
    'worldevents',
    'StallmanWasRight',
-    'DarkFuturology',
    'EverythingScience',
+    'longevity',
 ]
Author	SHA1	Message	Date
Tanner Collin	a25457254f	Improve logging, send tweets to nitter.net	2022-03-05 23:48:46 +00:00
Tanner Collin	a693ea5342	Remove outline API	2022-03-05 22:05:29 +00:00
Tanner Collin	7386e1d8b0	Include option to disable readerserver	2022-03-05 22:04:25 +00:00
Tanner Collin	f8e8597e3a	Include option to disable search	2022-03-05 21:58:35 +00:00