Ignore dead and political stories

Fix Better HN api content extraction
Add Better HN as an API backup
2025-05-27 18:47:17 +00:00 · 2025-02-01 22:39:13 +00:00 · 2025-02-01 21:42:06 +00:00 · 2025-02-01 20:31:35 +00:00 · 2024-03-16 20:41:24 +00:00 · 2024-03-08 03:08:18 +00:00
16 changed files with 250 additions and 49 deletions
--- a/apiserver/database.py
+++ b/apiserver/database.py
@@ -5,7 +5,7 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError

-engine = create_engine('sqlite:///data/qotnews.sqlite', connect_args={'timeout': 180})
+engine = create_engine('sqlite:///data/qotnews.sqlite', connect_args={'timeout': 360})
 Session = sessionmaker(bind=engine)

 Base = declarative_base()
@@ -118,4 +118,5 @@ def get_story_list():
 if __name__ == '__main__':
    init()

-    print(get_story_by_ref('hgi3sy'))
+    #print(get_story_by_ref('hgi3sy'))
+    print(len(get_reflist(99999)))
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -9,6 +9,7 @@ from bs4 import BeautifulSoup

 import settings
 from feeds import hackernews, reddit, tildes, manual, lobsters
+import utils

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
 TWO_DAYS = 60*60*24*2
@@ -68,6 +69,7 @@ def get_content_type(url):
 def update_story(story, is_manual=False):
    res = {}

+    try:
        if story['source'] == 'hackernews':
            res = hackernews.story(story['ref'])
        elif story['source'] == 'lobsters':
@@ -78,6 +80,10 @@ def update_story(story, is_manual=False):
            res = tildes.story(story['ref'])
        elif story['source'] == 'manual':
            res = manual.story(story['ref'])
+    except BaseException as e:
+        utils.alert_tanner('Problem updating {} story, ref {}: {}'.format(story['source'], story['ref'], str(e)))
+        logging.exception(e)
+        return False

    if res:
        story.update(res) # join dicts
@@ -100,6 +106,12 @@ def update_story(story, is_manual=False):
            logging.info(story['url'])
            return False

+        if 'trump' in story['title'].lower() or 'musk' in story['title'].lower():
+            logging.info('Trump / Musk story, skipping')
+            logging.info(story['url'])
+            return False
+
+
        logging.info('Getting article ' + story['url'])
        story['text'] = get_article(story['url'])
        if not story['text']: return False
--- a/apiserver/feeds/hackernews.py
+++ b/apiserver/feeds/hackernews.py
@@ -12,7 +12,8 @@ import requests
 from utils import clean

 API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json'
-API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x)
+ALG_API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x)
+BHN_API_ITEM = lambda x : 'https://api.hnpwa.com/v0/item/{}.json'.format(x)

 SITE_LINK = lambda x : 'https://news.ycombinator.com/item?id={}'.format(x)
 SITE_AUTHOR_LINK = lambda x : 'https://news.ycombinator.com/user?id={}'.format(x)
@@ -42,7 +43,7 @@ def api(route, ref=None):
 def feed():
    return [str(x) for x in api(API_TOPSTORIES) or []]

-def comment(i):
+def alg_comment(i):
    if 'author' not in i:
        return False

@@ -51,19 +52,19 @@ def comment(i):
    c['score'] = i.get('points', 0)
    c['date'] = i.get('created_at_i', 0)
    c['text'] = clean(i.get('text', '') or '')
-    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = [alg_comment(j) for j in i['children']]
    c['comments'] = list(filter(bool, c['comments']))
    return c

-def comment_count(i):
+def alg_comment_count(i):
    alive = 1 if i['author'] else 0
-    return sum([comment_count(c) for c in i['comments']]) + alive
+    return sum([alg_comment_count(c) for c in i['comments']]) + alive

-def story(ref):
-    r = api(API_ITEM, ref)
+def alg_story(ref):
+    r = api(ALG_API_ITEM, ref)
    if not r:
-        logging.info('Bad Hackernews API response.')
-        return False
+        logging.info('Bad Algolia Hackernews API response.')
+        return None

    if 'deleted' in r:
        logging.info('Story was deleted.')
@@ -80,17 +81,85 @@ def story(ref):
    s['title'] = r.get('title', '')
    s['link'] = SITE_LINK(ref)
    s['url'] = r.get('url', '')
-    s['comments'] = [comment(i) for i in r['children']]
+    s['comments'] = [alg_comment(i) for i in r['children']]
    s['comments'] = list(filter(bool, s['comments']))
-    s['num_comments'] = comment_count(s) - 1
+    s['num_comments'] = alg_comment_count(s) - 1

    if 'text' in r and r['text']:
        s['text'] = clean(r['text'] or '')

    return s

+def bhn_comment(i):
+    if 'user' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('user', '')
+    c['score'] = 0   # Not present?
+    c['date'] = i.get('time', 0)
+    c['text'] = clean(i.get('content', '') or '')
+    c['comments'] = [bhn_comment(j) for j in i['comments']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def bhn_story(ref):
+    r = api(BHN_API_ITEM, ref)
+    if not r:
+        logging.info('Bad BetterHN Hackernews API response.')
+        return None
+
+    if 'deleted' in r:   # TODO: verify
+        logging.info('Story was deleted.')
+        return False
+    elif r.get('dead', False):
+        logging.info('Story was deleted.')
+        return False
+    elif r.get('type', '') != 'link':
+        logging.info('Type "{}" is not "link".'.format(r.get('type', '')))
+        return False
+
+    s = {}
+    s['author'] = r.get('user', '')
+    s['author_link'] = SITE_AUTHOR_LINK(r.get('user', ''))
+    s['score'] = r.get('points', 0)
+    s['date'] = r.get('time', 0)
+    s['title'] = r.get('title', '')
+    s['link'] = SITE_LINK(ref)
+    s['url'] = r.get('url', '')
+    if s['url'].startswith('item'):
+        s['url'] = SITE_LINK(ref)
+    s['comments'] = [bhn_comment(i) for i in r['comments']]
+    s['comments'] = list(filter(bool, s['comments']))
+    s['num_comments'] = r.get('comments_count', 0)
+
+    if 'content' in r and r['content']:
+        s['text'] = clean(r['content'] or '')
+
+    return s
+
+def story(ref):
+    s = alg_story(ref)
+    if s is None:
+        s = bhn_story(ref)
+    if not s:
+        return False
+
+
+    if s['score'] < 25 and s['num_comments'] < 10:
+        logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
+        return False
+
+    return s
+
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
    print(feed())
    #print(story(20763961))
    #print(story(20802050))
+
+    #print(story(42899834))   # type "job"
+    #print(story(42900076))   # Ask HN
+    #print(story(42898201))   # Show HN
+    #print(story(42899703))   # normal
+    print(story(42902678))   # bad title?
--- a/apiserver/feeds/lobsters.py
+++ b/apiserver/feeds/lobsters.py
@@ -50,7 +50,7 @@ def unix(date_str):
 def make_comment(i):
    c = {}
    try:
-        c['author'] = i['commenting_user']['username']
+        c['author'] = i['commenting_user']
    except KeyError:
        c['author'] = ''
    c['score'] = i.get('score', 0)
@@ -67,13 +67,13 @@ def iter_comments(flat_comments):
    parent_stack = []
    for comment in flat_comments:
        c = make_comment(comment)
-        indent = comment['indent_level']
+        indent = comment['depth']

-        if indent == 1:
+        if indent == 0:
            nested_comments.append(c)
            parent_stack = [c]
        else:
-            parent_stack = parent_stack[:indent-1]
+            parent_stack = parent_stack[:indent]
            p = parent_stack[-1]
            p['comments'].append(c)
            parent_stack.append(c)
@@ -87,7 +87,7 @@ def story(ref):

    s = {}
    try:
-        s['author'] = r['submitter_user']['username']
+        s['author'] = r['submitter_user']
        s['author_link'] = SITE_AUTHOR_LINK(s['author'])
    except KeyError:
        s['author'] = ''
@@ -103,6 +103,10 @@ def story(ref):
    s['comments'] = iter_comments(r['comments'])
    s['num_comments'] = r['comment_count']

+    if s['score'] < 15 and s['num_comments'] < 10:
+        logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
+        return False
+
    if 'description' in r and r['description']:
        s['text'] = clean(r['description'] or '')

@@ -112,5 +116,5 @@ def story(ref):
 if __name__ == '__main__':
    #print(feed())
    import json
-    print(json.dumps(story('fzvd1v')))
-    #print(story(20802050))
+    print(json.dumps(story('fzvd1v'), indent=4))
+    #print(json.dumps(story('ixyv5u'), indent=4))
--- a/apiserver/feeds/reddit.py
+++ b/apiserver/feeds/reddit.py
@@ -32,10 +32,7 @@ def feed():
        return [x.id for x in reddit.subreddit(subs).hot()]
    except KeyboardInterrupt:
        raise
-    except PRAWException as e:
-        logging.critical('Problem hitting reddit API: {}'.format(str(e)))
-        return []
-    except PrawcoreException as e:
+    except BaseException as e:
        logging.critical('Problem hitting reddit API: {}'.format(str(e)))
        return []

--- a/apiserver/feeds/tildes.py
+++ b/apiserver/feeds/tildes.py
@@ -107,7 +107,20 @@ def story(ref):
    ch = a.find('header', class_='topic-comments-header')
    s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0

-    if s['score'] < 8 and s['num_comments'] < 6:
+    if s['group'].split('.')[0] not in [
+        '~arts',
+        '~comp',
+        '~creative',
+        '~design',
+        '~engineering',
+        '~finance',
+        '~science',
+        '~tech',
+    ]:
+        logging.info('Group ({}) not in whitelist.'.format(s['group']))
+        return False
+
+    if s['score'] < 15 and s['num_comments'] < 10:
        logging.info('Score ({}) or num comments ({}) below threshold.'.format(s['score'], s['num_comments']))
        return False

--- a/apiserver/scripts/delete-story.py
+++ b/apiserver/scripts/delete-story.py
@@ -1,6 +1,8 @@
 import database
 import search
 import sys
+import settings
+import logging

 import json
 import requests
@@ -21,7 +23,7 @@ def database_del_story(sid):

 def search_del_story(sid):
    try:
-        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/'+sid, timeout=2)
+        r = requests.delete(settings.MEILI_URL + 'indexes/qotnews/documents/'+sid, timeout=2)
        if r.status_code != 202:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
--- a/apiserver/scripts/fix-stories.py
+++ b/apiserver/scripts/fix-stories.py
@@ -1,3 +1,4 @@
+import time
 import json
 import logging

--- a/apiserver/scripts/reindex.py
+++ b/apiserver/scripts/reindex.py
--- a/apiserver/scripts/tests.py
+++ b/apiserver/scripts/tests.py
--- a/apiserver/utils.py
+++ b/apiserver/utils.py
@@ -8,6 +8,14 @@ import string

 from bleach.sanitizer import Cleaner

+def alert_tanner(message):
+    try:
+        logging.info('Alerting Tanner: ' + message)
+        params = dict(qotnews=message)
+        requests.get('https://tbot.tannercollin.com/message', params=params, timeout=4)
+    except BaseException as e:
+        logging.error('Problem alerting Tanner: ' + str(e))
+
 def gen_rand_id():
    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))

--- a/readerserver/yarn.lock
+++ b/readerserver/yarn.lock
@@ -708,8 +708,8 @@ raw-body@2.4.3:
    unpipe "1.0.0"

 "readability@https://github.com/mozilla/readability":
-  version "0.4.2"
-  resolved "https://github.com/mozilla/readability#1d2cb030b32e753cc4b7c4ce8b64c3ce4dc1b2ff"
+  version "0.5.0"
+  resolved "https://github.com/mozilla/readability#39a5c5409fb653858b1832141895b882b9092b47"

 request-promise-core@1.1.4:
  version "1.1.4"
--- a/webclient/src/App.js
+++ b/webclient/src/App.js
@@ -3,6 +3,7 @@ import { BrowserRouter as Router, Route, Link, Switch } from 'react-router-dom';
 import localForage from 'localforage';
 import './Style-light.css';
 import './Style-dark.css';
+import './Style-black.css';
 import './Style-red.css';
 import './fonts/Fonts.css';
 import { BackwardDot, ForwardDot } from './utils.js';
@@ -39,6 +40,11 @@ class App extends React.Component {
 		localStorage.setItem('theme', 'dark');
 	}

+	black() {
+		this.setState({ theme: 'black' });
+		localStorage.setItem('theme', 'black');
+	}
+
 	red() {
 		this.setState({ theme: 'red' });
 		localStorage.setItem('theme', 'red');
@@ -72,7 +78,21 @@ class App extends React.Component {

 	render() {
 		const theme = this.state.theme;
-		document.body.style.backgroundColor = theme ? '#000' : '#eeeeee';
+
+		if (theme === 'dark') {
+			document.body.style.backgroundColor = '#1a1a1a';
+		} else if (theme === 'black') {
+			document.body.style.backgroundColor = '#000';
+		} else if (theme === 'red') {
+			document.body.style.backgroundColor = '#000';
+		} else {
+			document.body.style.backgroundColor = '#eeeeee';
+		}
+
+		const fullScreenAvailable = document.fullscreenEnabled ||
+			document.mozFullscreenEnabled ||
+			document.webkitFullscreenEnabled ||
+			document.msFullscreenEnabled;

 		return (
 			<div className={theme}>
@@ -81,17 +101,19 @@ class App extends React.Component {
 						<p>
 							<Link to='/'>QotNews</Link>

-							<span className='theme'><a href='#' onClick={() => this.light()}>Light</a> - <a href='#' onClick={() => this.dark()}>Dark</a> - <a href='#' onClick={() => this.red()}>Red</a></span>
+							<span className='theme'><a href='#' onClick={() => this.light()}>Light</a> - <a href='#' onClick={() => this.dark()}>Dark</a> - <a href='#' onClick={() => this.black()}>Black</a> - <a href='#' onClick={() => this.red()}>Red</a></span>
 							<br />
 							<span className='slogan'>Hacker News, Reddit, Lobsters, and Tildes articles rendered in reader mode.</span>
 						</p>
 						<Route path='/(|search)' component={Search} />
 						<Route path='/(|search)' component={Submit} />
+						{fullScreenAvailable &&
 							<Route path='/(|search)' render={() => !document.fullscreenElement ?
 								<button className='fullscreen' onClick={() => this.goFullScreen()}>Enter Fullscreen</button>
 							:
 								<button className='fullscreen' onClick={() => this.exitFullScreen()}>Exit Fullscreen</button>
 							} />
+						}
 					</div>

 					<Route path='/' exact render={(props) => <Feed {...props} updateCache={this.updateCache} />} />
--- a/webclient/src/Style-black.css
+++ b/webclient/src/Style-black.css
@@ -0,0 +1,68 @@
+.black {
+	color: #ddd;
+}
+
+.black a {
+	color: #ddd;
+}
+
+.black input {
+	color: #ddd;
+	border: 1px solid #828282;
+}
+
+.black button {
+	background-color: #444444;
+	border-color: #bbb;
+	color: #ddd;
+}
+
+.black .item {
+	color: #828282;
+}
+
+.black .item .source-logo {
+	filter: grayscale(1);
+}
+
+.black .item a {
+	color: #828282;
+}
+
+.black .item a.link {
+	color: #ddd;
+}
+.black .item a.link:visited {
+	color: #828282;
+}
+
+.black .item .info a.hot {
+	color: #cccccc;
+}
+
+.black .article a {
+	border-bottom: 1px solid #aaaaaa;
+}
+
+.black .article u {
+	border-bottom: 1px solid #aaaaaa;
+	text-decoration: none;
+}
+
+.black .story-text video,
+.black .story-text img {
+	filter: brightness(50%);
+}
+
+.black .article .info {
+	color: #828282;
+}
+
+.black .article .info a {
+	border-bottom: none;
+	color: #828282;
+}
+
+.black .comment.lined {
+	border-left: 1px solid #444444;
+}
--- a/webclient/src/Style-dark.css
+++ b/webclient/src/Style-dark.css
@@ -11,12 +11,14 @@
 	border: 1px solid #828282;
 }

-.dark .item {
-	color: #828282;
+.dark button {
+	background-color: #444444;
+	border-color: #bbb;
+	color: #ddd;
 }

-.dark .item .source-logo {
-	filter: grayscale(1);
+.dark .item {
+	color: #828282;
 }

 .dark .item a {
@@ -43,6 +45,7 @@
 	text-decoration: none;
 }

+.dark .story-text video,
 .dark .story-text img {
 	filter: brightness(50%);
 }
--- a/webclient/src/Style-red.css
+++ b/webclient/src/Style-red.css
@@ -59,6 +59,7 @@
 	text-decoration: none;
 }

+.red .story-text video,
 .red .story-text img {
 	filter: grayscale(100%) brightness(20%) sepia(100%) hue-rotate(-50deg) saturate(600%) contrast(0.8);
 }
Author	SHA1	Message	Date
Tanner Collin	9ec61ea5bc	Ignore dead and political stories	2025-05-27 18:47:17 +00:00
Tanner Collin	bdc7a6c10d	Fix Better HN api content extraction	2025-02-01 22:39:13 +00:00
Tanner Collin	4858516b01	Add Better HN as an API backup	2025-02-01 21:42:06 +00:00
Tanner Collin	f10e6063fc	Bug fixes	2025-02-01 20:31:35 +00:00
Tanner Collin	249a616531	Alert on story update error	2024-03-16 20:41:24 +00:00
Tanner Collin	ab92bd5441	Adjust score and comment thresholds	2024-03-08 03:08:18 +00:00
Tanner Collin	6b16a768a7	Fix deletion script	2024-03-08 03:08:03 +00:00
Tanner Collin	57de076fec	Increase database timeout	2024-02-27 18:48:56 +00:00
Tanner Collin	074b898508	Fix lobsters comment parsing	2024-02-27 18:47:00 +00:00
Tanner Collin	f049d194ab	Move scripts into own folder	2024-02-27 18:32:29 +00:00
Tanner Collin	c2b9a1cb7a	Update readability	2024-02-27 18:32:19 +00:00
Tanner Collin	4435f49e17	Make "dark" theme grey, add "black" theme	2023-09-13 01:19:47 +00:00
Tanner Collin	494d89ac30	Disable lobsters	2023-09-13 01:02:15 +00:00
Tanner Collin	e79fca6ecc	Replace "indent_level" with "depth" in lobsters API. See: `fe09e5aa31`	2023-08-31 07:35:44 +00:00
Tanner Collin	c65fb69092	Handle Lobsters comment parsing TypeErrors. Too lazy to debug this: | 2023-08-29 12:56:35,111 - root - INFO - Updating lobsters story: yktkwr, index: 55 | Traceback (most recent call last): | File "src/gevent/greenlet.py", line 854, in gevent._gevent_cgreenlet.Greenlet.run | File "/home/tanner/qotnews/apiserver/server.py", line 194, in feed_thread | valid = feed.update_story(story) | File "/home/tanner/qotnews/apiserver/feed.py", line 74, in update_story | res = lobsters.story(story['ref']) | File "/home/tanner/qotnews/apiserver/feeds/lobsters.py", line 103, in story | s['comments'] = iter_comments(r['comments']) | File "/home/tanner/qotnews/apiserver/feeds/lobsters.py", line 76, in iter_comments | parent_stack = parent_stack[:indent-1] | TypeError: unsupported operand type(s) for -: 'NoneType' and 'int' | 2023-08-29T12:56:35Z <Greenlet at 0x7f92ad840ae0: feed_thread> failed with TypeError	2023-08-31 07:30:39 +00:00
Tanner Collin	632d028e4c	Add Tildes group whitelist	2023-07-13 22:54:36 +00:00
Tanner Collin	ea8e9e5a23	Increase again	2023-06-13 17:11:50 +00:00
Tanner Collin	2838ea9b41	Increase Tildes story score requirement	2023-06-11 01:01:31 +00:00
Tanner Collin	f15d108971	Catch all possible Reddit API exceptions	2023-03-15 21:16:37 +00:00
Tanner Collin	f777348af8	Fix darkmode fullscreen button color	2022-08-11 19:36:36 +00:00
Tanner Collin	486404a413	Fix fix-stories bug	2022-08-10 04:06:39 +00:00
Tanner Collin	7c9c07a4cf	Hide fullscreen button if it's not available	2022-08-10 04:05:25 +00:00