Compare commits

...

109 Commits

Author SHA1 Message Date
Jason Schwarzenberger
2439c113b3 update declutter. 2020-11-24 16:54:21 +13:00
Jason Schwarzenberger
0f5e28136d update declutter. 2020-11-24 11:20:37 +13:00
Jason Schwarzenberger
bb1413b586 sort substack feed by time. 2020-11-24 10:56:38 +13:00
Jason Schwarzenberger
0a27c0da1f update declutter. 2020-11-24 10:42:41 +13:00
Jason Schwarzenberger
fe01ea52e5 get favicons for custom substack publications. 2020-11-24 10:36:31 +13:00
Jason Schwarzenberger
3daae5fa1b change substack time parsing to misc.time 2020-11-23 16:46:54 +13:00
Jason Schwarzenberger
25caee17d6 add related stories to pre-fetch caching. 2020-11-23 16:26:37 +13:00
Jason Schwarzenberger
c1b6349771 namespace the refs for hn and substack. 2020-11-23 16:09:12 +13:00
Jason
54a4c7e55a fix with try-catch 2020-11-23 01:20:40 +00:00
Jason
b12a3570b0 add logging, extend id length 2020-11-21 21:21:31 +00:00
Jason Schwarzenberger
0bfa920654 fix mistake. 2020-11-20 04:29:54 +00:00
Jason Schwarzenberger
9341b4d966 fix mistake. 2020-11-20 04:27:28 +00:00
Jason Schwarzenberger
a2e5faa3b5 fix empty source links. 2020-11-20 17:02:09 +13:00
Jason Schwarzenberger
a86eb98c1a fix hn self posts related discussion. 2020-11-20 13:06:19 +13:00
Jason Schwarzenberger
abf7f0a802 force reader update in update-story.py 2020-11-20 12:21:27 +13:00
Jason Schwarzenberger
d288546d6f update declutter. 2020-11-20 11:51:56 +13:00
Jason Schwarzenberger
cc130942ca update declutter. 2020-11-20 11:48:46 +13:00
Jason Schwarzenberger
f0b14408d4 fix other discussions links. 2020-11-20 09:47:56 +13:00
Jason Schwarzenberger
e1830a589b wip on other discussions ui. 2020-11-19 17:27:00 +13:00
Jason Schwarzenberger
32bc3b906b add update-story.py 2020-11-19 15:06:55 +13:00
Jason Schwarzenberger
f5e65632b8 fix comment date. 2020-11-19 14:27:24 +13:00
Jason Schwarzenberger
1fe524207e stuff comments. 2020-11-19 14:23:01 +13:00
Jason Schwarzenberger
dc3d17b171 update declutter 2020-11-19 12:30:27 +13:00
Jason Schwarzenberger
539350a83d port separation. 2020-11-18 17:21:37 +13:00
Jason Schwarzenberger
2f730c1f52 update declutter. 2020-11-18 15:20:23 +13:00
Jason Schwarzenberger
e0960d59f3 update readme. 2020-11-18 13:26:34 +13:00
Jason Schwarzenberger
f5b38f5c6b remove readerserver, add declutter. 2020-11-18 12:59:35 +13:00
Jason Schwarzenberger
c9da2a078b increase setTimeouts. 2020-11-18 10:06:45 +13:00
Jason Schwarzenberger
78654e0c63 reduce setTimeout. 2020-11-17 16:07:33 +13:00
Jason Schwarzenberger
3b885e4327 renaming things. 2020-11-17 15:54:14 +13:00
Jason Schwarzenberger
55d50a86d8 hmmm 2020-11-17 15:13:38 +13:00
Jason Schwarzenberger
55e7f6bb14 cosmetic filters for newshub. 2020-11-17 15:01:12 +13:00
Jason Schwarzenberger
5668fa5dbc fix mistake. 2020-11-17 12:54:54 +13:00
Jason Schwarzenberger
b771b52501 add regex to get a unique ref from each sitemap/category based article url. 2020-11-17 12:38:28 +13:00
Jason Schwarzenberger
f5c7a658ba cosmetic filters for the spinoff. 2020-11-16 16:49:39 +13:00
Jason Schwarzenberger
f5ccd844da fix import error. 2020-11-16 15:41:09 +13:00
Jason Schwarzenberger
6a91b9402f split categories, sitemap and other crap out of news.py 2020-11-16 15:30:33 +13:00
Jason Schwarzenberger
b80c1a5cb5 extract story list item from Results and Feed. 2020-11-16 13:17:58 +13:00
Jason Schwarzenberger
b23e470317 move reddit thresholds as settings variables. 2020-11-16 10:11:39 +13:00
Jason Schwarzenberger
7420b5ece9 fix microdata multiple authors 2020-11-12 17:33:46 +13:00
Jason Schwarzenberger
64ced635cc fix mistake. 2020-11-12 17:15:29 +13:00
Jason Schwarzenberger
9318627f1b ability to pass in multiple site maps/category urls. 2020-11-12 17:11:51 +13:00
Jason Schwarzenberger
3d0a3f1577 support list based json-ld authors. 2020-11-12 15:08:23 +13:00
Jason Schwarzenberger
587b10c438 recursive sitemaps (sitemap indexes) 2020-11-12 14:56:46 +13:00
Jason
00954c6cac local browser scraper 2020-11-11 09:26:54 +00:00
Jason Schwarzenberger
637bc38476 fix mistake. 2020-11-11 17:21:31 +13:00
Jason Schwarzenberger
164b7e72c4 basically add declutter like capabilities. 2020-11-11 17:16:04 +13:00
Jason Schwarzenberger
3169af3002 hostname from settings. 2020-11-11 09:46:27 +13:00
Jason Schwarzenberger
d588a60930 add source to searchable attributes. 2020-11-11 09:37:54 +13:00
Jason Schwarzenberger
408e2870b2 tzinfo and microdata schema urls. 2020-11-10 16:51:27 +13:00
Jason Schwarzenberger
44b8b36547 add data cast in query. 2020-11-10 15:50:18 +13:00
Jason Schwarzenberger
4f49684194 remove logos from utils.js 2020-11-10 15:38:48 +13:00
Jason Schwarzenberger
1d78b1c592 fix favicon url. 2020-11-10 15:34:21 +13:00
Jason Schwarzenberger
0374794536 Sitemap and Category to get favicon into icon property of story. 2020-11-10 15:22:27 +13:00
Jason Schwarzenberger
943a1cfa4f reader server 2020-11-10 14:56:21 +13:00
Jason Schwarzenberger
9cee370a25 tvnz icon 2020-11-10 14:10:02 +13:00
Jason Schwarzenberger
5efc6ef2d3 add related stories (in api only) 2020-11-10 14:09:56 +13:00
Jason Schwarzenberger
4ec50e20cb feed thread loop. 2020-11-10 10:10:38 +13:00
Jason Schwarzenberger
c1b7877f4b remove limit. 2020-11-09 17:54:50 +13:00
Jason Schwarzenberger
7b8cbfc9b9 try to make feed only determined by the max age. 2020-11-09 17:50:58 +13:00
Jason Schwarzenberger
bfa4108a8e Merge remote-tracking branch 'tanner/master' 2020-11-09 16:08:28 +13:00
Jason Schwarzenberger
0bd0d40a31 use json type in sqlite. 2020-11-09 15:45:10 +13:00
Jason Schwarzenberger
4e04595415 fix search. 2020-11-09 15:44:44 +13:00
Jason
006db2960c change to 3 days 2020-11-09 01:36:51 +00:00
Jason Schwarzenberger
1f063f0dac undo log level change 2020-11-06 11:20:34 +13:00
Jason Schwarzenberger
1658346aa9 fix news.py feed. 2020-11-06 10:37:43 +13:00
Jason Schwarzenberger
2dbc702b40 switch to python-dateutil for parser, reverse sort xml feeds. 2020-11-06 10:02:39 +13:00
Jason Schwarzenberger
1c4764e67d sort sitemap feed by lastmod time. 2020-11-06 09:30:15 +13:00
Jason
ee49d2021e newsroom 2020-11-05 20:28:55 +00:00
Jason
c391c50ab1 use localize 2020-11-05 04:15:31 +00:00
Jason Schwarzenberger
095f0d549a use replace. 2020-11-05 16:57:08 +13:00
Jason Schwarzenberger
c21c71667e fix date issue. 2020-11-05 16:41:15 +13:00
Jason Schwarzenberger
c3a2c91a11 update requirements.txt 2020-11-05 16:33:50 +13:00
Jason Schwarzenberger
0f39446a61 tz aware for use in settings. 2020-11-05 16:30:55 +13:00
Jason Schwarzenberger
351059aab1 fix excludes. 2020-11-05 15:59:13 +13:00
Jason Schwarzenberger
4488e2c292 add an excludes list of substrings for urls in the settings for sitemap/category. 2020-11-05 15:51:59 +13:00
Jason Schwarzenberger
afda5b635c disqus test. 2020-11-05 14:23:51 +13:00
Jason Schwarzenberger
0fc1a44d2b fix issue in substack. 2020-11-04 17:40:29 +13:00
Jason Schwarzenberger
9fff1b9e46 avoid duplicate articles listed on the category page 2020-11-04 17:14:42 +13:00
Jason Schwarzenberger
16b59f6c67 try stop bad pages. 2020-11-04 16:34:31 +13:00
Jason Schwarzenberger
939f4775a7 better settings example. 2020-11-04 15:52:34 +13:00
Jason Schwarzenberger
9bfc6fc6fa scraper settings, ordering and loop. 2020-11-04 15:47:12 +13:00
Jason Schwarzenberger
6ea9844d00 remove useless try blocks. 2020-11-04 15:37:19 +13:00
Jason Schwarzenberger
1318259d3d imply referrer is substack. 2020-11-04 15:21:07 +13:00
Jason Schwarzenberger
98a0c2257c increase declutter timeout. 2020-11-04 15:15:00 +13:00
Jason Schwarzenberger
e6976db25d fix tabs 2020-11-04 15:04:20 +13:00
Jason Schwarzenberger
9edc8b7cca move scraping for article content to files. 2020-11-04 15:00:58 +13:00
Jason Schwarzenberger
33e21e7f30 fix mistake. 2020-11-04 12:45:01 +13:00
Jason Schwarzenberger
892a99eca6 add + expander in place of collapser. 2020-11-04 12:43:15 +13:00
Jason Schwarzenberger
d718d05a04 fix dates for newsroom. 2020-11-04 11:53:16 +13:00
Jason Schwarzenberger
d1795eb1b8 add radionz and newsroom logos. 2020-11-04 11:30:56 +13:00
Jason Schwarzenberger
9f4ff4acf0 remove unnecessary sitemap.xml request. 2020-11-04 11:22:15 +13:00
Jason Schwarzenberger
db6aad84ec fix mistake. 2020-11-04 11:12:01 +13:00
Jason Schwarzenberger
29f8a8b8cc add news site categories feed. 2020-11-04 11:08:50 +13:00
Jason
abf8589e02 fix sitemap 2020-11-03 10:53:40 +00:00
Jason
b759f46582 use extruct for opengraph/json-ld/microdata of articles 2020-11-03 10:31:36 +00:00
Jason Schwarzenberger
736cdc8576 fix mistake. 2020-11-03 17:04:46 +13:00
Jason Schwarzenberger
244d416f6e settings config of sitemap/substack publications. 2020-11-03 17:01:29 +13:00
Jason Schwarzenberger
5f98a2e76a Merge remote-tracking branch 'tanner/master' into master
And adding relevant setings.py.example/etc.
2020-11-03 16:44:02 +13:00
Jason Schwarzenberger
0567cdfd9b move sort to render. 2020-11-03 16:30:22 +13:00
Jason Schwarzenberger
4f90671cec order feed by reverse chronological 2020-11-03 16:21:23 +13:00
Jason Schwarzenberger
e63a1456a5 add logos. 2020-11-03 16:07:07 +13:00
Jason Schwarzenberger
76f1d57702 sitemap based feed. 2020-11-03 16:00:03 +13:00
Jason Schwarzenberger
de80389ed0 add logos. 2020-11-03 12:48:19 +13:00
Jason Schwarzenberger
4e64cf682a add the bulletin. 2020-11-03 12:41:16 +13:00
Jason Schwarzenberger
c5fe5d25a0 add substack.py top sites, replacing webworm.py 2020-11-03 12:28:39 +13:00
Jason
283a2b1545 fix webworm comments 2020-11-02 22:06:43 +00:00
Jason Schwarzenberger
0d6a86ace2 fix webworm dates. 2020-11-03 10:31:14 +13:00
Jason Schwarzenberger
f23bf628e0 add webworm/substack as a feed. 2020-11-02 17:09:59 +13:00
40 changed files with 4303 additions and 3885 deletions

3
.gitmodules vendored Normal file

@@ -0,0 +1,3 @@
[submodule "readerserver"]
path = readerserver
url = https://github.com/master5o1/declutter.git

README.md

@@ -20,7 +20,7 @@ $ sudo apt install yarn
 Clone this repo:
 ```text
-$ git clone https://gogs.tannercollin.com/tanner/qotnews.git
+$ git clone --recurse-submodules https://git.1j.nz/jason/qotnews.git
 $ cd qotnews
 ```
@@ -37,14 +37,14 @@ $ source env/bin/activate
 Configure Praw for your Reddit account (optional):
-* Go to https://www.reddit.com/prefs/apps
-* Click "Create app"
-* Name: whatever
-* App type: script
-* Description: blank
-* About URL: blank
-* Redirect URL: your GitHub profile
-* Submit, copy the client ID and client secret into `settings.py` below
+- Go to https://www.reddit.com/prefs/apps
+- Click "Create app"
+- Name: whatever
+- App type: script
+- Description: blank
+- About URL: blank
+- Redirect URL: your GitHub profile
+- Submit, copy the client ID and client secret into `settings.py` below
 ```text
 (env) $ vim settings.py.example
@@ -109,7 +109,7 @@ stdout_logfile_maxbytes=1MB
 [program:qotnewsreader]
 user=qotnews
 directory=/home/qotnews/qotnews/readerserver
-command=node main.js
+command=node index.js
 autostart=true
 autorestart=true
 stderr_logfile=/var/log/qotnewsreader.log

apiserver/database.py

@@ -1,9 +1,9 @@
-import json
+from datetime import datetime, timedelta
 from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError
+from sqlalchemy.types import JSON

 engine = create_engine('sqlite:///data/qotnews.sqlite')
 Session = sessionmaker(bind=engine)
@@ -15,8 +15,8 @@ class Story(Base):
     sid = Column(String(16), primary_key=True)
     ref = Column(String(16), unique=True)
-    meta_json = Column(String)
-    full_json = Column(String)
+    meta = Column(JSON)
+    data = Column(JSON)
     title = Column(String)

 class Reflist(Base):
@@ -24,6 +24,7 @@ class Reflist(Base):
     rid = Column(Integer, primary_key=True)
     ref = Column(String(16), unique=True)
+    urlref = Column(String)
     sid = Column(String, ForeignKey('stories.sid'), unique=True)
     source = Column(String(16))
@@ -36,19 +37,21 @@ def get_story(sid):
 def put_story(story):
     story = story.copy()
-    full_json = json.dumps(story)
-    story.pop('text', None)
-    story.pop('comments', None)
-    meta_json = json.dumps(story)
+    data = {}
+    data.update(story)
+    meta = {}
+    meta.update(story)
+    meta.pop('text', None)
+    meta.pop('comments', None)

     try:
         session = Session()
         s = Story(
             sid=story['id'],
             ref=story['ref'],
-            full_json=full_json,
-            meta_json=meta_json,
+            data=data,
+            meta=meta,
             title=story.get('title', None),
         )
         session.merge(s)
@@ -63,24 +66,39 @@ def get_story_by_ref(ref):
     session = Session()
     return session.query(Story).filter(Story.ref==ref).first()

-def get_reflist(amount):
+def get_stories_by_url(url):
     session = Session()
-    q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount)
-    return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
+    return session.query(Story).\
+        filter(Story.title != None).\
+        filter(Story.meta['url'].as_string() == url).\
+        order_by(Story.meta['date'].desc())

-def get_stories(amount):
+def get_ref_by_sid(sid):
     session = Session()
-    q = session.query(Reflist, Story.meta_json).\
-        order_by(Reflist.rid.desc()).\
+    x = session.query(Reflist).\
+        filter(Reflist.sid == sid).\
+        first()
+    return dict(ref=x.ref, sid=x.sid, source=x.source, urlref=x.urlref)
+
+def get_reflist():
+    session = Session()
+    q = session.query(Reflist).order_by(Reflist.rid.desc())
+    return [dict(ref=x.ref, sid=x.sid, source=x.source, urlref=x.urlref) for x in q.all()]
+
+def get_stories(maxage=60*60*24*2):
+    time = datetime.now().timestamp() - maxage
+    session = Session()
+    q = session.query(Reflist, Story.meta).\
         join(Story).\
         filter(Story.title != None).\
-        limit(amount)
+        filter(Story.meta['date'].as_integer() > time).\
+        order_by(Story.meta['date'].desc())
     return [x[1] for x in q]

-def put_ref(ref, sid, source):
+def put_ref(ref, sid, source, urlref):
     try:
         session = Session()
-        r = Reflist(ref=ref, sid=sid, source=source)
+        r = Reflist(ref=ref, sid=sid, source=source, urlref=urlref)
         session.add(r)
         session.commit()
     except:

apiserver/feed.py

@@ -6,61 +6,84 @@ logging.basicConfig(
 import requests
 import time
 from bs4 import BeautifulSoup
+import itertools

 import settings
-from feeds import hackernews, reddit, tildes, manual
+from feeds import hackernews, reddit, tildes, substack, manual
+from feeds.sitemap import Sitemap
+from feeds.category import Category
+from scrapers import outline, declutter, headless, simple

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-READ_API = 'http://127.0.0.1:33843'
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
-TWO_DAYS = 60*60*24*2

-def list():
-    feed = []
+substacks = {}
+for key, value in settings.SUBSTACK.items():
+    substacks[key] = substack.Publication(value['url'])
+categories = {}
+for key, value in settings.CATEGORY.items():
+    categories[key] = Category(value)
+sitemaps = {}
+for key, value in settings.SITEMAP.items():
+    sitemaps[key] = Sitemap(value)
+
+def get_list():
+    feeds = {}
     if settings.NUM_HACKERNEWS:
-        feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
+        feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
     if settings.NUM_REDDIT:
-        feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
+        feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]
     if settings.NUM_TILDES:
-        feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
+        feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]
+    if settings.NUM_SUBSTACK:
+        feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
+    for key, publication in substacks.items():
+        count = settings.SUBSTACK[key]['count']
+        feeds[key] = [(x, key, x) for x in publication.feed()[:count]]
+    for key, sites in categories.items():
+        count = settings.CATEGORY[key].get('count') or 0
+        excludes = settings.CATEGORY[key].get('excludes')
+        tz = settings.CATEGORY[key].get('tz')
+        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
+    for key, sites in sitemaps.items():
+        count = settings.SITEMAP[key].get('count') or 0
+        excludes = settings.SITEMAP[key].get('excludes')
+        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
+    values = feeds.values()
+    feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
+    feed = list(filter(None, feed))
     return feed

 def get_article(url):
-    try:
-        params = {'source_url': url}
-        headers = {'Referer': 'https://outline.com/'}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return ''
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        html = r.json()['data']['html']
-        if 'URL is not supported by Outline' in html:
-            raise Exception('URL not supported by Outline')
-        return html
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
-
-    logging.info('Trying our server instead...')
-
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=20)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-
+    scrapers = {
+        'headless': headless,
+        'simple': simple,
+        'outline': outline,
+        'declutter': declutter,
+    }
+    available = settings.SCRAPERS or ['headless', 'simple']
+    if 'simple' not in available:
+        available += ['simple']
+
+    for scraper in available:
+        if scraper not in scrapers.keys():
+            continue
+        try:
+            html = scrapers[scraper].get_html(url)
+            if html:
+                return html
+        except KeyboardInterrupt:
+            raise
+        except:
+            pass
     return ''

 def get_content_type(url):
     try:
@@ -78,7 +101,7 @@ def get_content_type(url):
     except:
         return ''

-def update_story(story, is_manual=False):
+def update_story(story, is_manual=False, urlref=None):
     res = {}

     if story['source'] == 'hackernews':
@@ -87,6 +110,14 @@ def update_story(story, is_manual=False):
         res = reddit.story(story['ref'])
     elif story['source'] == 'tildes':
         res = tildes.story(story['ref'])
+    elif story['source'] == 'substack':
+        res = substack.top.story(story['ref'])
+    elif story['source'] in categories.keys():
+        res = categories[story['source']].story(story['ref'], urlref)
+    elif story['source'] in sitemaps.keys():
+        res = sitemaps[story['source']].story(story['ref'], urlref)
+    elif story['source'] in substacks.keys():
+        res = substacks[story['source']].story(story['ref'])
     elif story['source'] == 'manual':
         res = manual.story(story['ref'])

@@ -96,7 +127,7 @@ def update_story(story, is_manual=False):
         logging.info('Story not ready yet')
         return False

-    if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
+    if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
         logging.info('Story too old, removing')
         return False

apiserver/feeds/category.py Normal file

@@ -0,0 +1,72 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

from bs4 import BeautifulSoup

import settings
from utils import clean
from misc.api import xml
from misc.news import Base

def _filter_links(links, category_url, excludes=None):
    links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
    links = list(filter(None, [link if link != category_url else None for link in links]))
    links = list(set(links))
    if excludes:
        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
    return links

def _get_category(category_url, excludes=None):
    base_url = '/'.join(category_url.split('/')[:3])
    markup = xml(lambda x: category_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='html.parser')
    links = soup.find_all('a', href=True)
    links = [link.get('href') for link in links]
    links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
    links = _filter_links(links, category_url, excludes)
    return links

class Category(Base):
    def __init__(self, config):
        self.config = config
        self.category_url = config.get('url')
        self.tz = config.get('tz')

    def feed(self, excludes=None):
        links = []
        if isinstance(self.category_url, str):
            links += _get_category(self.category_url, excludes)
        elif isinstance(self.category_url, list):
            for url in self.category_url:
                links += _get_category(url, excludes)
        links = list(set(links))
        return [(self.get_id(link), link) for link in links]

# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Category: RadioNZ")
    site = Category({ 'url': "https://www.rnz.co.nz/news/" })
    excludes = [
        'rnz.co.nz/news/sport',
        'rnz.co.nz/weather',
        'rnz.co.nz/news/weather',
    ]
    posts = site.feed(excludes)
    print(posts[:5])
    print(site.story(posts[0][0], posts[0][1]))

    print("Category: Newsroom")
    site = Category({ 'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
    posts = site.feed()
    print(posts[:5])
    print(site.story(posts[0][0], posts[0][1]))

apiserver/feeds/hackernews.py

@@ -40,7 +40,7 @@ def api(route, ref=None):
         return False

 def feed():
-    return [str(x) for x in api(API_TOPSTORIES) or []]
+    return ['hn:'+str(x) for x in api(API_TOPSTORIES) or []]

 def comment(i):
     if 'author' not in i:
@@ -60,6 +60,7 @@ def comment_count(i):
     return sum([comment_count(c) for c in i['comments']]) + alive

 def story(ref):
+    ref = ref.replace('hn:', '')
     r = api(API_ITEM, ref)
     if not r: return False

apiserver/feeds/manual.py

@@ -7,6 +7,8 @@ import requests
 import time
 from bs4 import BeautifulSoup

+import settings
+
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'

 def api(route):
@@ -33,7 +35,7 @@ def story(ref):
     s = {}
     s['author'] = 'manual submission'
-    s['author_link'] = 'https://news.t0.vc'
+    s['author_link'] = 'https://{}'.format(settings.HOSTNAME)
     s['score'] = 0
     s['date'] = int(time.time())
     s['title'] = str(soup.title.string) if soup.title else ref

apiserver/feeds/reddit.py

@@ -73,7 +73,7 @@ def story(ref):
     s['comments'] = list(filter(bool, s['comments']))
     s['num_comments'] = r.num_comments

-    if s['score'] < 25 and s['num_comments'] < 10:
+    if s['score'] < settings.REDDIT_SCORE_THRESHOLD and s['num_comments'] < settings.REDDIT_COMMENT_THRESHOLD:
         return False

     if r.selftext:

101
apiserver/feeds/sitemap.py Normal file
@@ -0,0 +1,101 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

from datetime import datetime
from bs4 import BeautifulSoup

import settings
from utils import clean
from misc.time import unix
from misc.api import xml
from misc.news import Base

def _get_sitemap_date(a):
    if a.find('lastmod'):
        return a.find('lastmod').text
    if a.find('news:publication_date'):
        return a.find('news:publication_date').text
    if a.find('ns2:publication_date'):
        return a.find('ns2:publication_date').text
    return ''

def _filter_links(links, excludes=None):
    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
    links = list(filter(None, [a if _get_sitemap_date(a) else None for a in links]))
    links = list(filter(None, [a if unix(_get_sitemap_date(a)) > too_old else None for a in links]))
    links.sort(key=lambda a: unix(_get_sitemap_date(a)), reverse=True)

    links = [x.find('loc').text for x in links] or []
    links = list(set(links))
    if excludes:
        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
    return links

def _get_sitemap(feed_url, excludes=None):
    markup = xml(lambda x: feed_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='lxml')
    links = []
    feed_urls = []
    if soup.find('sitemapindex'):
        sitemap = soup.find('sitemapindex').findAll('sitemap')
        feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
    if soup.find('urlset'):
        sitemap = soup.find('urlset').findAll('url')
        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))

    feed_urls = _filter_links(feed_urls, excludes)
    links = _filter_links(links, excludes)

    for url in feed_urls:
        links += _get_sitemap(url, excludes)
    return list(set(links))

class Sitemap(Base):
    def __init__(self, config):
        self.config = config
        self.sitemap_url = config.get('url')
        self.tz = config.get('tz')

    def feed(self, excludes=None):
        links = []
        if isinstance(self.sitemap_url, str):
            links += _get_sitemap(self.sitemap_url, excludes)
        elif isinstance(self.sitemap_url, list):
            for url in self.sitemap_url:
                links += _get_sitemap(url, excludes)
        links = list(set(links))
        return [(self.get_id(link), link) for link in links]

# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Sitemap: The Spinoff")
    site = Sitemap({ 'url': "https://thespinoff.co.nz/sitemap.xml" })
    excludes = [
        'thespinoff.co.nz/sitemap-misc.xml',
        'thespinoff.co.nz/sitemap-authors.xml',
        'thespinoff.co.nz/sitemap-tax-category.xml',
    ]
    posts = site.feed(excludes)
    print(posts[:5])
    print(site.story(posts[0][0], posts[0][1]))

    print("Sitemap: Newshub")
    site = Sitemap({
        'url': [
            'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
            'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
            'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
            'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
        ],
    })
    posts = site.feed()
    print(posts[:5])
    print(site.story(posts[0][0], posts[0][1]))

174
apiserver/feeds/substack.py Normal file
@@ -0,0 +1,174 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

import requests
from datetime import datetime

import settings
from misc.time import unix
from misc.metadata import get_icons
from misc.api import xml, json
from utils import clean

SUBSTACK_REFERER = 'https://substack.com'
SUBSTACK_API_TOP_POSTS = lambda x: "https://substack.com/api/v1/reader/top-posts"

def author_link(author_id, base_url):
    return f"{base_url}/people/{author_id}"
def api_comments(post_id, base_url):
    return f"{base_url}/api/v1/post/{post_id}/comments?all_comments=true&sort=best_first"
def api_stories(x, base_url):
    return f"{base_url}/api/v1/archive?sort=new&search=&offset=0&limit=100"

def comment(i):
    if 'body' not in i:
        return False

    c = {}
    c['date'] = unix(i.get('date'))
    c['author'] = i.get('name', '')
    c['score'] = i.get('reactions').get('')
    c['text'] = clean(i.get('body', '') or '')
    c['comments'] = [comment(j) for j in i['children']]
    c['comments'] = list(filter(bool, c['comments']))
    return c

class Publication:
    def __init__(self, domain):
        self.BASE_DOMAIN = domain

    def ref_prefix(self, ref):
        return f"{self.BASE_DOMAIN}/#id:{ref}"

    def strip_ref_prefix(self, ref):
        return ref.replace(f"{self.BASE_DOMAIN}/#id:", '')

    def feed(self):
        too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
        stories = json(lambda x: api_stories(x, self.BASE_DOMAIN), headers={'Referer': self.BASE_DOMAIN})
        if not stories: return []
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = list(filter(None, [i if unix(i.get('post_date')) > too_old else None for i in stories]))
        stories.sort(key=lambda a: unix(a.get('post_date')), reverse=True)
        return [self.ref_prefix(str(i.get("id"))) for i in stories or []]

    def story(self, ref):
        ref = self.strip_ref_prefix(ref)
        stories = json(lambda x: api_stories(x, self.BASE_DOMAIN), headers={'Referer': self.BASE_DOMAIN})
        if not stories: return False
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))

        if len(stories) == 0:
            return False

        r = stories[0]
        if not r:
            return False

        s = {}
        s['author'] = ''
        s['author_link'] = ''
        s['date'] = unix(r.get('post_date'))
        s['score'] = r.get('reactions').get('')
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
        comments = json(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), headers={'Referer': self.BASE_DOMAIN})
        s['comments'] = [comment(i) for i in comments.get('comments')]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)

        authors = list(filter(None, [self._bylines(byline) for byline in r.get('publishedBylines')]))
        if len(authors):
            s['author'] = authors[0].get('name')
            s['author_link'] = authors[0].get('link')

        markup = xml(lambda x: s['link'])
        if markup:
            icons = get_icons(markup, url=s['link'])
            if icons:
                s['icon'] = icons[0]

        return s

    def _bylines(self, b):
        if 'id' not in b:
            return None
        a = {}
        a['name'] = b.get('name')
        a['link'] = author_link(b.get('id'), self.BASE_DOMAIN)
        return a

class Top:
    def ref_prefix(self, base_url, ref):
        return f"{base_url}/#id:{ref}"

    def strip_ref_prefix(self, ref):
        if '/#id:' in ref:
            base_url, item = ref.split(f"/#id:")
            return item
        return ref

    def feed(self):
        too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
        stories = json(SUBSTACK_API_TOP_POSTS, headers={'Referer': SUBSTACK_REFERER})
        if not stories: return []
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = list(filter(None, [i if unix(i.get('post_date')) > too_old else None for i in stories]))
        stories.sort(key=lambda a: unix(a.get('post_date')), reverse=True)
        stories = [self.ref_prefix(str(i.get("pub").get("base_url")), str(i.get("id"))) for i in stories]
        return stories

    def story(self, ref):
        ref = self.strip_ref_prefix(ref)
        stories = json(SUBSTACK_API_TOP_POSTS, headers={'Referer': SUBSTACK_REFERER})
        if not stories: return False
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))

        if len(stories) == 0:
            return False

        r = stories[0]
        if not r:
            return False

        s = {}
        pub = r.get('pub')
        base_url = pub.get('base_url')
        s['author'] = pub.get('author_name')
        s['author_link'] = author_link(pub.get('author_id'), base_url)
        s['date'] = unix(r.get('post_date'))
        s['score'] = r.get('score')
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
        comments = json(lambda x: api_comments(x, base_url), r.get('id'), headers={'Referer': SUBSTACK_REFERER})
        s['comments'] = [comment(i) for i in comments.get('comments')]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)

        return s

top = Top()

# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    top_posts = top.feed()
    print(top.story(top_posts[0]))

    webworm = Publication("https://www.webworm.co/")
    posts = webworm.feed()
    print(webworm.story(posts[0]))

40
apiserver/misc/api.py Normal file
@@ -0,0 +1,40 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

import requests

GOOGLEBOT_USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
GOOGLEBOT_IP = '66.249.66.1'
TIMEOUT = 30

def xml(route, ref=None, headers=dict(), use_googlebot=True):
    try:
        if use_googlebot:
            headers['User-Agent'] = GOOGLEBOT_USER_AGENT
            headers['X-Forwarded-For'] = GOOGLEBOT_IP
        r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False

def json(route, ref=None, headers=dict(), use_googlebot=True):
    try:
        if use_googlebot:
            headers['User-Agent'] = GOOGLEBOT_USER_AGENT
            headers['X-Forwarded-For'] = GOOGLEBOT_IP
        r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False

14
apiserver/misc/icons.py Normal file
@@ -0,0 +1,14 @@
from bs4 import BeautifulSoup

# note: takes the page url so relative icon hrefs can be resolved
def get_icons(markup, url):
    soup = BeautifulSoup(markup, features='html.parser')
    icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
    icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
    favicon = soup.find_all('link', rel="shortcut icon", href=True)
    others = soup.find_all('link', rel="icon", href=True)
    icons = icon32 + icon16 + favicon + others
    base_url = '/'.join(url.split('/')[:3])
    icons = list(set([i.get('href') for i in icons]))
    icons = [i if i.startswith('http') else base_url + i for i in icons]
    return icons

apiserver/misc/metadata.py Normal file

@@ -0,0 +1,84 @@
from bs4 import BeautifulSoup

def get_icons(markup, url):
    soup = BeautifulSoup(markup, features='html.parser')
    icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
    icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
    favicon = soup.find_all('link', rel="shortcut icon", href=True)
    others = soup.find_all('link', rel="icon", href=True)
    icons = icon32 + icon16 + favicon + others
    base_url = '/'.join(url.split('/')[:3])
    icons = list(set([i.get('href') for i in icons]))
    icons = [i if i.startswith('http') else base_url + i for i in icons]
    return icons

def parse_extruct(s, data):
    rdfa_keys = {
        'title': [
            'http://ogp.me/ns#title',
            'https://ogp.me/ns#title',
        ],
        'date': [
            'http://ogp.me/ns/article#modified_time',
            'https://ogp.me/ns/article#modified_time',
            'http://ogp.me/ns/article#published_time',
            'https://ogp.me/ns/article#published_time',
        ]
    }
    for rdfa in data['rdfa']:
        for key, props in rdfa.items():
            for attribute, properties in rdfa_keys.items():
                for prop in properties:
                    if prop in props:
                        for values in props[prop]:
                            s[attribute] = values['@value']

    for og in data['opengraph']:
        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
        if len(modified):
            s['date'] = modified[0]
        if len(published):
            s['date'] = published[0]
        if len(titles):
            s['title'] = titles[0]

    for md in data['microdata']:
        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
            props = md['properties']
            s['title'] = props['headline']
            if props['dateModified']:
                s['date'] = props['dateModified']
            if props['datePublished']:
                s['date'] = props['datePublished']
            if 'author' in props and props['author']:
                if 'properties' in props['author']:
                    s['author'] = props['author']['properties']['name']
                elif isinstance(props['author'], list):
                    s['author'] = props['author'][0]['properties']['name']

    for ld in data['json-ld']:
        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
            s['title'] = ld['headline']
            if ld['dateModified']:
                s['date'] = ld['dateModified']
            if ld['datePublished']:
                s['date'] = ld['datePublished']
            if 'author' in ld and ld['author']:
                if 'name' in ld['author']:
                    s['author'] = ld['author']['name']
                elif isinstance(ld['author'], list):
                    s['author'] = ld['author'][0]['name']
        if '@graph' in ld:
            for gld in ld['@graph']:
                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
                    s['title'] = gld['headline']
                    if gld['dateModified']:
                        s['date'] = gld['dateModified']
                    if gld['datePublished']:
                        s['date'] = gld['datePublished']

    return s

98
apiserver/misc/news.py Normal file
@@ -0,0 +1,98 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

import re
import requests
from bs4 import BeautifulSoup
from scrapers import declutter
import extruct

import settings
from utils import clean
from misc.metadata import parse_extruct, get_icons
from misc.time import unix
from misc.api import xml
import misc.stuff as stuff

def comment(i):
    if 'author' not in i:
        return False

    c = {}
    c['author'] = i.get('author', '')
    c['score'] = i.get('points', 0)
    c['date'] = unix(i.get('date', 0))
    c['text'] = clean(i.get('text', '') or '')
    c['comments'] = [comment(j) for j in i['children']]
    c['comments'] = list(filter(bool, c['comments']))
    return c

def comment_count(i):
    alive = 1 if i['author'] else 0
    return sum([comment_count(c) for c in i['comments']]) + alive

class Base:
    def __init__(self, config):
        self.config = config
        self.url = config.get('url')
        self.tz = config.get('tz')

    def get_id(self, link):
        patterns = self.config.get('patterns')
        if not patterns:
            return link
        patterns = [re.compile(p) for p in patterns]
        patterns = list(filter(None, [p.match(link) for p in patterns]))
        patterns = list(set([':'.join(p.groups()) for p in patterns]))
        if not patterns:
            return link
        return patterns[0]

    def feed(self, excludes=None):
        return []

    def story(self, ref, urlref):
        if urlref is None:
            return False
        markup = xml(lambda x: urlref)
        if not markup:
            return False

        s = {}
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = urlref
        s['url'] = urlref
        s['date'] = 0

        icons = get_icons(markup, url=urlref)
        if icons:
            s['icon'] = icons[0]

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        if 'disqus' in markup:
            try:
                s['comments'] = declutter.get_comments(urlref)
                s['comments'] = list(filter(bool, s['comments']))
                s['num_comments'] = comment_count(s['comments'])
            except KeyboardInterrupt:
                raise
            except:
                pass

        if urlref.startswith('https://www.stuff.co.nz'):
            s['comments'] = stuff.get_comments(urlref)
            s['comments'] = list(filter(bool, s['comments']))
            s['num_comments'] = len(s['comments'])

        if not s['date']:
            return False
        return s

64
apiserver/misc/stuff.py Normal file
@@ -0,0 +1,64 @@
import re
import bs4
from bs4 import BeautifulSoup

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

from misc.time import unix
from misc.api import xml

def _soup_get_text(soup):
    if not soup: return None
    if soup.text: return soup.text

    s = soup.find(text=lambda tag: isinstance(tag, bs4.CData))
    if s and s.string: return s.string.strip()
    return None

def _parse_comment(soup):
    c = {
        'author': '',
        'authorLink': '',
        'score': 0,
        'date': 0,
        'text': '',
        'comments': [],
    }

    if soup.find('link'):
        title = _soup_get_text(soup.find('link'))
        if title and 'By:' in title:
            c['author'] = title.strip('By:').strip()
    if soup.find('dc:creator'):
        c['author'] = _soup_get_text(soup.find('dc:creator'))
    if soup.find('link'):
        c['authorLink'] = _soup_get_text(soup.find('link'))
    if soup.find('description'):
        c['text'] = _soup_get_text(soup.find('description'))
    if soup.find('pubdate'):
        c['date'] = unix(soup.find('pubdate').text)
    elif soup.find('pubDate'):
        c['date'] = unix(soup.find('pubDate').text)

    return c

def get_comments(url):
    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
    p = re.compile(regex).match(url)
    path = p.groups()[0]
    comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}'
    markup = xml(lambda x: comment_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='html.parser')
    comments = soup.find_all('item')
    if not comments: return []
    comments = [_parse_comment(c) for c in comments]
    return comments

# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    comments = get_comments('https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing')
    print(len(comments))
    print(comments[:5])

18
apiserver/misc/time.py Normal file
@@ -0,0 +1,18 @@
import pytz
import dateutil.parser

TZINFOS = {
    'NZDT': pytz.timezone('Pacific/Auckland'),
    'NZST': pytz.timezone('Pacific/Auckland')
}

def unix(date_str, tz=None, tzinfos=TZINFOS):
    try:
        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
        if tz:
            dt = pytz.timezone(tz).localize(dt)
        return int(dt.timestamp())
    except:
        pass
    return 0

apiserver/requirements.txt

@@ -4,6 +4,7 @@ certifi==2020.6.20
 chardet==3.0.4
 click==7.1.2
 commonmark==0.9.1
+extruct==0.10.0
 Flask==1.1.2
 Flask-Cors==3.0.8
 gevent==20.6.2
@@ -11,11 +12,13 @@ greenlet==0.4.16
 idna==2.10
 itsdangerous==1.1.0
 Jinja2==2.11.2
+lxml==4.6.1
 MarkupSafe==1.1.1
 packaging==20.4
 praw==6.4.0
 prawcore==1.4.0
 pyparsing==2.4.7
+pytz==2020.4
 requests==2.24.0
 six==1.15.0
 soupsieve==2.0.1
@@ -27,3 +30,4 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 zope.event==4.4
 zope.interface==5.1.0
+python-dateutil==2.8.1

apiserver/scrapers/declutter.py Normal file

@@ -0,0 +1,41 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)
import requests

DECLUTTER_API = 'https://declutter.1j.nz/headless/details'
DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/headless/comments'
TIMEOUT = 90

def get_html(url):
    logging.info(f"Declutter Scraper: {url}")
    details = get_details(url)
    if not details:
        return ''
    return details['content']

def get_details(url):
    try:
        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem decluttering article: {}'.format(str(e)))
        return None

def get_comments(url):
    try:
        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting comments for article: {}'.format(str(e)))
        return None

apiserver/scrapers/headless.py Normal file

@@ -0,0 +1,41 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)
import requests
from settings import HEADLESS_READER_PORT

READ_API = 'http://127.0.0.1:{}/headless/details'.format(HEADLESS_READER_PORT or 33843)
READ_COMMENT_API = 'http://127.0.0.1:{}/headless/comments'.format(HEADLESS_READER_PORT or 33843)
TIMEOUT = 90

def get_html(url):
    logging.info(f"Headless Scraper: {url}")
    details = get_details(url)
    if not details:
        return ''
    return details['content']

def get_details(url):
    try:
        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem scraping article: {}'.format(str(e)))
        return None

def get_comments(url):
    try:
        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting comments for article: {}'.format(str(e)))
        return None

apiserver/scrapers/outline.py Normal file

@@ -0,0 +1,37 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)
import time
import requests

OUTLINE_REFERER = 'https://outline.com/'
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
TIMEOUT = 20

def get_html(url):
    details = get_details(url)
    if not details:
        return ''
    return details['html']

def get_details(url):
    try:
        logging.info(f"Outline Scraper: {url}")
        params = {'source_url': url}
        headers = {'Referer': OUTLINE_REFERER}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            return None
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return data
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return None

apiserver/scrapers/simple.py Normal file

@@ -0,0 +1,28 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)
import requests
from settings import SIMPLE_READER_PORT

READ_API = 'http://127.0.0.1:{}/simple/details'.format(SIMPLE_READER_PORT or 33843)
TIMEOUT = 20

def get_html(url):
    logging.info(f"Simple Scraper: {url}")
    details = get_details(url)
    if not details:
        return ''
    return details['content']

def get_details(url):
    try:
        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting article: {}'.format(str(e)))
        return None

apiserver/search.py

@@ -35,14 +35,11 @@ def update_rankings():
 def update_attributes():
     try:
-        json = ['title', 'url', 'author', 'link', 'id']
+        json = ['title', 'url', 'author', 'link', 'id', 'source']
         r = requests.post(MEILI_URL + 'indexes/qotnews/settings/searchable-attributes', json=json, timeout=2)
         if r.status_code != 202:
             raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-        r = requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
-        if r.status_code != 202:
-            raise Exception('Bad response code ' + str(r.status_code))
+        requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
         return r.json()
     except KeyboardInterrupt:
         raise

apiserver/server.py

@@ -15,6 +15,7 @@ import traceback
 import time
 from urllib.parse import urlparse, parse_qs

+import settings
 import database
 import search
 import feed
@@ -27,9 +28,6 @@ from flask_cors import CORS
 database.init()
 search.init()

-FEED_LENGTH = 75
-news_index = 0
-
 def new_id():
     nid = gen_rand_id()
     while database.get_story(nid):
@@ -42,9 +40,8 @@ cors = CORS(flask_app)

 @flask_app.route('/api')
 def api():
-    stories = database.get_stories(FEED_LENGTH)
-    # hacky nested json
-    res = Response('{"stories":[' + ','.join(stories) + ']}')
+    stories = database.get_stories(settings.MAX_STORY_AGE)
+    res = Response(json.dumps({"stories": stories}))
     res.headers['content-type'] = 'application/json'
     return res

@@ -73,7 +70,7 @@ def submit():
     elif 'reddit.com' in parse.hostname and 'comments' in url:
         source = 'reddit'
         ref = parse.path.split('/')[4]
-    elif 'news.t0.vc' in parse.hostname:
+    elif settings.HOSTNAME in parse.hostname:
         raise Exception('Invalid article')
     else:
         source = 'manual'
@@ -102,8 +99,11 @@ def submit():
 def story(sid):
     story = database.get_story(sid)
     if story:
-        # hacky nested json
-        res = Response('{"story":' + story.full_json + '}')
+        related = []
+        if story.meta['url']:
+            related = database.get_stories_by_url(story.meta['url'])
+            related = [r.meta for r in related]
+        res = Response(json.dumps({"story": story.data, "related": related}))
         res.headers['content-type'] = 'application/json'
         return res
     else:
@@ -114,7 +114,7 @@ def story(sid):
 def index():
     return render_template('index.html',
             title='Feed',
-            url='news.t0.vc',
+            url=settings.HOSTNAME,
             description='Reddit, Hacker News, and Tildes combined, then pre-rendered in reader mode')

 @flask_app.route('/<sid>', strict_slashes=False)
@@ -127,7 +127,7 @@ def static_story(sid):
     story = database.get_story(sid)
     if not story: return abort(404)

-    story = json.loads(story.full_json)
+    story = story.data

     score = story['score']
     num_comments = story['num_comments']
@@ -144,54 +144,55 @@ def static_story(sid):
             url=url,
             description=description)

-http_server = WSGIServer(('', 33842), flask_app)
+http_server = WSGIServer(('', settings.API_PORT or 33842), flask_app)

+def _add_new_refs():
+    for ref, source, urlref in feed.get_list():
+        if database.get_story_by_ref(ref):
+            continue
+        try:
+            nid = new_id()
+            database.put_ref(ref, nid, source, urlref)
+            logging.info('Added ref ' + ref)
+        except database.IntegrityError:
+            logging.info('Unable to add ref ' + ref)
+            continue
+
+def _update_current_story(item):
+    try:
+        story = database.get_story(item['sid']).data
+    except AttributeError:
+        story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
+
+    logging.info('Updating story: {}'.format(str(story['ref'])))
+
+    valid = feed.update_story(story, urlref=item['urlref'])
+    if valid:
+        try:
+            database.put_story(story)
+            search.put_story(story)
+        except database.IntegrityError:
+            logging.info('Unable to add story with ref ' + ref)
+    else:
+        database.del_ref(item['ref'])
+        logging.info('Removed ref {}'.format(item['ref']))
+
 def feed_thread():
-    global news_index
+    ref_list = []
     try:
         while True:
             # onboard new stories
-            if news_index == 0:
-                for ref, source in feed.list():
-                    if database.get_story_by_ref(ref):
-                        continue
-                    try:
-                        nid = new_id()
-                        database.put_ref(ref, nid, source)
-                        logging.info('Added ref ' + ref)
-                    except database.IntegrityError:
-                        continue
-
-            ref_list = database.get_reflist(FEED_LENGTH)
+            if not len(ref_list):
+                _add_new_refs()
+                ref_list = database.get_reflist()

             # update current stories
-            if news_index < len(ref_list):
-                item = ref_list[news_index]
-
-                try:
-                    story_json = database.get_story(item['sid']).full_json
-                    story = json.loads(story_json)
-                except AttributeError:
-                    story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
-
-                logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
-
-                valid = feed.update_story(story)
-                if valid:
-                    database.put_story(story)
-                    search.put_story(story)
-                else:
-                    database.del_ref(item['ref'])
-                    logging.info('Removed ref {}'.format(item['ref']))
-            else:
-                logging.info('Skipping index: ' + str(news_index))
+            if len(ref_list):
+                item = ref_list.pop(0)
+                _update_current_story(item)

             gevent.sleep(6)

-            news_index += 1
-            if news_index == FEED_LENGTH: news_index = 0
-
     except KeyboardInterrupt:
         logging.info('Ending feed thread...')
     except ValueError as e:

apiserver/settings.py.example

@@ -1,12 +1,60 @@
 # QotNews settings
 # edit this file and save it as settings.py

+HOSTNAME = 'news.t0.vc'
+MAX_STORY_AGE = 3*24*60*60
+
+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
+
+API_PORT = 33842
+SIMPLE_READER_PORT = 33843
+HEADLESS_READER_PORT = 33843
+
 # Feed Lengths
 # Number of top items from each site to pull
 # set to 0 to disable that site
 NUM_HACKERNEWS = 15
 NUM_REDDIT = 10
 NUM_TILDES = 5
+NUM_SUBSTACK = 10
+
+SITEMAP = {}
+# SITEMAP['nzherald'] = {
+#     'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
+#     'count': 20,
+#     'patterns': [
+#         r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$',
+#     ],
+#     'excludes': [
+#         'driven.co.nz',
+#         'oneroof.co.nz',
+#         'nzherald.co.nz/sponsored-stories',
+#         'nzherald.co.nz/entertainment/',
+#         'nzherald.co.nz/lifestyle/',
+#         'nzherald.co.nz/travel/',
+#         'nzherald.co.nz/sport/',
+#         'nzherald.co.nz/promotions/',
+#         'nzherald.co.nzhttp',
+#         'herald-afternoon-quiz',
+#         'herald-morning-quiz'
+#     ],
+# }
+
+SUBSTACK = {}
+# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
+# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},
+
+CATEGORY = {}
+# CATEGORY['radionz'] = {
+#     'url': "https://www.rnz.co.nz/news/",
+#     'count': 20,
+#     'patterns': [
+#         r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'
+#     ],
+#     'excludes': [
+#         'rnz.co.nz/news/sport',
+#         'rnz.co.nz/weather',
+#     ],
+# }

 # Reddit account info
 # leave blank if not using Reddit
@@ -14,6 +62,10 @@ REDDIT_CLIENT_ID = ''
 REDDIT_CLIENT_SECRET = ''
 REDDIT_USER_AGENT = ''

+# Minimum points or number of comments before including a thread:
+REDDIT_COMMENT_THRESHOLD = 10
+REDDIT_SCORE_THRESHOLD = 25
+
 SUBREDDITS = [
     'Economics',
     'AcademicPhilosophy',

48
apiserver/update-story.py Normal file
@@ -0,0 +1,48 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)

import sys
import json
import requests

import database
import feed
import search

database.init()
search.init()

def _update_current_story(story, item):
    logging.info('Updating story: {}'.format(str(story['ref'])))

    if story.get('url', ''):
        story['text'] = ''

    valid = feed.update_story(story, urlref=item['urlref'])
    if valid:
        database.put_story(story)
        search.put_story(story)
    else:
        database.del_ref(item['ref'])
        logging.info('Removed ref {}'.format(item['ref']))

if __name__ == '__main__':
    if len(sys.argv) == 2:
        sid = sys.argv[1]
    else:
        print('Usage: python update-story.py [story id]')
        exit(1)

    item = database.get_ref_by_sid(sid)

    if item:
        story = database.get_story(item['sid']).data
        if story:
            print('Updating story:')
            _update_current_story(story, item)
        else:
            print('Story not found. Exiting.')
    else:
        print('Story not found. Exiting.')

apiserver/utils.py

@@ -9,7 +9,7 @@ import string
 from bleach.sanitizer import Cleaner

 def gen_rand_id():
-    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
+    return ''.join(random.choice(string.ascii_uppercase) for _ in range(5))

 def render_md(md):
     if md:
1
readerserver Submodule

Submodule readerserver added at d3d5fc74ac

readerserver/.gitignore

@@ -1,92 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Editor
*.swp
*.swo

readerserver/main.js (deleted)

@@ -1,53 +0,0 @@
const express = require('express');
const app = express();
const port = 33843;

const request = require('request');
const JSDOM = require('jsdom').JSDOM;
const { Readability } = require('readability');

app.use(express.urlencoded({ extended: true }));

app.get('/', (req, res) => {
  res.send('<form method="POST" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>');
});

const requestCallback = (url, res) => (error, response, body) => {
  if (!error && response.statusCode == 200) {
    console.log('Response OK.');

    const doc = new JSDOM(body, { url: url });
    const reader = new Readability(doc.window.document);
    const article = reader.parse();

    if (article && article.content) {
      res.send(article.content);
    } else {
      res.sendStatus(404);
    }
  } else {
    console.log('Response error:', error ? error.toString() : response.statusCode);
    res.sendStatus(response ? response.statusCode : 404);
  }
};

app.post('/', (req, res) => {
  const url = req.body.url;
  const requestOptions = {
    url: url,
    //headers: {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'},
    //headers: {'User-Agent': 'Twitterbot/1.0'},
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
      'X-Forwarded-For': '66.249.66.1',
    },
  };

  console.log('Parse request for:', url);
  request(requestOptions, requestCallback(url, res));
});

app.listen(port, () => {
  console.log(`Example app listening on port ${port}!`);
});

readerserver/package.json (deleted)

@@ -1,13 +0,0 @@
{
  "name": "readerserver",
  "version": "1.0.0",
  "main": "main.js",
  "license": "MIT",
  "dependencies": {
    "dompurify": "^1.0.11",
    "express": "^4.17.1",
    "jsdom": "^15.1.1",
    "readability": "https://github.com/mozilla/readability",
    "request": "^2.88.0"
  }
}

File diff suppressed because it is too large.

webclient/src/App.js

@@ -5,13 +5,14 @@ import './Style-light.css';
import './Style-dark.css';
 import './fonts/Fonts.css';
 import { ForwardDot } from './utils.js';
-import Feed from './Feed.js';
-import Article from './Article.js';
-import Comments from './Comments.js';
 import Search from './Search.js';
 import Submit from './Submit.js';
-import Results from './Results.js';
 import ScrollToTop from './ScrollToTop.js';
+import Feed from './pages/Feed.js';
+import Article from './pages/Article.js';
+import Comments from './pages/Comments.js';
+import Results from './pages/Results.js';

 class App extends React.Component {
   constructor(props) {
@@ -70,7 +71,7 @@ class App extends React.Component {
         <Route path='/search' component={Results} />
         <Route path='/:id' exact render={(props) => <Article {...props} cache={this.cache} />} />
       </Switch>
-      <Route path='/:id/c' exact render={(props) => <Comments {...props} cache={this.cache} />} />
+      <Route path='/:id/c' exact render={(props) => <Comments {...props} cache={this.cache} key={props.match.params.id} />} />
       <ForwardDot />
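Note the added key={props.match.params.id} on the Comments route: React treats a changed key as a new component instance, so navigating from one story's comments to another's remounts Comments and refetches, rather than reusing the previous story's state.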

webclient/src/Style-light.css

@@ -1,225 +1,231 @@
body {
   text-rendering: optimizeLegibility;
   font: 1rem/1.3 sans-serif;
   color: #000000;
   margin-bottom: 100vh;
   word-break: break-word;
   font-kerning: normal;
 }

 a {
   color: #000000;
   text-decoration: none;
   outline: none;
 }

 input {
   font-size: 1.05rem;
   background-color: transparent;
   border: 1px solid #828282;
   margin: 0.25rem;
   padding: 6px;
   border-radius: 4px;
 }

 pre {
   overflow: auto;
 }

 .container {
   margin: 1rem auto;
   max-width: 64rem;
 }

 .menu {
   font-size: 1.1rem;
   padding: 0 1rem;
 }

 .slogan {
   color: #828282;
 }

 .theme {
   float: right;
 }

 .item {
   display: table;
   color: #828282;
   margin-bottom: 0.7rem;
 }

 .item .source-logo {
   width: 0.9rem;
   height: 0.9rem;
 }

 .item a {
   color: #828282;
 }

 .item a:hover {
   text-decoration: underline;
 }

 .item a.link {
   font-size: 1.1rem;
   color: #000000;
 }

 .item a.link:visited {
   color: #828282;
 }

 .item a.link:hover {
   text-decoration: none;
 }

 span.source {
   margin-left: 0.4rem;
 }

 .item .info a.hot {
   color: #444444;
 }

 .article {
   padding-bottom: 3rem;
 }

 .article-container {
   margin: 1rem auto;
   max-width: 38rem;
 }

 .article a {
   border-bottom: 1px solid #222222;
 }

 .article h1 {
   font-size: 1.6rem;
 }

 .article h2 {
   font-size: 1.4rem;
 }

-.article h3, .article h4 {
+.article h3,
+.article h4 {
   font-size: 1.3rem;
 }

 .article img {
   max-width: 100%;
   height: auto;
 }

-.article figure, .article video {
+.article figure,
+.article video {
   width: 100%;
   height: auto;
   margin: 0;
 }

 .article table {
   width: 100%;
   table-layout: fixed;
 }

 .article iframe {
   display: none;
 }

 .article u {
   border-bottom: 1px solid #222;
   text-decoration: none;
 }

 .article .info {
   color: #828282;
 }

 .article .info a {
   border-bottom: none;
   color: #828282;
 }

 .article .info a:hover {
   text-decoration: underline;
 }

 .story-text {
-  font: 1.2rem/1.5 'Apparatus SIL', sans-serif;
+  font: 1.2rem/1.5 "Apparatus SIL", sans-serif;
   margin-top: 1em;
 }

 .comments {
   margin-left: -1.25rem;
 }

 .comment {
   padding-left: 1.25rem;
 }

 .comment.lined {
   border-left: 1px solid #cccccc;
 }

 .comment .text {
   margin-top: -0.5rem;
 }

 .comment .text.hidden > p {
   white-space: nowrap;
   overflow: hidden;
   text-overflow: ellipsis;
   display: none;
   color: #828282;
 }

 .comment .text.hidden > p:first-child {
   display: block;
 }

 .comment .collapser {
   padding-left: 0.5rem;
   padding-right: 1.5rem;
 }

 .comment .pointer {
   cursor: pointer;
 }

 .toggleDot {
   position: fixed;
   bottom: 1rem;
   left: 1rem;
   height: 3rem;
   width: 3rem;
   background-color: #828282;
   border-radius: 50%;
 }

 .toggleDot .button {
-  font: 2rem/1 'icomoon';
+  font: 2rem/1 "icomoon";
   position: relative;
   top: 0.5rem;
   left: 0.55rem;
 }

 .forwardDot {
   cursor: pointer;
   position: fixed;
   bottom: 1rem;
   right: 1rem;
   height: 3rem;
   width: 3rem;
   background-color: #828282;
   border-radius: 50%;
 }

 .forwardDot .button {
-  font: 2.5rem/1 'icomoon';
+  font: 2.5rem/1 "icomoon";
   position: relative;
   top: 0.25rem;
   left: 0.3rem;
 }

 .search form {
   display: inline;
 }
+
+.indented {
+  padding: 0 0 0 1rem;
+}

webclient/src/components/StoryItem.js (new file)

@@ -0,0 +1,34 @@
import React from "react";
import { Link } from "react-router-dom";

import { sourceLink, infoLine, getLogoUrl } from "../utils.js";

export class StoryItem extends React.Component {
  constructor(props) {
    super(props);
  }

  render() {
    const story = this.props.story;
    const { id, title } = story;

    return (
      <div className="item" key={id}>
        <div className="title">
          <Link className="link" to={"/" + id}>
            <img
              className="source-logo"
              src={getLogoUrl(story)}
              alt="source logo"
            />{" "}
            {title}
          </Link>
          <span className="source">({sourceLink(story)})</span>
        </div>
        {infoLine(story)}
      </div>
    );
  }
}

webclient/src/pages/Article.js

@@ -1,7 +1,7 @@
import React from 'react';
 import { Helmet } from 'react-helmet';
 import localForage from 'localforage';
-import { sourceLink, infoLine, ToggleDot } from './utils.js';
+import { sourceLink, infoLine, otherDiscussions, ToggleDot } from '../utils.js';

 class Article extends React.Component {
   constructor(props) {
@@ -14,29 +14,25 @@ class Article extends React.Component {
     this.state = {
       story: cache[id] || false,
+      related: [],
       error: false,
       pConv: [],
     };
   }

   componentDidMount() {
     const id = this.props.match ? this.props.match.params.id : 'CLOL';

-    localForage.getItem(id)
-      .then(
-        (value) => {
-          if (value) {
-            this.setState({ story: value });
-          }
-        }
-      );
+    localForage.getItem(id).then((value) => value ? this.setState({ story: value }) : null);
+    localForage.getItem(`related-${id}`).then((value) => value ? this.setState({ related: value }) : null);

     fetch('/api/' + id)
       .then(res => res.json())
       .then(
         (result) => {
-          this.setState({ story: result.story });
+          this.setState({ story: result.story, related: result.related });
           localForage.setItem(id, result.story);
+          localForage.setItem(`related-${id}`, result.related);
         },
         (error) => {
           this.setState({ error: true });
@@ -45,12 +41,13 @@ class Article extends React.Component {
   }

   pConvert = (n) => {
-    this.setState({ pConv: [...this.state.pConv, n]});
+    this.setState({ pConv: [...this.state.pConv, n] });
   }

   render() {
     const id = this.props.match ? this.props.match.params.id : 'CLOL';
     const story = this.state.story;
+    const related = this.state.related.filter(r => r.id != id);
     const error = this.state.error;
     const pConv = this.state.pConv;
     let nodes = null;
@@ -77,6 +74,7 @@ class Article extends React.Component {
           </div>
           {infoLine(story)}
+          {otherDiscussions(related)}

           {nodes ?
             <div className='story-text'>
@@ -85,10 +83,10 @@ class Article extends React.Component {
                 v.innerHTML.split('\n\n').map(x =>
                   <p dangerouslySetInnerHTML={{ __html: x }} />
                 )
                 :
                 (v.nodeName === '#text' ?
                   <p>{v.data}</p>
                   :
                   <>
                     <v.localName dangerouslySetInnerHTML={v.innerHTML ? { __html: v.innerHTML } : null} />
                     {v.localName == 'pre' && <button onClick={() => this.pConvert(k)}>Convert Code to Paragraph</button>}
@@ -96,11 +94,11 @@ class Article extends React.Component {
               )
             )}
           </div>
           :
           <p>Problem getting article :(</p>
         }
       </div>
       :
       <p>loading...</p>
     }
     <ToggleDot id={id} article={false} />

webclient/src/pages/Comments.js

@@ -4,9 +4,9 @@ import { HashLink } from 'react-router-hash-link';
import { Helmet } from 'react-helmet';
 import moment from 'moment';
 import localForage from 'localforage';
-import { infoLine, ToggleDot } from './utils.js';
+import { infoLine, otherDiscussions, ToggleDot } from '../utils.js';

-class Article extends React.Component {
+class Comments extends React.Component {
   constructor(props) {
     super(props);
@@ -17,6 +17,7 @@ class Article extends React.Component {
     this.state = {
       story: cache[id] || false,
+      related: [],
       error: false,
       collapsed: [],
       expanded: [],
@@ -26,24 +27,21 @@
   componentDidMount() {
     const id = this.props.match.params.id;

-    localForage.getItem(id)
-      .then(
-        (value) => {
-          this.setState({ story: value });
-        }
-      );
+    localForage.getItem(id).then((value) => this.setState({ story: value }));
+    localForage.getItem(`related-${id}`).then((value) => value ? this.setState({ related: value }) : null);

     fetch('/api/' + id)
       .then(res => res.json())
       .then(
         (result) => {
-          this.setState({ story: result.story }, () => {
+          this.setState({ story: result.story, related: result.related }, () => {
             const hash = window.location.hash.substring(1);
             if (hash) {
               document.getElementById(hash).scrollIntoView();
             }
           });
           localForage.setItem(id, result.story);
+          localForage.setItem(`related-${id}`, result.related);
         },
         (error) => {
           this.setState({ error: true });
@@ -72,7 +70,7 @@
   }

   displayComment(story, c, level) {
-    const cid = c.author+c.date;
+    const cid = c.author + c.date;
     const collapsed = this.state.collapsed.includes(cid);
     const expanded = this.state.expanded.includes(cid);
@@ -85,19 +83,22 @@
         <div className='info'>
           <p>
             {c.author === story.author ? '[OP]' : ''} {c.author || '[Deleted]'}
-            {' '} | <HashLink to={'#'+cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
-            {hidden || hasChildren &&
-              <span className='collapser pointer' onClick={() => this.collapseComment(cid)}></span>
-            }
+            {' '} | <HashLink to={'#' + cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
+            {hasChildren && (
+              hidden ?
+                <span className='collapser expander pointer' onClick={() => this.expandComment(cid)}>+</span>
+                :
+                <span className='collapser pointer' onClick={() => this.collapseComment(cid)}></span>
+            )}
           </p>
         </div>

         <div className={collapsed ? 'text hidden' : 'text'} dangerouslySetInnerHTML={{ __html: c.text }} />

         {hidden && hasChildren ?
-          <div className='comment lined info pointer' onClick={() => this.expandComment(cid)}>[show {this.countComments(c)-1} more]</div>
+          <div className='comment lined info pointer' onClick={() => this.expandComment(cid)}>[show {this.countComments(c) - 1} more]</div>
           :
           c.comments.map(i => this.displayComment(story, i, level + 1))
         }
       </div>
@@ -107,6 +108,7 @@
   render() {
     const id = this.props.match.params.id;
     const story = this.state.story;
+    const related = this.state.related.filter(r => r.id != id);
     const error = this.state.error;

     return (
@@ -125,12 +127,13 @@
           </div>
           {infoLine(story)}
+          {otherDiscussions(related)}

           <div className='comments'>
             {story.comments.map(c => this.displayComment(story, c, 0))}
           </div>
         </div>
         :
         <p>loading...</p>
       }
       <ToggleDot id={id} article={true} />
@@ -139,4 +142,4 @@
   }
 }

-export default Article;
+export default Comments;

webclient/src/pages/Feed.js

@@ -1,8 +1,7 @@
import React from 'react';
-import { Link } from 'react-router-dom';
 import { Helmet } from 'react-helmet';
 import localForage from 'localforage';
-import { sourceLink, infoLine, logos } from './utils.js';
+import { StoryItem } from '../components/StoryItem.js';

 class Feed extends React.Component {
   constructor(props) {
@@ -22,20 +21,24 @@ class Feed extends React.Component {
           const updated = !this.state.stories || this.state.stories[0].id !== result.stories[0].id;
           console.log('updated:', updated);

-          this.setState({ stories: result.stories });
-          localStorage.setItem('stories', JSON.stringify(result.stories));
+          const { stories } = result;
+          this.setState({ stories });
+          localStorage.setItem('stories', JSON.stringify(stories));

           if (updated) {
             localForage.clear();
-            result.stories.forEach((x, i) => {
+            stories.forEach((x, i) => {
               fetch('/api/' + x.id)
                 .then(res => res.json())
-                .then(result => {
-                  localForage.setItem(x.id, result.story)
-                    .then(console.log('preloaded', x.id, x.title));
-                  this.props.updateCache(x.id, result.story);
-                }, error => {}
-                );
+                .then(({ story, related }) => {
+                  Promise.all([
+                    localForage.setItem(x.id, story),
+                    localForage.setItem(`related-${x.id}`, related)
+                  ]).then(console.log('preloaded', x.id, x.title));
+                  this.props.updateCache(x.id, story);
+                  this.props.updateCache(`related-${x.id}`, related);
+                }, error => { }
+                );
             });
           }
         },
@@ -55,27 +58,7 @@ class Feed extends React.Component {
           <title>Feed - QotNews</title>
         </Helmet>
         {error && <p>Connection error?</p>}
-        {stories ?
-          <div>
-            {stories.map(x =>
-              <div className='item' key={x.id}>
-                <div className='title'>
-                  <Link className='link' to={'/' + x.id}>
-                    <img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
-                  </Link>
-                  <span className='source'>
-                    ({sourceLink(x)})
-                  </span>
-                </div>
-                {infoLine(x)}
-              </div>
-            )}
-          </div>
-          :
-          <p>loading...</p>
-        }
+        {stories ? stories.map(story => <StoryItem story={story}></StoryItem>) : <p>loading...</p>}
       </div>
     );
   }
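With this change the feed warm-up caches each story's related-discussions list alongside the story itself, under a related-${id} key in localForage, so the other-discussions links on the article and comments pages can render straight from cache.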

webclient/src/pages/Results.js

@@ -1,8 +1,7 @@
import React from 'react';
-import { Link } from 'react-router-dom';
 import { Helmet } from 'react-helmet';
-import { sourceLink, infoLine, logos } from './utils.js';
 import AbortController from 'abort-controller';
+import { StoryItem } from '../components/StoryItem.js';

 class Results extends React.Component {
   constructor(props) {
@@ -63,28 +62,10 @@ class Results extends React.Component {
           <>
             <p>Search results:</p>
             <div className='comment lined'>
-              {stories.length ?
-                stories.map(x =>
-                  <div className='item' key={x.id}>
-                    <div className='title'>
-                      <Link className='link' to={'/' + x.id}>
-                        <img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
-                      </Link>
-                      <span className='source'>
-                        ({sourceLink(x)})
-                      </span>
-                    </div>
-                    {infoLine(x)}
-                  </div>
-                )
-                :
-                <p>none</p>
-              }
+              {stories ? stories.map(story => <StoryItem story={story}></StoryItem>) : <p>loading...</p>}
             </div>
           </>
           :
           <p>loading...</p>
         }
       </div>

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large.