Compare commits

...

84 Commits

Author SHA1 Message Date
Jason Schwarzenberger
e0960d59f3 update readme. 2020-11-18 13:26:34 +13:00
Jason Schwarzenberger
f5b38f5c6b remove readerserver, add declutter. 2020-11-18 12:59:35 +13:00
Jason Schwarzenberger
c9da2a078b increase setTimeouts. 2020-11-18 10:06:45 +13:00
Jason Schwarzenberger
78654e0c63 reduce setTimeout. 2020-11-17 16:07:33 +13:00
Jason Schwarzenberger
3b885e4327 renaming things. 2020-11-17 15:54:14 +13:00
Jason Schwarzenberger
55d50a86d8 hmmm 2020-11-17 15:13:38 +13:00
Jason Schwarzenberger
55e7f6bb14 cosmetic filters for newshub. 2020-11-17 15:01:12 +13:00
Jason Schwarzenberger
5668fa5dbc fix mistake. 2020-11-17 12:54:54 +13:00
Jason Schwarzenberger
b771b52501 add regex to get a unique ref from each sitemap/category based article url. 2020-11-17 12:38:28 +13:00
Jason Schwarzenberger
f5c7a658ba cosmetic filters for the spinoff. 2020-11-16 16:49:39 +13:00
Jason Schwarzenberger
f5ccd844da fix import error. 2020-11-16 15:41:09 +13:00
Jason Schwarzenberger
6a91b9402f split categories, sitemap and other crap out of news.py 2020-11-16 15:30:33 +13:00
Jason Schwarzenberger
b80c1a5cb5 extract story list item from Results and Feed. 2020-11-16 13:17:58 +13:00
Jason Schwarzenberger
b23e470317 move reddit thresholds as settings variables. 2020-11-16 10:11:39 +13:00
Jason Schwarzenberger
7420b5ece9 fix microdata multiple authors 2020-11-12 17:33:46 +13:00
Jason Schwarzenberger
64ced635cc fix mistake. 2020-11-12 17:15:29 +13:00
Jason Schwarzenberger
9318627f1b ability to pass in multiple site maps/category urls. 2020-11-12 17:11:51 +13:00
Jason Schwarzenberger
3d0a3f1577 support list based json-ld authors. 2020-11-12 15:08:23 +13:00
Jason Schwarzenberger
587b10c438 recursive sitemaps (sitemap indexes) 2020-11-12 14:56:46 +13:00
Jason
00954c6cac local browser scraper 2020-11-11 09:26:54 +00:00
Jason Schwarzenberger
637bc38476 fix mistake. 2020-11-11 17:21:31 +13:00
Jason Schwarzenberger
164b7e72c4 basically add declutter like capabilities. 2020-11-11 17:16:04 +13:00
Jason Schwarzenberger
3169af3002 hostname from settings. 2020-11-11 09:46:27 +13:00
Jason Schwarzenberger
d588a60930 add source to searchable attributes. 2020-11-11 09:37:54 +13:00
Jason Schwarzenberger
408e2870b2 tzinfo and microdata schema urls. 2020-11-10 16:51:27 +13:00
Jason Schwarzenberger
44b8b36547 add data cast in query. 2020-11-10 15:50:18 +13:00
Jason Schwarzenberger
4f49684194 remove logos from utils.js 2020-11-10 15:38:48 +13:00
Jason Schwarzenberger
1d78b1c592 fix favicon url. 2020-11-10 15:34:21 +13:00
Jason Schwarzenberger
0374794536 Sitemap and Category to get favicon into icon property of story. 2020-11-10 15:22:27 +13:00
Jason Schwarzenberger
943a1cfa4f reader server 2020-11-10 14:56:21 +13:00
Jason Schwarzenberger
9cee370a25 tvnz icon 2020-11-10 14:10:02 +13:00
Jason Schwarzenberger
5efc6ef2d3 add related stories (in api only) 2020-11-10 14:09:56 +13:00
Jason Schwarzenberger
4ec50e20cb feed thread loop. 2020-11-10 10:10:38 +13:00
Jason Schwarzenberger
c1b7877f4b remove limit. 2020-11-09 17:54:50 +13:00
Jason Schwarzenberger
7b8cbfc9b9 try to make feed only determined by the max age. 2020-11-09 17:50:58 +13:00
Jason Schwarzenberger
bfa4108a8e Merge remote-tracking branch 'tanner/master' 2020-11-09 16:08:28 +13:00
Jason Schwarzenberger
0bd0d40a31 use json type in sqlite. 2020-11-09 15:45:10 +13:00
Jason Schwarzenberger
4e04595415 fix search. 2020-11-09 15:44:44 +13:00
Jason
006db2960c change to 3 days 2020-11-09 01:36:51 +00:00
Jason Schwarzenberger
1f063f0dac undo log level change 2020-11-06 11:20:34 +13:00
Jason Schwarzenberger
1658346aa9 fix news.py feed. 2020-11-06 10:37:43 +13:00
Jason Schwarzenberger
2dbc702b40 switch to python-dateutil for parser, reverse sort xml feeds. 2020-11-06 10:02:39 +13:00
Jason Schwarzenberger
1c4764e67d sort sitemap feed by lastmod time. 2020-11-06 09:30:15 +13:00
Jason
ee49d2021e newsroom 2020-11-05 20:28:55 +00:00
Jason
c391c50ab1 use localize 2020-11-05 04:15:31 +00:00
Jason Schwarzenberger
095f0d549a use replace. 2020-11-05 16:57:08 +13:00
Jason Schwarzenberger
c21c71667e fix date issue. 2020-11-05 16:41:15 +13:00
Jason Schwarzenberger
c3a2c91a11 update requirements.txt 2020-11-05 16:33:50 +13:00
Jason Schwarzenberger
0f39446a61 tz aware for use in settings. 2020-11-05 16:30:55 +13:00
Jason Schwarzenberger
351059aab1 fix excludes. 2020-11-05 15:59:13 +13:00
Jason Schwarzenberger
4488e2c292 add an excludes list of substrings for urls in the settings for sitemap/category. 2020-11-05 15:51:59 +13:00
Jason Schwarzenberger
afda5b635c disqus test. 2020-11-05 14:23:51 +13:00
Jason Schwarzenberger
0fc1a44d2b fix issue in substack. 2020-11-04 17:40:29 +13:00
Jason Schwarzenberger
9fff1b9e46 avoid duplicate articles listed on the category page 2020-11-04 17:14:42 +13:00
Jason Schwarzenberger
16b59f6c67 try stop bad pages. 2020-11-04 16:34:31 +13:00
Jason Schwarzenberger
939f4775a7 better settings example. 2020-11-04 15:52:34 +13:00
Jason Schwarzenberger
9bfc6fc6fa scraper settings, ordering and loop. 2020-11-04 15:47:12 +13:00
Jason Schwarzenberger
6ea9844d00 remove useless try blocks. 2020-11-04 15:37:19 +13:00
Jason Schwarzenberger
1318259d3d imply referrer is substack. 2020-11-04 15:21:07 +13:00
Jason Schwarzenberger
98a0c2257c increase declutter timeout. 2020-11-04 15:15:00 +13:00
Jason Schwarzenberger
e6976db25d fix tabs 2020-11-04 15:04:20 +13:00
Jason Schwarzenberger
9edc8b7cca move scraping for article content to files. 2020-11-04 15:00:58 +13:00
Jason Schwarzenberger
33e21e7f30 fix mistake. 2020-11-04 12:45:01 +13:00
Jason Schwarzenberger
892a99eca6 add + expander in place of collapser. 2020-11-04 12:43:15 +13:00
Jason Schwarzenberger
d718d05a04 fix dates for newsroom. 2020-11-04 11:53:16 +13:00
Jason Schwarzenberger
d1795eb1b8 add radionz and newsroom logos. 2020-11-04 11:30:56 +13:00
Jason Schwarzenberger
9f4ff4acf0 remove unnecessary sitemap.xml request. 2020-11-04 11:22:15 +13:00
Jason Schwarzenberger
db6aad84ec fix mistake. 2020-11-04 11:12:01 +13:00
Jason Schwarzenberger
29f8a8b8cc add news site categories feed. 2020-11-04 11:08:50 +13:00
Jason
abf8589e02 fix sitemap 2020-11-03 10:53:40 +00:00
Jason
b759f46582 use extruct for opengraph/json-ld/microdata of articles 2020-11-03 10:31:36 +00:00
Jason Schwarzenberger
736cdc8576 fix mistake. 2020-11-03 17:04:46 +13:00
Jason Schwarzenberger
244d416f6e settings config of sitemap/substack publications. 2020-11-03 17:01:29 +13:00
Jason Schwarzenberger
5f98a2e76a Merge remote-tracking branch 'tanner/master' into master
And adding relevant setings.py.example/etc.
2020-11-03 16:44:02 +13:00
Jason Schwarzenberger
0567cdfd9b move sort to render. 2020-11-03 16:30:22 +13:00
Jason Schwarzenberger
4f90671cec order feed by reverse chronological 2020-11-03 16:21:23 +13:00
Jason Schwarzenberger
e63a1456a5 add logos. 2020-11-03 16:07:07 +13:00
Jason Schwarzenberger
76f1d57702 sitemap based feed. 2020-11-03 16:00:03 +13:00
Jason Schwarzenberger
de80389ed0 add logos. 2020-11-03 12:48:19 +13:00
Jason Schwarzenberger
4e64cf682a add the bulletin. 2020-11-03 12:41:16 +13:00
Jason Schwarzenberger
c5fe5d25a0 add substack.py top sites, replacing webworm.py 2020-11-03 12:28:39 +13:00
Jason
283a2b1545 fix webworm comments 2020-11-02 22:06:43 +00:00
Jason Schwarzenberger
0d6a86ace2 fix webworm dates. 2020-11-03 10:31:14 +13:00
Jason Schwarzenberger
f23bf628e0 add webworm/substack as a feed. 2020-11-02 17:09:59 +13:00
33 changed files with 1040 additions and 1399 deletions

3
.gitmodules vendored Normal file
View File

@@ -0,0 +1,3 @@
[submodule "readerserver"]
path = readerserver
url = https://github.com/master5o1/declutter.git

View File

@@ -20,7 +20,7 @@ $ sudo apt install yarn
Clone this repo:
```text
$ git clone https://gogs.tannercollin.com/tanner/qotnews.git
$ git clone --recurse-submodules https://git.1j.nz/jason/qotnews.git
$ cd qotnews
```
@@ -37,14 +37,14 @@ $ source env/bin/activate
Configure Praw for your Reddit account (optional):
* Go to https://www.reddit.com/prefs/apps
* Click "Create app"
* Name: whatever
* App type: script
* Description: blank
* About URL: blank
* Redirect URL: your GitHub profile
* Submit, copy the client ID and client secret into `settings.py` below
- Go to https://www.reddit.com/prefs/apps
- Click "Create app"
- Name: whatever
- App type: script
- Description: blank
- About URL: blank
- Redirect URL: your GitHub profile
- Submit, copy the client ID and client secret into `settings.py` below
```text
(env) $ vim settings.py.example
@@ -109,7 +109,7 @@ stdout_logfile_maxbytes=1MB
[program:qotnewsreader]
user=qotnews
directory=/home/qotnews/qotnews/readerserver
command=node main.js
command=node index.js
autostart=true
autorestart=true
stderr_logfile=/var/log/qotnewsreader.log

View File

@@ -1,9 +1,9 @@
import json
from datetime import datetime, timedelta
from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError
from sqlalchemy.types import JSON
engine = create_engine('sqlite:///data/qotnews.sqlite')
Session = sessionmaker(bind=engine)
@@ -15,8 +15,8 @@ class Story(Base):
sid = Column(String(16), primary_key=True)
ref = Column(String(16), unique=True)
meta_json = Column(String)
full_json = Column(String)
meta = Column(JSON)
data = Column(JSON)
title = Column(String)
class Reflist(Base):
@@ -24,6 +24,7 @@ class Reflist(Base):
rid = Column(Integer, primary_key=True)
ref = Column(String(16), unique=True)
urlref = Column(String)
sid = Column(String, ForeignKey('stories.sid'), unique=True)
source = Column(String(16))
@@ -36,19 +37,21 @@ def get_story(sid):
def put_story(story):
story = story.copy()
full_json = json.dumps(story)
data = {}
data.update(story)
story.pop('text', None)
story.pop('comments', None)
meta_json = json.dumps(story)
meta = {}
meta.update(story)
meta.pop('text', None)
meta.pop('comments', None)
try:
session = Session()
s = Story(
sid=story['id'],
ref=story['ref'],
full_json=full_json,
meta_json=meta_json,
data=data,
meta=meta,
title=story.get('title', None),
)
session.merge(s)
@@ -63,24 +66,32 @@ def get_story_by_ref(ref):
session = Session()
return session.query(Story).filter(Story.ref==ref).first()
def get_reflist(amount):
def get_stories_by_url(url):
session = Session()
q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount)
return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
return session.query(Story).\
filter(Story.title != None).\
filter(Story.meta['url'].as_string() == url).\
order_by(Story.meta['date'].desc())
def get_stories(amount):
def get_reflist():
session = Session()
q = session.query(Reflist, Story.meta_json).\
order_by(Reflist.rid.desc()).\
q = session.query(Reflist).order_by(Reflist.rid.desc())
return [dict(ref=x.ref, sid=x.sid, source=x.source, urlref=x.urlref) for x in q.all()]
def get_stories(maxage=60*60*24*2):
time = datetime.now().timestamp() - maxage
session = Session()
q = session.query(Reflist, Story.meta).\
join(Story).\
filter(Story.title != None).\
limit(amount)
filter(Story.meta['date'].as_integer() > time).\
order_by(Story.meta['date'].desc())
return [x[1] for x in q]
def put_ref(ref, sid, source):
def put_ref(ref, sid, source, urlref):
try:
session = Session()
r = Reflist(ref=ref, sid=sid, source=source)
r = Reflist(ref=ref, sid=sid, source=source, urlref=urlref)
session.add(r)
session.commit()
except:

View File

@@ -6,61 +6,84 @@ logging.basicConfig(
import requests
import time
from bs4 import BeautifulSoup
import itertools
import settings
from feeds import hackernews, reddit, tildes, manual
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
READ_API = 'http://127.0.0.1:33843'
from feeds import hackernews, reddit, tildes, substack, manual
from feeds.sitemap import Sitemap
from feeds.category import Category
from scrapers import outline, declutter, headless, simple
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
TWO_DAYS = 60*60*24*2
def list():
feed = []
substacks = {}
for key, value in settings.SUBSTACK.items():
substacks[key] = substack.Publication(value['url'])
categories = {}
for key, value in settings.CATEGORY.items():
categories[key] = Category(value)
sitemaps = {}
for key, value in settings.SITEMAP.items():
sitemaps[key] = Sitemap(value)
def get_list():
feeds = {}
if settings.NUM_HACKERNEWS:
feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_REDDIT:
feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]
if settings.NUM_TILDES:
feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK:
feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count']
feeds[key] = [(x, key, x) for x in publication.feed()[:count]]
for key, sites in categories.items():
count = settings.CATEGORY[key].get('count') or 0
excludes = settings.CATEGORY[key].get('excludes')
tz = settings.CATEGORY[key].get('tz')
feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
for key, sites in sitemaps.items():
count = settings.SITEMAP[key].get('count') or 0
excludes = settings.SITEMAP[key].get('excludes')
feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
values = feeds.values()
feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
feed = list(filter(None, feed))
return feed
def get_article(url):
try:
params = {'source_url': url}
headers = {'Referer': 'https://outline.com/'}
r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
if r.status_code == 429:
logging.info('Rate limited by outline, sleeping 30s and skipping...')
time.sleep(30)
return ''
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
html = r.json()['data']['html']
if 'URL is not supported by Outline' in html:
raise Exception('URL not supported by Outline')
return html
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem outlining article: {}'.format(str(e)))
scrapers = {
'headless': headless,
'simple': simple,
'outline': outline,
'declutter': declutter,
}
available = settings.SCRAPERS or ['headless', 'simple']
if 'simple' not in available:
available += ['simple']
logging.info('Trying our server instead...')
try:
r = requests.post(READ_API, data=dict(url=url), timeout=20)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.text
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem getting article: {}'.format(str(e)))
return ''
for scraper in available:
if scraper not in scrapers.keys():
continue
try:
html = scrapers[scraper].get_html(url)
if html:
return html
except KeyboardInterrupt:
raise
except:
pass
return ''
def get_content_type(url):
try:
@@ -78,7 +101,7 @@ def get_content_type(url):
except:
return ''
def update_story(story, is_manual=False):
def update_story(story, is_manual=False, urlref=None):
res = {}
if story['source'] == 'hackernews':
@@ -87,6 +110,14 @@ def update_story(story, is_manual=False):
res = reddit.story(story['ref'])
elif story['source'] == 'tildes':
res = tildes.story(story['ref'])
elif story['source'] == 'substack':
res = substack.top.story(story['ref'])
elif story['source'] in categories.keys():
res = categories[story['source']].story(story['ref'], urlref)
elif story['source'] in sitemaps.keys():
res = sitemaps[story['source']].story(story['ref'], urlref)
elif story['source'] in substacks.keys():
res = substacks[story['source']].story(story['ref'])
elif story['source'] == 'manual':
res = manual.story(story['ref'])
@@ -96,7 +127,7 @@ def update_story(story, is_manual=False):
logging.info('Story not ready yet')
return False
if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
logging.info('Story too old, removing')
return False

View File

@@ -0,0 +1,72 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
from bs4 import BeautifulSoup
import settings
from utils import clean
from misc.api import xml
from misc.news import Base
def _filter_links(links, category_url, excludes=None):
    """Keep the links that point below ``category_url``.

    A link is kept when it starts with ``category_url``, is not the category
    page itself and contains none of the ``excludes`` substrings.  Duplicates
    are removed while keeping first-seen (page) order, so a caller slicing the
    result with ``[:count]`` gets the same links on every run.

    :param links: iterable of absolute URL strings.
    :param category_url: URL prefix every kept link must start with.
    :param excludes: optional iterable of substrings that disqualify a link.
    :returns: list of unique URL strings in first-seen order.
    """
    kept = []
    seen = set()
    for link in links:
        if not link or link in seen:
            continue
        if link == category_url or not link.startswith(category_url):
            continue
        if excludes and any(e in link for e in excludes):
            continue
        seen.add(link)
        kept.append(link)
    return kept
def _get_category(category_url, excludes=None):
    """Download one category page and return the article links found on it.

    Hrefs beginning with ``/`` are prefixed with the scheme and host of
    ``category_url``; all other hrefs are passed through unchanged.  The
    candidates are then narrowed by ``_filter_links``.

    :returns: list of URL strings, or ``[]`` when the page could not be fetched.
    """
    origin = '/'.join(category_url.split('/')[:3])
    page = xml(lambda _ref: category_url)
    if not page:
        return []
    hrefs = []
    for anchor in BeautifulSoup(page, features='html.parser').find_all('a', href=True):
        target = anchor.get('href')
        if target.startswith('/'):
            target = f"{origin}{target}"
        hrefs.append(target)
    return _filter_links(hrefs, category_url, excludes)
class Category(Base):
    """Feed source that lists the article links found on category pages."""

    def __init__(self, config):
        """Store the settings entry; ``url`` may be one URL or a list of URLs."""
        self.config = config
        self.category_url = config.get('url')
        self.tz = config.get('tz')

    def feed(self, excludes=None):
        """Return ``(ref, url)`` pairs for every link on the category page(s).

        Links are de-duplicated across pages in first-seen order (a ``set``
        would reshuffle them between runs, making ``[:count]`` slices in the
        caller arbitrary).  ``ref`` comes from ``self.get_id(url)``.
        """
        urls = self.category_url
        if isinstance(urls, str):
            urls = [urls]
        elif not isinstance(urls, list):
            # Missing or unsupported ``url`` setting: nothing to crawl.
            urls = []
        links = []
        for url in urls:
            links += _get_category(url, excludes)
        links = list(dict.fromkeys(links))
        return [(self.get_id(link), link) for link in links]
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Category: RadioNZ")
    # Category takes a settings-style config dict, not a bare URL.
    site = Category({'url': "https://www.rnz.co.nz/news/"})
    excludes = [
        'rnz.co.nz/news/sport',
        'rnz.co.nz/weather',
        'rnz.co.nz/news/weather',
    ]
    posts = site.feed(excludes)
    print(posts[:5])
    if posts:
        # feed() yields (ref, url) pairs; story() takes them as two arguments.
        print(site.story(*posts[0]))

    print("Category: Newsroom")
    site = Category({'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
    posts = site.feed()
    print(posts[:5])
    if posts:
        print(site.story(*posts[0]))

View File

@@ -7,6 +7,8 @@ import requests
import time
from bs4 import BeautifulSoup
import settings
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
def api(route):
@@ -33,7 +35,7 @@ def story(ref):
s = {}
s['author'] = 'manual submission'
s['author_link'] = 'https://news.t0.vc'
s['author_link'] = 'https://{}'.format(settings.HOSTNAME)
s['score'] = 0
s['date'] = int(time.time())
s['title'] = str(soup.title.string) if soup.title else ref

View File

@@ -73,7 +73,7 @@ def story(ref):
s['comments'] = list(filter(bool, s['comments']))
s['num_comments'] = r.num_comments
if s['score'] < 25 and s['num_comments'] < 10:
if s['score'] < settings.REDDIT_SCORE_THRESHOLD and s['num_comments'] < settings.REDDIT_COMMENT_THRESHOLD:
return False
if r.selftext:

View File

@@ -0,0 +1,99 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
from datetime import datetime
from bs4 import BeautifulSoup
import settings
from utils import clean
from misc.time import unix
from misc.api import xml
from misc.news import Base
def _get_sitemap_date(a):
    """Return the date text of a sitemap entry, or ``''`` when it has none.

    Tries ``lastmod``, then ``news:publication_date``, then
    ``ns2:publication_date``.  A tag that is present but blank is skipped so
    that a later tag can still supply the date; surrounding whitespace is
    stripped from the result.

    :param a: object exposing ``find(name)`` whose results have ``.text``.
    """
    for tag in ('lastmod', 'news:publication_date', 'ns2:publication_date'):
        node = a.find(tag)
        if node:
            text = node.text.strip()
            if text:
                return text
    return ''
def _filter_links(links, excludes=None):
    """Turn sitemap entries into a newest-first list of recent ``<loc>`` URLs.

    Entries without a date, or dated at or before ``settings.MAX_STORY_AGE``
    seconds ago, are dropped.  The remainder is sorted newest first and
    de-duplicated *without* disturbing that order (de-duplicating through a
    ``set`` would throw the sort away).  URLs containing any ``excludes``
    substring are removed last.

    :param links: parsed sitemap entries, each having a ``loc`` child.
    :param excludes: optional iterable of substrings that disqualify a URL.
    :returns: list of URL strings, newest first.
    """
    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
    dated = []
    for entry in links:
        stamp = _get_sitemap_date(entry)
        if not stamp:
            continue
        when = unix(stamp)  # parse once instead of once per filter/sort pass
        if when > too_old:
            dated.append((when, entry.find('loc').text))
    # list.sort is stable, so entries sharing a timestamp keep sitemap order.
    dated.sort(key=lambda pair: pair[0], reverse=True)
    urls = list(dict.fromkeys(url for _, url in dated))
    if excludes:
        urls = [url for url in urls if not any(e in url for e in excludes)]
    return urls
def _get_sitemap(feed_url, excludes=None):
    """Fetch a sitemap (or sitemap index) and return the article URLs in it.

    ``<urlset>`` entries are filtered through ``_filter_links``.  Child
    sitemaps named by a ``<sitemapindex>`` are filtered the same way and then
    fetched recursively.  The combined result is de-duplicated in the order
    the links were collected: this document's links first, then each child
    sitemap's links.

    :returns: list of URL strings, or ``[]`` when the fetch failed.
    """
    markup = xml(lambda x: feed_url)
    if not markup:
        return []
    soup = BeautifulSoup(markup, features='lxml')

    links = []
    feed_urls = []
    index = soup.find('sitemapindex')
    if index:
        feed_urls = [a for a in index.find_all('sitemap') if a.find('loc')]
    urlset = soup.find('urlset')
    if urlset:
        links = [a for a in urlset.find_all('url') if a.find('loc')]

    feed_urls = _filter_links(feed_urls, excludes)
    links = _filter_links(links, excludes)

    for url in feed_urls:
        links += _get_sitemap(url, excludes)
    return list(dict.fromkeys(links))
class Sitemap(Base):
    """Feed source that lists article URLs taken from XML sitemaps."""

    def __init__(self, config):
        """Store the settings entry; ``url`` may be one URL or a list of URLs."""
        self.config = config
        self.sitemap_url = config.get('url')
        self.tz = config.get('tz')

    def feed(self, excludes=None):
        """Return ``(ref, url)`` pairs for every link in the sitemap(s).

        Links are de-duplicated across sitemaps in the order they were
        returned, so ``[:count]`` slices in the caller are repeatable.
        ``ref`` comes from ``self.get_id(url)``.
        """
        urls = self.sitemap_url
        if isinstance(urls, str):
            urls = [urls]
        elif not isinstance(urls, list):
            # Missing or unsupported ``url`` setting: nothing to crawl.
            urls = []
        links = []
        for url in urls:
            links += _get_sitemap(url, excludes)
        links = list(dict.fromkeys(links))
        return [(self.get_id(link), link) for link in links]
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Sitemap: The Spinoff")
    # Sitemap takes a settings-style config dict, not a bare URL.
    site = Sitemap({'url': "https://thespinoff.co.nz/sitemap.xml"})
    excludes = [
        'thespinoff.co.nz/sitemap-misc.xml',
        'thespinoff.co.nz/sitemap-authors.xml',
        'thespinoff.co.nz/sitemap-tax-category.xml',
    ]
    posts = site.feed(excludes)
    print(posts[:5])
    if posts:
        # feed() yields (ref, url) pairs; story() takes them as two arguments.
        print(site.story(*posts[0]))

    print("Sitemap: Newshub")
    site = Sitemap({'url': [
        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
    ]})
    posts = site.feed()
    print(posts[:5])
    if posts:
        print(site.story(*posts[0]))
        # Also exercise the last entry (a slice is not a valid story argument).
        print(site.story(*posts[-1]))

165
apiserver/feeds/substack.py Normal file
View File

@@ -0,0 +1,165 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
import requests
from datetime import datetime
from utils import clean
SUBSTACK_REFERER = 'https://substack.com'
SUBSTACK_API_TOP_POSTS = lambda x: "https://substack.com/api/v1/reader/top-posts"
def author_link(author_id, base_url):
    """Return the profile URL for ``author_id`` under ``base_url``.

    A trailing ``/`` on ``base_url`` is dropped so the path never starts
    with ``//``.
    """
    return f"{str(base_url).rstrip('/')}/people/{author_id}"
def api_comments(post_id, base_url):
    """Return the comments API URL for ``post_id`` under ``base_url``.

    A trailing ``/`` on ``base_url`` is dropped so the path never starts
    with ``//``.
    """
    return f"{str(base_url).rstrip('/')}/api/v1/post/{post_id}/comments?all_comments=true&sort=best_first"
def api_stories(x, base_url):
    """Return the archive API URL (newest 100 posts) under ``base_url``.

    ``x`` is unused; it exists so the function fits the ``route(ref)`` shape
    that ``api`` calls.  A trailing ``/`` on ``base_url`` is dropped so the
    path never starts with ``//``.
    """
    return f"{str(base_url).rstrip('/')}/api/v1/archive?sort=new&search=&offset=0&limit=100"
def unix(date_str):
    """Convert a ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` string to Unix seconds.

    The trailing ``Z`` marks the value as UTC, so the parsed datetime is
    tagged as UTC before conversion.  Calling ``timestamp()`` on the naive
    value would interpret it in the machine's local timezone instead.

    :raises ValueError: if ``date_str`` does not match the format.
    """
    # Local import: the module only imports ``datetime`` from ``datetime``.
    from datetime import timezone
    parsed = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ')
    return int(parsed.replace(tzinfo=timezone.utc).timestamp())
def api(route, ref=None, referer=None):
    """GET ``route(ref)`` and return the decoded JSON body.

    The request is tried twice, first with a 10 s timeout and then with 20 s.
    A non-200 status counts as a failure.  Ordinary errors are logged;
    ``KeyboardInterrupt`` and other non-``Exception`` signals propagate.

    :param route: callable mapping ``ref`` to the URL to fetch.
    :param ref: value handed to ``route``.
    :param referer: optional value for the ``Referer`` request header.
    :returns: parsed JSON, or ``False`` when both attempts failed.
    """
    headers = {'Referer': referer} if referer else None
    timeouts = (10, 20)
    for attempt, timeout in enumerate(timeouts, start=1):
        try:
            r = requests.get(route(ref), headers=headers, timeout=timeout)
            if r.status_code != 200:
                raise Exception('Bad response code ' + str(r.status_code))
            return r.json()
        except Exception as e:
            if attempt < len(timeouts):
                logging.error('Problem hitting Substack API: {}, trying again'.format(str(e)))
            else:
                logging.error('Problem hitting Substack API: {}'.format(str(e)))
    return False
def comment(i):
    """Convert one Substack comment dict (and its replies) to the story format.

    :param i: raw comment dict from the comments API.
    :returns: dict with ``date``, ``author``, ``score``, ``text`` and nested
        ``comments``; ``False`` when the comment has no ``body`` key.
    """
    if 'body' not in i:
        return False

    c = {}
    c['date'] = unix(i.get('date'))
    c['author'] = i.get('name', '')
    # NOTE(review): the reactions key is an empty string in this view of the
    # file; it may be a character lost in extraction - confirm upstream.
    c['score'] = (i.get('reactions') or {}).get('')
    c['text'] = clean(i.get('body', '') or '')
    # A comment without replies may carry no (or a null) ``children`` entry.
    c['comments'] = [comment(j) for j in i.get('children') or []]
    c['comments'] = list(filter(bool, c['comments']))
    return c
class Publication:
    """A single Substack publication, addressed by its base URL."""

    def __init__(self, domain):
        self.BASE_DOMAIN = domain

    def _public_posts(self):
        """Fetch the archive and keep only posts whose audience is ``everyone``."""
        posts = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
        if not posts:
            return []
        return [p for p in posts if p.get("audience") == "everyone"]

    def feed(self):
        """Return the ids (as strings) of the publication's public posts."""
        return [str(p.get("id")) for p in self._public_posts()]

    def story(self, ref):
        """Build the story dict for post id ``ref``.

        :returns: story dict, or ``False`` when the archive could not be
            fetched or holds no public post with that id.
        """
        r = next((p for p in self._public_posts() if str(p.get('id')) == ref), None)
        if not r:
            return False

        s = {}
        s['author'] = ''
        s['author_link'] = ''
        s['date'] = unix(r.get('post_date'))
        # NOTE(review): the reactions key is an empty string in this view of
        # the file; it may be a character lost in extraction - confirm.
        s['score'] = (r.get('reactions') or {}).get('')
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN)
        # api() returns False on failure; keep the story, just without comments.
        thread = comments.get('comments') if comments else None
        s['comments'] = [comment(i) for i in thread or []]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)

        authors = list(filter(None, [self._bylines(b) for b in r.get('publishedBylines') or []]))
        if authors:
            s['author'] = authors[0].get('name')
            s['author_link'] = authors[0].get('link')

        return s

    def _bylines(self, b):
        """Map one byline to ``{'name', 'link'}``; ``None`` when it has no id."""
        if 'id' not in b:
            return None
        a = {}
        a['name'] = b.get('name')
        a['link'] = author_link(b.get('id'), self.BASE_DOMAIN)
        return a
class Top:
    """The cross-publication Substack "top posts" listing."""

    def _public_posts(self):
        """Fetch the top posts and keep only those whose audience is ``everyone``."""
        posts = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
        if not posts:
            return []
        return [p for p in posts if p.get("audience") == "everyone"]

    def feed(self):
        """Return the ids (as strings) of the public top posts."""
        return [str(p.get("id")) for p in self._public_posts()]

    def story(self, ref):
        """Build the story dict for top-post id ``ref``.

        :returns: story dict, or ``False`` when the listing could not be
            fetched or holds no public post with that id.
        """
        r = next((p for p in self._public_posts() if str(p.get('id')) == ref), None)
        if not r:
            return False

        s = {}
        # A post without publication details yields empty author fields
        # instead of an AttributeError.
        pub = r.get('pub') or {}
        base_url = pub.get('base_url')
        s['author'] = pub.get('author_name')
        s['author_link'] = author_link(pub.get('author_id'), base_url)
        s['date'] = unix(r.get('post_date'))
        s['score'] = r.get('score')
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
        comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER)
        # api() returns False on failure; keep the story, just without comments.
        thread = comments.get('comments') if comments else None
        s['comments'] = [comment(i) for i in thread or []]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)

        return s

top = Top()
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    top_posts = top.feed()
    # feed() returns [] when the API call fails, so guard the index.
    if top_posts:
        print(top.story(top_posts[0]))

    # No trailing slash: the base URL is joined with paths that start with "/".
    webworm = Publication("https://www.webworm.co")
    posts = webworm.feed()
    if posts:
        print(webworm.story(posts[0]))

35
apiserver/misc/api.py Normal file
View File

@@ -0,0 +1,35 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
FORWARD_IP = '66.249.66.1'
def xml(route, ref=None):
    """GET ``route(ref)`` and return the response body as text.

    The request is sent with the module's ``USER_AGENT`` and an
    ``X-Forwarded-For`` header set to ``FORWARD_IP``, using a 5 s timeout.
    A non-200 status counts as a failure.  Ordinary errors are logged;
    ``KeyboardInterrupt`` and other non-``Exception`` signals propagate.

    :param route: callable mapping ``ref`` to the URL to fetch.
    :returns: response text, or ``False`` on any failure.
    """
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except Exception as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False
def json(route, ref=None):
    """GET ``route(ref)`` and return the response decoded as JSON.

    The request is sent with the module's ``USER_AGENT`` and an
    ``X-Forwarded-For`` header set to ``FORWARD_IP``, using a 5 s timeout.
    A non-200 status or an undecodable body counts as a failure.  Ordinary
    errors are logged; ``KeyboardInterrupt`` and other non-``Exception``
    signals propagate.

    Note: inside this module the name ``json`` refers to this function, not
    to the standard-library ``json`` module.

    :param route: callable mapping ``ref`` to the URL to fetch.
    :returns: decoded JSON, or ``False`` on any failure.
    """
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except Exception as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False

View File

@@ -0,0 +1,69 @@
def _apply_headline_and_dates(s, node):
    """Copy ``headline`` and the article dates from ``node`` into ``s``.

    ``datePublished`` is applied after ``dateModified``, so it wins when both
    are present.  Missing keys are skipped rather than raising ``KeyError``.
    """
    if 'headline' in node:
        s['title'] = node['headline']
    if node.get('dateModified'):
        s['date'] = node['dateModified']
    if node.get('datePublished'):
        s['date'] = node['datePublished']


def _first_author(author):
    """Return ``author`` itself when it is a dict, else its first list item.

    Returns ``None`` for anything else, including a plain string or an
    empty list.
    """
    if isinstance(author, dict):
        return author
    if isinstance(author, list) and author and isinstance(author[0], dict):
        return author[0]
    return None


def parse_extruct(s, data):
    """Fill ``title``, ``date`` and ``author`` in ``s`` from extruct output.

    Sources are applied in the order RDFa, OpenGraph, microdata, JSON-LD, so
    a later source overrides an earlier one.  Within a source the published
    date overrides the modified date.  Absent sections or fields are skipped.

    :param s: story dict, updated in place.
    :param data: dict with ``rdfa``, ``opengraph``, ``microdata`` and
        ``json-ld`` lists.
    :returns: the same ``s`` object.
    """
    rdfa_keys = {
        'title': [
            'http://ogp.me/ns#title',
            'https://ogp.me/ns#title',
        ],
        'date': [
            'http://ogp.me/ns/article#modified_time',
            'https://ogp.me/ns/article#modified_time',
            'http://ogp.me/ns/article#published_time',
            'https://ogp.me/ns/article#published_time',
        ]
    }
    # NOTE(review): this loop only matches when an RDFa item's *values* are
    # dicts keyed by property URL - confirm against real extruct output.
    for rdfa in data.get('rdfa') or []:
        for key, props in rdfa.items():
            if not isinstance(props, dict):
                # A string here would make `prop in props` a substring test.
                continue
            for attribute, properties in rdfa_keys.items():
                for prop in properties:
                    if prop in props:
                        for values in props[prop]:
                            s[attribute] = values['@value']

    for og in data.get('opengraph') or []:
        pairs = og.get('properties') or []
        titles = [v for k, v in pairs if 'og:title' in k and v]
        modified = [v for k, v in pairs if 'article:modified_time' in k and v]
        published = [v for k, v in pairs if 'article:published_time' in k and v]
        if modified:
            s['date'] = modified[0]
        if published:
            s['date'] = published[0]
        if titles:
            s['title'] = titles[0]

    for md in data.get('microdata') or []:
        if md.get('type') in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
            props = md.get('properties') or {}
            _apply_headline_and_dates(s, props)
            author = _first_author(props.get('author'))
            if author and 'name' in (author.get('properties') or {}):
                s['author'] = author['properties']['name']

    for ld in data.get('json-ld') or []:
        if not isinstance(ld, dict):
            continue
        if ld.get('@type') in ['Article', 'NewsArticle']:
            _apply_headline_and_dates(s, ld)
            author = _first_author(ld.get('author'))
            if author and 'name' in author:
                s['author'] = author['name']
        for gld in ld.get('@graph') or []:
            if isinstance(gld, dict) and gld.get('@type') in ['Article', 'NewsArticle']:
                _apply_headline_and_dates(s, gld)

    return s

101
apiserver/misc/news.py Normal file
View File

@@ -0,0 +1,101 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import re
import requests
from bs4 import BeautifulSoup
from scrapers import declutter
import extruct
import settings
from utils import clean
from misc.metadata import parse_extruct
from misc.time import unix
from misc.api import xml
def comment(i):
    """Convert one raw comment dict (and its replies) to the story format.

    :param i: raw comment dict; read keys are ``author``, ``points``,
        ``date``, ``text`` and ``children``.
    :returns: dict with ``author``, ``score``, ``date``, ``text`` and nested
        ``comments``; ``False`` when the comment has no ``author`` key.
    """
    if 'author' not in i:
        return False

    c = {}
    c['author'] = i.get('author', '')
    c['score'] = i.get('points', 0)
    c['date'] = unix(i.get('date', 0))
    c['text'] = clean(i.get('text', '') or '')
    # A leaf comment may carry no (or a null) ``children`` entry.
    c['comments'] = [comment(j) for j in i.get('children') or []]
    c['comments'] = list(filter(bool, c['comments']))
    return c
def comment_count(i):
    """Count the comments in a thread that have a non-empty author.

    :param i: one comment dict (with ``author`` and ``comments`` keys) or a
        list of such dicts, e.g. a story's top-level comment list.
    :returns: number of comments with a truthy ``author``, replies included.
    """
    if isinstance(i, list):
        return sum(comment_count(c) for c in i)
    alive = 1 if i['author'] else 0
    return sum(comment_count(c) for c in i['comments']) + alive
class Base:
    """Shared behaviour for feeds built from plain article URLs."""

    def __init__(self, config):
        self.config = config
        self.url = config.get('url')
        self.tz = config.get('tz')

    def get_id(self, link):
        """Derive a story ref from ``link`` using the configured ``patterns``.

        Patterns are tried in configuration order with ``re.match`` (anchored
        at the start of ``link``).  The first one whose captured groups give a
        non-empty value wins, and its groups are joined with ``:``.  Without
        patterns, or without a usable match, ``link`` itself is the ref.
        """
        patterns = self.config.get('patterns')
        if not patterns:
            return link
        for pattern in patterns:
            found = re.match(pattern, link)
            if not found:
                continue
            # Optional groups that did not take part in the match are None.
            ref = ':'.join(g for g in found.groups() if g is not None)
            if ref:
                return ref
        return link

    def feed(self, excludes=None):
        """Return the feed entries; the base implementation has none."""
        return []

    def story(self, ref, urlref):
        """Fetch ``urlref`` and build a story dict from the page's metadata.

        :param ref: story ref (not used by this implementation).
        :param urlref: article URL to download.
        :returns: story dict, or ``False`` when ``urlref`` is ``None``, the
            page could not be fetched, or no usable date was found.
        """
        # Local import so this method does not rely on a module-level import.
        from urllib.parse import urljoin

        if urlref is None:
            return False
        markup = xml(lambda x: urlref)
        if not markup:
            return False

        s = {}
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = urlref
        s['url'] = urlref
        s['date'] = 0

        soup = BeautifulSoup(markup, features='html.parser')
        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
        favicon = soup.find_all('link', rel="shortcut icon", href=True)
        others = soup.find_all('link', rel="icon", href=True)
        # Keep the 32px > 16px > shortcut > other priority while dropping
        # duplicates (a set would make the chosen icon arbitrary).
        hrefs = list(dict.fromkeys(i.get('href') for i in icon32 + icon16 + favicon + others))
        if hrefs:
            # urljoin also resolves relative and protocol-relative hrefs.
            s['icon'] = urljoin(urlref, hrefs[0])

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        if 'disqus' in markup:
            # Comments are best-effort: a failure must not lose the story.
            try:
                # NOTE(review): assumes get_comments returns comment dicts
                # with 'author' and 'comments' keys - confirm in declutter.
                found = declutter.get_comments(urlref) or []
                found = list(filter(bool, found))
                count = sum(comment_count(c) for c in found)
                s['comments'] = found
                s['num_comments'] = count
            except Exception as e:
                logging.error('Problem getting comments: {}'.format(str(e)))

        if not s['date']:
            return False
        return s

18
apiserver/misc/time.py Normal file
View File

@@ -0,0 +1,18 @@
import pytz
import dateutil.parser
TZINFOS = {
'NZDT': pytz.timezone('Pacific/Auckland'),
'NZST': pytz.timezone('Pacific/Auckland')
}
def unix(date_str, tz=None, tzinfos=TZINFOS):
    """Parse `date_str` and return it as an integer Unix timestamp.

    date_str: text understood by dateutil's parser.
    tz: optional timezone name, applied only when the parsed value carries
        no timezone of its own.
    tzinfos: mapping of timezone abbreviations passed through to dateutil.

    Returns 0 when the value cannot be parsed or converted.
    """
    try:
        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
        # localize() rejects datetimes that already have tzinfo; previously
        # that error was swallowed and every offset-carrying date became 0.
        if tz and dt.tzinfo is None:
            dt = pytz.timezone(tz).localize(dt)
        return int(dt.timestamp())
    except Exception:
        # Unparseable input (including non-string placeholders such as 0).
        return 0

View File

@@ -4,6 +4,7 @@ certifi==2020.6.20
chardet==3.0.4
click==7.1.2
commonmark==0.9.1
extruct==0.10.0
Flask==1.1.2
Flask-Cors==3.0.8
gevent==20.6.2
@@ -11,11 +12,13 @@ greenlet==0.4.16
idna==2.10
itsdangerous==1.1.0
Jinja2==2.11.2
lxml==4.6.1
MarkupSafe==1.1.1
packaging==20.4
praw==6.4.0
prawcore==1.4.0
pyparsing==2.4.7
pytz==2020.4
requests==2.24.0
six==1.15.0
soupsieve==2.0.1
@@ -27,3 +30,4 @@ websocket-client==0.57.0
Werkzeug==1.0.1
zope.event==4.4
zope.interface==5.1.0
python-dateutil==2.8.1

View File

@@ -0,0 +1,41 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
DECLUTTER_API = 'https://declutter.1j.nz/details'
DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/comments'
TIMEOUT = 30
def get_html(url):
    """Return the 'content' field of the declutter details for `url`.

    Logs the request first; yields '' when get_details returns nothing.
    """
    logging.info(f"Declutter Scraper: {url}")
    details = get_details(url)
    return details['content'] if details else ''
def get_details(url):
    """POST `url` to DECLUTTER_API and return the decoded JSON reply.

    Any failure other than KeyboardInterrupt (including a non-200 status)
    is logged and reported as None.
    """
    try:
        response = requests.post(DECLUTTER_API, data={'url': url}, timeout=TIMEOUT)
        if response.status_code == 200:
            return response.json()
        raise Exception('Bad response code ' + str(response.status_code))
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem decluttering article: {}'.format(str(e)))
    return None
def get_comments(url):
    """POST `url` to DECLUTTER_COMMENT_API and return the decoded JSON reply.

    Any failure other than KeyboardInterrupt (including a non-200 status)
    is logged and reported as None.
    """
    try:
        response = requests.post(DECLUTTER_COMMENT_API, data={'url': url}, timeout=TIMEOUT)
        if response.status_code == 200:
            return response.json()
        raise Exception('Bad response code ' + str(response.status_code))
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting comments for article: {}'.format(str(e)))
    return None

View File

@@ -0,0 +1,42 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
from settings import READER_PORT
# Endpoints of the local reader server; the port falls back to 3000 when
# READER_PORT is unset/falsy.
READ_API = 'http://127.0.0.1:{}/headless/details'.format(READER_PORT or 3000)
# Previously misspelled READ_COMMENT__API while get_comments referenced
# READ_COMMENT_API, so every comment request died with a NameError.
READ_COMMENT_API = 'http://127.0.0.1:{}/headless/comments'.format(READER_PORT or 3000)
# Backward-compatible alias for the old misspelled name.
READ_COMMENT__API = READ_COMMENT_API

# Seconds to wait for the reader server before giving up.
TIMEOUT = 60


def get_html(url):
    """Return the 'content' field of the headless details for `url`.

    Logs the request first; returns '' when get_details yields nothing.
    """
    logging.info(f"Headless Browser Scraper: {url}")
    details = get_details(url)
    if not details:
        return ''
    return details['content']


def get_details(url):
    """POST `url` to READ_API and return the decoded JSON reply.

    Any failure other than KeyboardInterrupt (including a non-200 status)
    is logged and reported as None.
    """
    try:
        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem scraping article: {}'.format(str(e)))
        return None


def get_comments(url):
    """POST `url` to READ_COMMENT_API and return the decoded JSON reply.

    Any failure other than KeyboardInterrupt (including a non-200 status)
    is logged and reported as None.
    """
    try:
        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting comments for article: {}'.format(str(e)))
        return None

View File

@@ -0,0 +1,37 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
OUTLINE_REFERER = 'https://outline.com/'
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
TIMEOUT = 20
def get_html(url):
    """Return the 'html' field of the Outline details for `url`, or '' if none."""
    details = get_details(url)
    return details['html'] if details else ''
def get_details(url):
    """Ask the Outline API to parse `url` and return its 'data' payload.

    Returns None when rate limited (after sleeping 30 seconds), when the
    response is not 200, when Outline reports the URL as unsupported, or on
    any other error except KeyboardInterrupt. Errors are logged.
    """
    # Imported here because this module never imports `time`; without it the
    # 429 branch raised NameError instead of actually backing off.
    import time

    try:
        logging.info(f"Outline Scraper: {url}")
        params = {'source_url': url}
        headers = {'Referer': OUTLINE_REFERER}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            return None
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        # Outline reports unsupported URLs inside the returned HTML itself.
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return data
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return None

View File

@@ -0,0 +1,28 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
from settings import READER_PORT
READ_API = 'http://127.0.0.1:{}/simple/details'.format(READER_PORT or 3000)
TIMEOUT = 20
def get_html(url):
    """Return the 'content' field of the simple-scraper details for `url`.

    Logs the request first; yields '' when get_details returns nothing.
    """
    logging.info(f"Simple Scraper: {url}")
    details = get_details(url)
    return details['content'] if details else ''
def get_details(url):
    """POST `url` to READ_API and return the decoded JSON reply.

    Any failure other than KeyboardInterrupt (including a non-200 status)
    is logged and reported as None.
    """
    try:
        response = requests.post(READ_API, data={'url': url}, timeout=TIMEOUT)
        if response.status_code == 200:
            return response.json()
        raise Exception('Bad response code ' + str(response.status_code))
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting article: {}'.format(str(e)))
    return None

View File

@@ -35,14 +35,11 @@ def update_rankings():
def update_attributes():
try:
json = ['title', 'url', 'author', 'link', 'id']
json = ['title', 'url', 'author', 'link', 'id', 'source']
r = requests.post(MEILI_URL + 'indexes/qotnews/settings/searchable-attributes', json=json, timeout=2)
if r.status_code != 202:
raise Exception('Bad response code ' + str(r.status_code))
return r.json()
r = requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
if r.status_code != 202:
raise Exception('Bad response code ' + str(r.status_code))
requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
return r.json()
except KeyboardInterrupt:
raise

View File

@@ -15,6 +15,7 @@ import traceback
import time
from urllib.parse import urlparse, parse_qs
import settings
import database
import search
import feed
@@ -27,9 +28,6 @@ from flask_cors import CORS
database.init()
search.init()
FEED_LENGTH = 75
news_index = 0
def new_id():
nid = gen_rand_id()
while database.get_story(nid):
@@ -42,9 +40,8 @@ cors = CORS(flask_app)
@flask_app.route('/api')
def api():
stories = database.get_stories(FEED_LENGTH)
# hacky nested json
res = Response('{"stories":[' + ','.join(stories) + ']}')
stories = database.get_stories(settings.MAX_STORY_AGE)
res = Response(json.dumps({"stories": stories}))
res.headers['content-type'] = 'application/json'
return res
@@ -73,7 +70,7 @@ def submit():
elif 'reddit.com' in parse.hostname and 'comments' in url:
source = 'reddit'
ref = parse.path.split('/')[4]
elif 'news.t0.vc' in parse.hostname:
elif settings.HOSTNAME in parse.hostname:
raise Exception('Invalid article')
else:
source = 'manual'
@@ -102,8 +99,9 @@ def submit():
def story(sid):
story = database.get_story(sid)
if story:
# hacky nested json
res = Response('{"story":' + story.full_json + '}')
related = database.get_stories_by_url(story.meta['url'])
related = [r.meta for r in related]
res = Response(json.dumps({"story": story.data, "related": related}))
res.headers['content-type'] = 'application/json'
return res
else:
@@ -114,7 +112,7 @@ def story(sid):
def index():
return render_template('index.html',
title='Feed',
url='news.t0.vc',
url=settings.HOSTNAME,
description='Reddit, Hacker News, and Tildes combined, then pre-rendered in reader mode')
@flask_app.route('/<sid>', strict_slashes=False)
@@ -127,7 +125,7 @@ def static_story(sid):
story = database.get_story(sid)
if not story: return abort(404)
story = json.loads(story.full_json)
story = story.data
score = story['score']
num_comments = story['num_comments']
@@ -144,54 +142,51 @@ def static_story(sid):
url=url,
description=description)
http_server = WSGIServer(('', 33842), flask_app)
http_server = WSGIServer(('', settings.API_PORT or 33842), flask_app)
def _add_new_refs():
    """Register every ref from feed.get_list() that has no story yet.

    Each new ref gets a fresh id via new_id() and is stored with its source
    and urlref. A database.IntegrityError on insert skips that ref.
    """
    for ref, source, urlref in feed.get_list():
        already_known = database.get_story_by_ref(ref)
        if not already_known:
            try:
                database.put_ref(ref, new_id(), source, urlref)
                logging.info('Added ref ' + ref)
            except database.IntegrityError:
                pass
def _update_current_story(item):
    """Refresh the story described by `item`, or drop its ref if invalid.

    `item` supplies 'sid', 'ref', 'source' and 'urlref'. When loading the
    stored story raises AttributeError, a minimal dict is used instead. A
    valid update is written to the database and the search index;
    otherwise the ref is deleted.
    """
    sid = item['sid']
    try:
        story = database.get_story(sid).data
    except AttributeError:
        story = {'id': sid, 'ref': item['ref'], 'source': item['source']}

    logging.info('Updating story: {}'.format(str(story['ref'])))

    if feed.update_story(story, urlref=item['urlref']):
        database.put_story(story)
        search.put_story(story)
        return

    database.del_ref(item['ref'])
    logging.info('Removed ref {}'.format(item['ref']))
def feed_thread():
global news_index
ref_list = []
try:
while True:
# onboard new stories
if news_index == 0:
for ref, source in feed.list():
if database.get_story_by_ref(ref):
continue
try:
nid = new_id()
database.put_ref(ref, nid, source)
logging.info('Added ref ' + ref)
except database.IntegrityError:
continue
ref_list = database.get_reflist(FEED_LENGTH)
if not len(ref_list):
_add_new_refs()
ref_list = database.get_reflist()
# update current stories
if news_index < len(ref_list):
item = ref_list[news_index]
try:
story_json = database.get_story(item['sid']).full_json
story = json.loads(story_json)
except AttributeError:
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
valid = feed.update_story(story)
if valid:
database.put_story(story)
search.put_story(story)
else:
database.del_ref(item['ref'])
logging.info('Removed ref {}'.format(item['ref']))
else:
logging.info('Skipping index: ' + str(news_index))
if len(ref_list):
item = ref_list.pop(0)
_update_current_story(item)
gevent.sleep(6)
news_index += 1
if news_index == FEED_LENGTH: news_index = 0
except KeyboardInterrupt:
logging.info('Ending feed thread...')
except ValueError as e:

View File

@@ -1,12 +1,59 @@
# QotNews settings
# edit this file and save it as settings.py
HOSTNAME = 'news.t0.vc'
MAX_STORY_AGE = 3*24*60*60
SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
API_PORT = 33842
READER_PORT = 3000
# Feed Lengths
# Number of top items from each site to pull
# set to 0 to disable that site
NUM_HACKERNEWS = 15
NUM_REDDIT = 10
NUM_TILDES = 5
NUM_SUBSTACK = 10
SITEMAP = {}
# SITEMAP['nzherald'] = {
# 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
# 'count': 20,
# 'patterns': [
# r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$',
# ],
# 'excludes': [
# 'driven.co.nz',
# 'oneroof.co.nz',
# 'nzherald.co.nz/sponsored-stories',
# 'nzherald.co.nz/entertainment/',
# 'nzherald.co.nz/lifestyle/',
# 'nzherald.co.nz/travel/',
# 'nzherald.co.nz/sport/',
# 'nzherald.co.nz/promotions/',
# 'nzherald.co.nzhttp',
# 'herald-afternoon-quiz',
# 'herald-morning-quiz'
# ],
# }
SUBSTACK = {}
# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10}
# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10}
CATEGORY = {}
# CATEGORY['radionz'] = {
# 'url': "https://www.rnz.co.nz/news/",
# 'count': 20,
# 'patterns': [
# r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'
# ],
# 'excludes': [
# 'rnz.co.nz/news/sport',
# 'rnz.co.nz/weather',
# ],
# }
# Reddit account info
# leave blank if not using Reddit
@@ -14,6 +61,10 @@ REDDIT_CLIENT_ID = ''
REDDIT_CLIENT_SECRET = ''
REDDIT_USER_AGENT = ''
# Minimum points or number of comments before including a thread:
REDDIT_COMMENT_THRESHOLD = 10
REDDIT_SCORE_THRESHOLD = 25
SUBREDDITS = [
'Economics',
'AcademicPhilosophy',

1
readerserver Submodule

Submodule readerserver added at 9c0336b0af

View File

@@ -1,92 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Editor
*.swp
*.swo

View File

@@ -1,53 +0,0 @@
const express = require('express');
const app = express();
const port = 33843;
const request = require('request');
const JSDOM = require('jsdom').JSDOM;
const { Readability } = require('readability');
// Parse URL-encoded form bodies so handlers can read submitted fields from req.body.
app.use(express.urlencoded({ extended: true }));
// GET / serves a minimal HTML form with a single "url" field that POSTs back to /.
app.get('/', (req, res) => {
res.send('<form method="POST" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>');
});
/**
 * Build the completion callback for a page fetch.
 *
 * On a 200 response the body is parsed with JSDOM + Readability and the
 * extracted article content is sent; a page without article content yields 404.
 * On failure the upstream status code is relayed, or 404 when there is no response.
 *
 * @param {string} url - Page URL, used as the document URL for JSDOM.
 * @param {object} res - Express response to answer on.
 * @returns {Function} Callback taking (error, response, body).
 */
const requestCallback = (url, res) => (error, response, body) => {
  // Loose comparison kept on purpose to mirror the original status check.
  if (error || response.statusCode != 200) {
    console.log('Response error:', error ? error.toString() : response.statusCode);
    res.sendStatus(response ? response.statusCode : 404);
    return;
  }

  console.log('Response OK.');
  const { document } = new JSDOM(body, { url }).window;
  const article = new Readability(document).parse();
  if (article?.content) {
    res.send(article.content);
    return;
  }
  res.sendStatus(404);
};
// POST / fetches the submitted URL and answers through requestCallback.
app.post('/', (req, res) => {
  const { url } = req.body;
  console.log('Parse request for:', url);

  // Alternative User-Agent values tried previously:
  //   'Googlebot/2.1 (+http://www.google.com/bot.html)'
  //   'Twitterbot/1.0'
  const headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'X-Forwarded-For': '66.249.66.1',
  };

  request({ url, headers }, requestCallback(url, res));
});
// Start the HTTP server on `port` and log once it is listening.
app.listen(port, () => {
console.log(`Example app listening on port ${port}!`);
});

View File

@@ -1,13 +0,0 @@
{
"name": "readerserver",
"version": "1.0.0",
"main": "main.js",
"license": "MIT",
"dependencies": {
"dompurify": "^1.0.11",
"express": "^4.17.1",
"jsdom": "^15.1.1",
"readability": "https://github.com/mozilla/readability",
"request": "^2.88.0"
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -5,13 +5,14 @@ import './Style-light.css';
import './Style-dark.css';
import './fonts/Fonts.css';
import { ForwardDot } from './utils.js';
import Feed from './Feed.js';
import Article from './Article.js';
import Comments from './Comments.js';
import Search from './Search.js';
import Submit from './Submit.js';
import Results from './Results.js';
import ScrollToTop from './ScrollToTop.js';
import Feed from './pages/Feed.js';
import Article from './pages/Article.js';
import Comments from './pages/Comments.js';
import Results from './pages/Results.js';
class App extends React.Component {
constructor(props) {

View File

@@ -0,0 +1,34 @@
import React from "react";
import { Link } from "react-router-dom";
import { sourceLink, infoLine, getLogoUrl } from "../utils.js";
export class StoryItem extends React.Component {
constructor(props) {
super(props);
}
render() {
const story = this.props.story;
const { id, title } = story;
return (
<div className="item" key={id}>
<div className="title">
<Link className="link" to={"/" + id}>
<img
className="source-logo"
src={getLogoUrl(story)}
alt="source logo"
/>
{" "}
{title}
</Link>
<span className="source">({sourceLink(story)})</span>
</div>
{infoLine(story)}
</div>
);
}
}

View File

@@ -1,7 +1,7 @@
import React from 'react';
import { Helmet } from 'react-helmet';
import localForage from 'localforage';
import { sourceLink, infoLine, ToggleDot } from './utils.js';
import { sourceLink, infoLine, ToggleDot } from '../utils.js';
class Article extends React.Component {
constructor(props) {
@@ -18,7 +18,7 @@ class Article extends React.Component {
pConv: [],
};
}
componentDidMount() {
const id = this.props.match ? this.props.match.params.id : 'CLOL';
@@ -45,7 +45,7 @@ class Article extends React.Component {
}
pConvert = (n) => {
this.setState({ pConv: [...this.state.pConv, n]});
this.setState({ pConv: [...this.state.pConv, n] });
}
render() {
@@ -85,10 +85,10 @@ class Article extends React.Component {
v.innerHTML.split('\n\n').map(x =>
<p dangerouslySetInnerHTML={{ __html: x }} />
)
:
:
(v.nodeName === '#text' ?
<p>{v.data}</p>
:
:
<>
<v.localName dangerouslySetInnerHTML={v.innerHTML ? { __html: v.innerHTML } : null} />
{v.localName == 'pre' && <button onClick={() => this.pConvert(k)}>Convert Code to Paragraph</button>}
@@ -96,11 +96,11 @@ class Article extends React.Component {
)
)}
</div>
:
:
<p>Problem getting article :(</p>
}
</div>
:
:
<p>loading...</p>
}
<ToggleDot id={id} article={false} />

View File

@@ -4,7 +4,7 @@ import { HashLink } from 'react-router-hash-link';
import { Helmet } from 'react-helmet';
import moment from 'moment';
import localForage from 'localforage';
import { infoLine, ToggleDot } from './utils.js';
import { infoLine, ToggleDot } from '../utils.js';
class Article extends React.Component {
constructor(props) {
@@ -72,7 +72,7 @@ class Article extends React.Component {
}
displayComment(story, c, level) {
const cid = c.author+c.date;
const cid = c.author + c.date;
const collapsed = this.state.collapsed.includes(cid);
const expanded = this.state.expanded.includes(cid);
@@ -85,19 +85,22 @@ class Article extends React.Component {
<div className='info'>
<p>
{c.author === story.author ? '[OP]' : ''} {c.author || '[Deleted]'}
{' '} | <HashLink to={'#'+cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
{' '} | <HashLink to={'#' + cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
{hidden || hasChildren &&
<span className='collapser pointer' onClick={() => this.collapseComment(cid)}></span>
}
{hasChildren && (
hidden ?
<span className='collapser expander pointer' onClick={() => this.expandComment(cid)}>+</span>
:
<span className='collapser pointer' onClick={() => this.collapseComment(cid)}></span>
)}
</p>
</div>
<div className={collapsed ? 'text hidden' : 'text'} dangerouslySetInnerHTML={{ __html: c.text }} />
<div className={collapsed ? 'text hidden' : 'text'} dangerouslySetInnerHTML={{ __html: c.text }} />
{hidden && hasChildren ?
<div className='comment lined info pointer' onClick={() => this.expandComment(cid)}>[show {this.countComments(c)-1} more]</div>
:
<div className='comment lined info pointer' onClick={() => this.expandComment(cid)}>[show {this.countComments(c) - 1} more]</div>
:
c.comments.map(i => this.displayComment(story, i, level + 1))
}
</div>
@@ -130,7 +133,7 @@ class Article extends React.Component {
{story.comments.map(c => this.displayComment(story, c, 0))}
</div>
</div>
:
:
<p>loading...</p>
}
<ToggleDot id={id} article={true} />

View File

@@ -1,8 +1,7 @@
import React from 'react';
import { Link } from 'react-router-dom';
import { Helmet } from 'react-helmet';
import localForage from 'localforage';
import { sourceLink, infoLine, logos } from './utils.js';
import { StoryItem } from '../components/StoryItem.js';
class Feed extends React.Component {
constructor(props) {
@@ -22,20 +21,21 @@ class Feed extends React.Component {
const updated = !this.state.stories || this.state.stories[0].id !== result.stories[0].id;
console.log('updated:', updated);
this.setState({ stories: result.stories });
localStorage.setItem('stories', JSON.stringify(result.stories));
const { stories } = result;
this.setState({ stories });
localStorage.setItem('stories', JSON.stringify(stories));
if (updated) {
localForage.clear();
result.stories.forEach((x, i) => {
stories.forEach((x, i) => {
fetch('/api/' + x.id)
.then(res => res.json())
.then(result => {
localForage.setItem(x.id, result.story)
.then(({ story }) => {
localForage.setItem(x.id, story)
.then(console.log('preloaded', x.id, x.title));
this.props.updateCache(x.id, result.story);
}, error => {}
);
this.props.updateCache(x.id, story);
}, error => { }
);
});
}
},
@@ -55,27 +55,7 @@ class Feed extends React.Component {
<title>Feed - QotNews</title>
</Helmet>
{error && <p>Connection error?</p>}
{stories ?
<div>
{stories.map(x =>
<div className='item' key={x.id}>
<div className='title'>
<Link className='link' to={'/' + x.id}>
<img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
</Link>
<span className='source'>
({sourceLink(x)})
</span>
</div>
{infoLine(x)}
</div>
)}
</div>
:
<p>loading...</p>
}
{stories ? stories.map(story => <StoryItem story={story}></StoryItem>) : <p>loading...</p>}
</div>
);
}

View File

@@ -1,8 +1,7 @@
import React from 'react';
import { Link } from 'react-router-dom';
import { Helmet } from 'react-helmet';
import { sourceLink, infoLine, logos } from './utils.js';
import AbortController from 'abort-controller';
import { StoryItem } from '../components/StoryItem.js';
class Results extends React.Component {
constructor(props) {
@@ -63,28 +62,10 @@ class Results extends React.Component {
<>
<p>Search results:</p>
<div className='comment lined'>
{stories.length ?
stories.map(x =>
<div className='item' key={x.id}>
<div className='title'>
<Link className='link' to={'/' + x.id}>
<img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
</Link>
<span className='source'>
({sourceLink(x)})
</span>
</div>
{infoLine(x)}
</div>
)
:
<p>none</p>
}
{stories ? stories.map(story => <StoryItem story={story}></StoryItem>) : <p>loading...</p>}
</div>
</>
:
:
<p>loading...</p>
}
</div>

File diff suppressed because one or more lines are too long