Compare commits


33 Commits

Author SHA1 Message Date
Jason Schwarzenberger bfa4108a8e Merge remote-tracking branch 'tanner/master' 4 years ago
Jason Schwarzenberger 0bd0d40a31 use json type in sqlite. 4 years ago
Jason Schwarzenberger 4e04595415 fix search. 4 years ago
Jason 006db2960c change to 3 days 4 years ago
Jason Schwarzenberger 1f063f0dac undo log level change 4 years ago
Jason Schwarzenberger 1658346aa9 fix news.py feed. 4 years ago
Jason Schwarzenberger 2dbc702b40 switch to python-dateutil for parser, reverse sort xml feeds. 4 years ago
Jason Schwarzenberger 1c4764e67d sort sitemap feed by lastmod time. 4 years ago
Jason ee49d2021e newsroom 4 years ago
Jason c391c50ab1 use localize 4 years ago
Jason Schwarzenberger 095f0d549a use replace. 4 years ago
Jason Schwarzenberger c21c71667e fix date issue. 4 years ago
Jason Schwarzenberger c3a2c91a11 update requirements.txt 4 years ago
Jason Schwarzenberger 0f39446a61 tz aware for use in settings. 4 years ago
Jason Schwarzenberger 351059aab1 fix excludes. 4 years ago
Jason Schwarzenberger 4488e2c292 add an `excludes` list of substrings for urls in the settings for sitemap/category. 4 years ago
Jason Schwarzenberger afda5b635c disqus test. 4 years ago
Jason Schwarzenberger 0fc1a44d2b fix issue in substack. 4 years ago
Jason Schwarzenberger 9fff1b9e46 avoid duplicate articles listed on the category page 4 years ago
Jason Schwarzenberger 16b59f6c67 try stop bad pages. 4 years ago
Jason Schwarzenberger 939f4775a7 better settings example. 4 years ago
Jason Schwarzenberger 9bfc6fc6fa scraper settings, ordering and loop. 4 years ago
Jason Schwarzenberger 6ea9844d00 remove useless try blocks. 4 years ago
Jason Schwarzenberger 1318259d3d imply referrer is substack. 4 years ago
Jason Schwarzenberger 98a0c2257c increase declutter timeout. 4 years ago
Jason Schwarzenberger e6976db25d fix tabs 4 years ago
Jason Schwarzenberger 9edc8b7cca move scraping for article content to files. 4 years ago
Jason Schwarzenberger 33e21e7f30 fix mistake. 4 years ago
Jason Schwarzenberger 892a99eca6 add + expander in place of collapser. 4 years ago
Jason Schwarzenberger d718d05a04 fix dates for newsroom. 4 years ago
Jason Schwarzenberger d1795eb1b8 add radionz and newsroom logos. 4 years ago
Tanner Collin 9a279d44b1 Add header to get content type 4 years ago
Tanner Collin e506804666 Clean code up 4 years ago
17 changed files (lines changed):

apiserver/database.py (23)
apiserver/feed.py (83)
apiserver/feeds/news.py (181)
apiserver/feeds/substack.py (25)
apiserver/requirements.txt (2)
apiserver/scrapers/declutter.py (41)
apiserver/scrapers/local.py (27)
apiserver/scrapers/outline.py (37)
apiserver/search.py (5)
apiserver/server.py (11)
apiserver/settings.py.example (21)
readerserver/main.js (52)
readerserver/simple.js (43)
webclient/src/Comments.js (23)
webclient/src/Feed.js (12)
webclient/src/Results.js (6)
webclient/src/utils.js (2)

apiserver/database.py

@@ -4,6 +4,7 @@ from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError
+from sqlalchemy.types import JSON

 engine = create_engine('sqlite:///data/qotnews.sqlite')
 Session = sessionmaker(bind=engine)

@@ -15,8 +16,8 @@ class Story(Base):
     sid = Column(String(16), primary_key=True)
     ref = Column(String(16), unique=True)
-    meta_json = Column(String)
-    full_json = Column(String)
+    meta = Column(JSON)
+    data = Column(JSON)
     title = Column(String)

 class Reflist(Base):

@@ -36,19 +37,21 @@ def get_story(sid):

 def put_story(story):
     story = story.copy()
-    full_json = json.dumps(story)
-
-    story.pop('text', None)
-    story.pop('comments', None)
-    meta_json = json.dumps(story)
+    data = {}
+    data.update(story)
+
+    meta = {}
+    meta.update(story)
+    meta.pop('text', None)
+    meta.pop('comments', None)

     try:
         session = Session()
         s = Story(
             sid=story['id'],
             ref=story['ref'],
-            full_json=full_json,
-            meta_json=meta_json,
+            data=data,
+            meta=meta,
             title=story.get('title', None),
         )
         session.merge(s)

@@ -70,10 +73,10 @@ def get_reflist(amount):

 def get_stories(amount):
     session = Session()
-    q = session.query(Reflist, Story.meta_json).\
-        order_by(Reflist.rid.desc()).\
+    q = session.query(Reflist, Story.meta).\
         join(Story).\
         filter(Story.title != None).\
+        order_by(Story.meta['date'].desc()).\
         limit(amount)
     return [x[1] for x in q]
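The schema change above replaces the serialized meta_json/full_json strings with SQLAlchemy's JSON column type, which is what lets get_stories() push the date ordering into the query instead of sorting in the client. A minimal, self-contained sketch of the pattern (table and values invented, 1.x-style declarative API as used here):

# Standalone sketch of the JSON-column pattern the diff adopts: SQLAlchemy
# serializes dicts transparently, and SQLite's JSON1 functions let the
# query index into the stored document.
from sqlalchemy import Column, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.types import JSON

Base = declarative_base()

class Story(Base):
    __tablename__ = 'stories'
    sid = Column(String(16), primary_key=True)
    meta = Column(JSON)  # stored as JSON text, read back as a dict

engine = create_engine('sqlite://')  # in-memory database for the demo
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

session.add(Story(sid='abc', meta={'title': 'Hello', 'date': 1605000000}))
session.commit()

# Order by a key inside the JSON document, as get_stories() now does:
q = session.query(Story.meta).order_by(Story.meta['date'].desc())
print([row[0]['title'] for row in q])  # -> ['Hello']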

apiserver/feed.py

@@ -9,22 +9,23 @@ from bs4 import BeautifulSoup

 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
+from scrapers import outline, declutter, local

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-READ_API = 'http://127.0.0.1:33843'
+ONE_HOUR = 60*60
+ONE_DAY = 24*ONE_HOUR

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
-TWO_DAYS = 60*60*24*2
+MAX_AGE_IN_DAYS = 3*ONE_DAY

 substacks = {}
 for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'])
+    categories[key] = news.Category(value['url'], value.get('tz'))
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'])
+    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))

 def list():
     feed = []

@@ -45,53 +46,49 @@ def list():
         feed += [(x, key) for x in publication.feed()[:count]]

     for key, sites in categories.items():
-        count = settings.CATEGORY[key]['count']
-        feed += [(x, key) for x in sites.feed()[:count]]
+        count = settings.CATEGORY[key].get('count') or 0
+        excludes = settings.CATEGORY[key].get('excludes')
+        tz = settings.CATEGORY[key].get('tz')
+        feed += [(x, key) for x in sites.feed(excludes)[:count]]

     for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key]['count']
-        feed += [(x, key) for x in sites.feed()[:count]]
+        count = settings.SITEMAP[key].get('count') or 0
+        excludes = settings.SITEMAP[key].get('excludes')
+        feed += [(x, key) for x in sites.feed(excludes)[:count]]

     return feed

 def get_article(url):
-    try:
-        params = {'source_url': url}
-        headers = {'Referer': 'https://outline.com/'}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return ''
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        html = r.json()['data']['html']
-        if 'URL is not supported by Outline' in html:
-            raise Exception('URL not supported by Outline')
-        return html
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
-
-    logging.info('Trying our server instead...')
-
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=20)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-    return ''
+    scrapers = {
+        'declutter': declutter,
+        'outline': outline,
+        'local': local,
+    }
+    available = settings.SCRAPERS or ['local']
+    if 'local' not in available:
+        available += ['local']
+
+    for scraper in available:
+        if scraper not in scrapers.keys():
+            continue
+        try:
+            html = scrapers[scraper].get_html(url)
+            if html:
+                return html
+        except KeyboardInterrupt:
+            raise
+        except:
+            pass
+    return ''

 def get_content_type(url):
     try:
-        headers = {'User-Agent': 'Twitterbot/1.0'}
-        return requests.get(url, headers=headers, timeout=2).headers['content-type']
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+            'X-Forwarded-For': '66.249.66.1',
+        }
+        return requests.get(url, headers=headers, timeout=5).headers['content-type']
     except:
         pass

@@ -127,7 +124,7 @@ def update_story(story, is_manual=False):
         logging.info('Story not ready yet')
         return False

-    if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
+    if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time():
         logging.info('Story too old, removing')
         return False
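get_article() is now a priority chain over the scraper modules, with 'local' always appended as the last resort. A self-contained sketch of the same fallback logic (stub scrapers stand in for the real declutter/outline/local modules):

# Fallback-chain sketch; the stubs below are stand-ins for illustration only.
class _StubScraper:
    def __init__(self, html):
        self.html = html
    def get_html(self, url):
        return self.html

scrapers = {
    'declutter': _StubScraper(''),           # pretend this one fails
    'outline': _StubScraper('<p>hi</p>'),    # this one succeeds
    'local': _StubScraper('<p>local</p>'),
}

def get_article(url, preferred=('declutter', 'outline')):
    # Always keep 'local' as the last resort, mirroring the diff.
    order = list(preferred)
    if 'local' not in order:
        order.append('local')
    for name in order:
        if name not in scrapers:
            continue
        try:
            html = scrapers[name].get_html(url)
            if html:
                return html
        except Exception:
            continue  # fall through to the next scraper
    return ''

print(get_article('https://example.com'))  # -> '<p>hi</p>'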

apiserver/feeds/news.py

@@ -10,29 +10,27 @@ if __name__ == '__main__':

 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
+from scrapers import declutter
+import dateutil.parser
 import extruct
+import pytz

 from utils import clean

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
+#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

-def unix(date_str):
-    date_tzfix = date_str
-    if ":" == date_tzfix[-3]:
-        date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
-    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
-    for f in formats:
-        try:
-            return int(datetime.strptime(date_str, f).timestamp())
-        except:
-            pass
-        try:
-            return int(datetime.strptime(date_tzfix, f).timestamp())
-        except:
-            pass
+def unix(date_str, tz=None):
+    try:
+        dt = dateutil.parser.parse(date_str)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    except:
+        pass
     return 0
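The rewritten unix() swaps the hand-rolled strptime format list for dateutil's permissive parser, with pytz attaching a zone to naive timestamps (the tz values come from the new settings). A rough demonstration of the behaviour (timestamps invented; the original swallows all exceptions, this sketch narrows them):

import dateutil.parser
import pytz

def unix(date_str, tz=None):
    try:
        dt = dateutil.parser.parse(date_str)
        if tz:
            # Attach a zone to naive timestamps; pytz handles DST rules.
            dt = pytz.timezone(tz).localize(dt)
        return int(dt.timestamp())
    except (ValueError, OverflowError, TypeError):
        return 0

# Offset-aware strings need no tz hint:
print(unix('2020-11-12T08:30:00+13:00'))
# Naive strings from NZ outlets can be localized explicitly:
print(unix('2020-11-12 08:30', tz='Pacific/Auckland'))
# Garbage degrades to 0 instead of raising:
print(unix('not a date'))  # -> 0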
 def xml(route, ref=None):
     try:
         headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}

@@ -46,6 +44,7 @@ def xml(route, ref=None):
         logging.error('Problem hitting URL: {}'.format(str(e)))
         return False

 def parse_extruct(s, data):
     for rdfa in data['rdfa']:
         for key, props in rdfa.items():
@@ -54,22 +53,19 @@ def parse_extruct(s, data):
                     s['title'] = values['@value']
             if 'http://ogp.me/ns/article#modified_time' in props:
                 for values in props['http://ogp.me/ns/article#modified_time']:
-                    print(f"modified_time: {values['@value']}")
-                    s['date'] = unix(values['@value'])
+                    s['date'] = values['@value']
             if 'http://ogp.me/ns/article#published_time' in props:
                 for values in props['http://ogp.me/ns/article#published_time']:
-                    print(f"published_time: {values['@value']}")
-                    s['date'] = unix(values['@value'])
+                    s['date'] = values['@value']

     for og in data['opengraph']:
         titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
         modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
         published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
         if len(modified):
-            s['date'] = unix(modified[0])
+            s['date'] = modified[0]
         if len(published):
-            s['date'] = unix(published[0])
-        s['date'] = unix(published[0] or modified[0] or '')
+            s['date'] = published[0]
         if len(titles):
             s['title'] = titles[0]
@@ -78,35 +74,56 @@ def parse_extruct(s, data):
             props = md['properties']
             s['title'] = props['headline']
             if props['dateModified']:
-                s['date'] = unix(props['dateModified'])
+                s['date'] = props['dateModified']
             if props['datePublished']:
-                s['date'] = unix(props['datePublished'])
+                s['date'] = props['datePublished']
             if 'author' in props and props['author']:
                 s['author'] = props['author']['properties']['name']

     for ld in data['json-ld']:
-        if ld['@type'] == 'Article':
+        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
             s['title'] = ld['headline']
             if ld['dateModified']:
-                s['date'] = unix(ld['dateModified'])
+                s['date'] = ld['dateModified']
             if ld['datePublished']:
-                s['date'] = unix(ld['datePublished'])
+                s['date'] = ld['datePublished']
             if 'author' in ld and ld['author']:
                 s['author'] = ld['author']['name']
+        if '@graph' in ld:
+            for gld in ld['@graph']:
+                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
+                    s['title'] = gld['headline']
+                    if gld['dateModified']:
+                        s['date'] = gld['dateModified']
+                    if gld['datePublished']:
+                        s['date'] = gld['datePublished']

     return s
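The new @graph branch matters because some CMSes emit a single JSON-LD object that wraps its entities in a @graph array instead of exposing a top-level Article. A minimal illustration (payload invented):

# Why the @graph walk is needed; this payload is made up for the demo.
data = {'json-ld': [{
    '@context': 'https://schema.org',
    '@graph': [
        {'@type': 'WebPage', 'name': 'wrapper'},
        {'@type': 'NewsArticle', 'headline': 'Example story',
         'dateModified': '2020-11-12T08:30:00+13:00',
         'datePublished': '2020-11-11T21:00:00+13:00'},
    ],
}]}

s = {}
for ld in data['json-ld']:
    if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
        s['title'] = ld['headline']          # top-level entity
    if '@graph' in ld:
        for gld in ld['@graph']:             # entity nested inside a graph
            if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
                s['title'] = gld['headline']
                s['date'] = gld['datePublished']

print(s)  # -> {'title': 'Example story', 'date': '2020-11-11T21:00:00+13:00'}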
-class Sitemap:
-    def __init__(self, url):
-        self.sitemap_url = url
-
-    def feed(self):
-        markup = xml(lambda x: self.sitemap_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        return [x.find('loc').text for x in articles] or []
+def comment(i):
+    if 'author' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('author', '')
+    c['score'] = i.get('points', 0)
+    c['date'] = unix(i.get('date', 0))
+    c['text'] = clean(i.get('text', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    alive = 1 if i['author'] else 0
+    return sum([comment_count(c) for c in i['comments']]) + alive
+
+class _Base:
+    def __init__(self, url, tz=None):
+        self.url = url
+        self.tz = tz
+
+    def feed(self, excludes=None):
+        return []

     def story(self, ref):
         markup = xml(lambda x: ref)
@@ -124,14 +141,58 @@ class Sitemap:
         data = extruct.extract(markup)
         s = parse_extruct(s, data)
+
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            try:
+                s['comments'] = declutter.get_comments(ref)
+                s['comments'] = list(filter(bool, s['comments']))
+                s['num_comments'] = comment_count(s['comments'])
+            except KeyboardInterrupt:
+                raise
+            except:
+                pass
+
+        if not s['date']:
+            return False
         return s
+def get_sitemap_date(a):
+    if a.find('lastmod'):
+        return a.find('lastmod').text
+    if a.find('news:publication_date'):
+        return a.find('news:publication_date').text
+    return ''
+
+class Sitemap(_Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.sitemap_url = url
+
+    def feed(self, excludes=None):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return []
+        soup = BeautifulSoup(markup, features='lxml')
+        sitemap = soup.find('urlset').findAll('url')
+        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
+        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
+        links = [x.find('loc').text for x in links] or []
+        links = list(set(links))
+        if excludes:
+            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+        return links
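get_sitemap_date() accepts either a <lastmod> or a <news:publication_date> element, so Google News sitemaps sort the same way as plain ones. A small demonstration of the filter-and-sort step (XML invented; the real feed() sorts via unix() rather than string order):

from bs4 import BeautifulSoup

markup = """<urlset>
  <url><loc>https://example.com/old</loc><lastmod>2020-11-10T08:00:00+13:00</lastmod></url>
  <url><loc>https://example.com/new</loc><lastmod>2020-11-12T08:00:00+13:00</lastmod></url>
  <url><loc>https://example.com/undated</loc></url>
</urlset>"""

soup = BeautifulSoup(markup, features='lxml')
urls = soup.find('urlset').findAll('url')
# Keep only entries with both a location and a date, as the diff does:
urls = [u for u in urls if u.find('loc') and u.find('lastmod')]
# Newest first; lexicographic order works here because the demo offsets match.
urls.sort(key=lambda u: u.find('lastmod').text, reverse=True)
print([u.find('loc').text for u in urls])
# -> ['https://example.com/new', 'https://example.com/old']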
-class Category:
-    def __init__(self, url):
+class Category(_Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
         self.category_url = url
         self.base_url = '/'.join(url.split('/')[:3])

-    def feed(self):
+    def feed(self, excludes=None):
         markup = xml(lambda x: self.category_url)
         if not markup: return []
         soup = BeautifulSoup(markup, features='html.parser')
@@ -139,42 +200,30 @@ class Category:
         links = [link.get('href') for link in links]
         links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
         links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
+        links = list(filter(None, [link if link != self.category_url else None for link in links]))
+        links = list(set(links))
+        if excludes:
+            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
         return links

-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        return s
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Sitemap: Stuff")
-    site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
+    site = Sitemap("https://www.stuff.co.nz/sitemap/news/sitemap.xml")
     posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))

-    print("Sitemap: NZ Herald")
-    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
+    print("Category: RadioNZ Te Ao Māori")
+    site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
     posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))

-    print("Category: RadioNZ Te Ao Māori")
-    site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
+    print("Sitemap: Newsroom")
+    site = Sitemap("https://www.newsroom.co.nz/sitemap.xml")
     posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))

apiserver/feeds/substack.py

@@ -12,6 +12,7 @@ from datetime import datetime

 from utils import clean

+SUBSTACK_REFERER = 'https://substack.com'
 SUBSTACK_API_TOP_POSTS = lambda x: "https://substack.com/api/v1/reader/top-posts"

 def author_link(author_id, base_url):

@@ -24,9 +25,10 @@ def api_stories(x, base_url):
 def unix(date_str):
     return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())

-def api(route, ref=None):
+def api(route, ref=None, referer=None):
+    headers = {'Referer': referer} if referer else None
     try:
-        r = requests.get(route(ref), timeout=5)
+        r = requests.get(route(ref), headers=headers, timeout=10)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()

@@ -36,7 +38,7 @@ def api(route, ref=None):
         logging.error('Problem hitting Substack API: {}, trying again'.format(str(e)))

     try:
-        r = requests.get(route(ref), timeout=15)
+        r = requests.get(route(ref), headers=headers, timeout=20)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()

@@ -65,12 +67,14 @@ class Publication:
         self.BASE_DOMAIN = domain

     def feed(self):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        if not stories: return []
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         return [str(i.get("id")) for i in stories or []]

     def story(self, ref):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        if not stories: return False
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))

@@ -90,7 +94,7 @@ class Publication:
         s['title'] = r.get('title', '')
         s['link'] = r.get('canonical_url', '')
         s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'))
+        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN)
         s['comments'] = [comment(i) for i in comments.get('comments')]
         s['comments'] = list(filter(bool, s['comments']))
         s['num_comments'] = r.get('comment_count', 0)

@@ -113,12 +117,14 @@ class Publication:
 class Top:
     def feed(self):
-        stories = api(SUBSTACK_API_TOP_POSTS)
+        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        if not stories: return []
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         return [str(i.get("id")) for i in stories or []]

     def story(self, ref):
-        stories = api(SUBSTACK_API_TOP_POSTS)
+        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        if not stories: return False
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))

@@ -140,7 +146,7 @@ class Top:
         s['title'] = r.get('title', '')
         s['link'] = r.get('canonical_url', '')
         s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, base_url), r.get('id'))
+        comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER)
         s['comments'] = [comment(i) for i in comments.get('comments')]
         s['comments'] = list(filter(bool, s['comments']))
         s['num_comments'] = r.get('comment_count', 0)

@@ -156,5 +162,4 @@ if __name__ == '__main__':
     webworm = Publication("https://www.webworm.co/")
     posts = webworm.feed()
-    print(posts[:1])
     print(webworm.story(posts[0]))
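Every Substack call now goes through api() with an explicit Referer and a short-then-long retry. A condensed sketch of that pattern (not the verbatim module):

import logging
import requests

def api(route, ref=None, referer=None):
    headers = {'Referer': referer} if referer else None
    for timeout in (10, 20):  # short first attempt, then one slower retry
        try:
            r = requests.get(route(ref), headers=headers, timeout=timeout)
            if r.status_code != 200:
                raise Exception('Bad response code ' + str(r.status_code))
            return r.json()
        except KeyboardInterrupt:
            raise
        except BaseException as e:
            logging.error('Problem hitting Substack API: {}'.format(str(e)))
    return False  # callers now check for this instead of crashing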

apiserver/requirements.txt

@@ -18,6 +18,7 @@ packaging==20.4
 praw==6.4.0
 prawcore==1.4.0
 pyparsing==2.4.7
+pytz==2020.4
 requests==2.24.0
 six==1.15.0
 soupsieve==2.0.1

@@ -29,3 +30,4 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 zope.event==4.4
 zope.interface==5.1.0
+python-dateutil==2.8.1

apiserver/scrapers/declutter.py (new file)

@@ -0,0 +1,41 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+DECLUTTER_API = 'https://declutter.1j.nz/details'
+DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/comments'
+TIMEOUT = 30
+
+def get_html(url):
+    logging.info(f"Declutter Scraper: {url}")
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['content']
+
+def get_details(url):
+    try:
+        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem decluttering article: {}'.format(str(e)))
+        return None
+
+def get_comments(url):
+    try:
+        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting comments for article: {}'.format(str(e)))
+        return None
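Usage matches the other scraper modules: get_html() yields cleaned article HTML or '' on failure, and get_comments() yields the parsed JSON from the comments endpoint. A quick sketch (URL illustrative, assumes the apiserver package is on the path):

from scrapers import declutter

html = declutter.get_html('https://example.com/some-article')
if html:
    print(html[:100])        # cleaned article body

comments = declutter.get_comments('https://example.com/some-article')
if comments:
    print(type(comments))    # parsed JSON from the comments endpoint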

apiserver/scrapers/local.py (new file)

@@ -0,0 +1,27 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+READ_API = 'http://127.0.0.1:33843/details'
+TIMEOUT = 20
+
+def get_html(url):
+    logging.info(f"Local Scraper: {url}")
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['content']
+
+def get_details(url):
+    try:
+        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting article: {}'.format(str(e)))
+        return None

apiserver/scrapers/outline.py (new file; note the module calls time.sleep(), so the import is included here)

@@ -0,0 +1,37 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import time
+import requests
+
+OUTLINE_REFERER = 'https://outline.com/'
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+TIMEOUT = 20
+
+def get_html(url):
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['html']
+
+def get_details(url):
+    try:
+        logging.info(f"Outline Scraper: {url}")
+        params = {'source_url': url}
+        headers = {'Referer': OUTLINE_REFERER}
+        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
+        if r.status_code == 429:
+            logging.info('Rate limited by outline, sleeping 30s and skipping...')
+            time.sleep(30)
+            return None
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        data = r.json()['data']
+        if 'URL is not supported by Outline' in data['html']:
+            raise Exception('URL not supported by Outline')
+        return data
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem outlining article: {}'.format(str(e)))
+        return None

apiserver/search.py

@@ -39,10 +39,7 @@ def update_attributes():
         r = requests.post(MEILI_URL + 'indexes/qotnews/settings/searchable-attributes', json=json, timeout=2)
         if r.status_code != 202:
             raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-        r = requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
-        if r.status_code != 202:
-            raise Exception('Bad response code ' + str(r.status_code))
+        requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
         return r.json()
     except KeyboardInterrupt:
         raise

apiserver/server.py

@@ -43,8 +43,7 @@ cors = CORS(flask_app)

 @flask_app.route('/api')
 def api():
     stories = database.get_stories(FEED_LENGTH)
-    # hacky nested json
-    res = Response('{"stories":[' + ','.join(stories) + ']}')
+    res = Response(json.dumps({"stories": stories}))
     res.headers['content-type'] = 'application/json'
     return res

@@ -102,8 +101,7 @@ def submit():
 def story(sid):
     story = database.get_story(sid)
     if story:
-        # hacky nested json
-        res = Response('{"story":' + story.full_json + '}')
+        res = Response(json.dumps({"story": story.data}))
         res.headers['content-type'] = 'application/json'
         return res
     else:

@@ -127,7 +125,7 @@ def static_story(sid):
     story = database.get_story(sid)
     if not story: return abort(404)
-    story = json.loads(story.full_json)
+    story = story.data

     score = story['score']
     num_comments = story['num_comments']

@@ -170,8 +168,7 @@ def feed_thread():
             item = ref_list[news_index]

             try:
-                story_json = database.get_story(item['sid']).full_json
-                story = json.loads(story_json)
+                story = database.get_story(item['sid']).data
             except AttributeError:
                 story = dict(id=item['sid'], ref=item['ref'], source=item['source'])

apiserver/settings.py.example

@@ -9,19 +9,18 @@ NUM_REDDIT = 10
 NUM_TILDES = 5
 NUM_SUBSTACK = 10

-# SITEMAP = {
-#     'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
-#     'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
-# }
+SITEMAP = {}
+# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
+# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},

-# SUBSTACK = {
-#     'webworm': { 'url': "https://www.webworm.co", 'count': 10},
-#     'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
-# }
+SUBSTACK = {}
+# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
+# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},

-# CATEGORY = {
-#     'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
-# }
+CATEGORY = {}
+# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},

+SCRAPERS = ['declutter', 'outline', 'local']

 # Reddit account info
 # leave blank if not using Reddit
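Combining the new knobs (excludes substrings from feed.py and news.py, tz for naive article dates, SCRAPERS ordering), a filled-in settings.py might look like this; all values are hypothetical:

SITEMAP = {}
SITEMAP['newsroom'] = {
    'url': "https://www.newsroom.co.nz/sitemap.xml",
    'count': 10,
    'excludes': ['/video/', '/sponsored/'],  # skip URLs containing these substrings
    'tz': 'Pacific/Auckland',                # localize naive article dates
}

CATEGORY = {}
CATEGORY['rnz national'] = {
    'url': "https://www.rnz.co.nz/news/national",
    'count': 10,
    'tz': 'Pacific/Auckland',
}

SCRAPERS = ['declutter', 'outline', 'local']  # tried in order; 'local' is always the fallback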

readerserver/main.js

@@ -1,52 +1,14 @@
+const port = 33843;
 const express = require('express');
 const app = express();
-const port = 33843;
-const request = require('request');
-const JSDOM = require('jsdom').JSDOM;
-const { Readability } = require('readability');
+const simple = require('./simple');

 app.use(express.urlencoded({ extended: true }));

-app.get('/', (req, res) => {
-	res.send('<form method="POST" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>');
-});
-
-const requestCallback = (url, res) => (error, response, body) => {
-	if (!error && response.statusCode == 200) {
-		console.log('Response OK.');
-
-		const doc = new JSDOM(body, {url: url});
-		const reader = new Readability(doc.window.document);
-		const article = reader.parse();
-
-		if (article && article.content) {
-			res.send(article.content);
-		} else {
-			res.sendStatus(404);
-		}
-	} else {
-		console.log('Response error:', error ? error.toString() : response.statusCode);
-		res.sendStatus(response ? response.statusCode : 404);
-	}
-};
-
-app.post('/', (req, res) => {
-	const url = req.body.url;
-	const requestOptions = {
-		url: url,
-		//headers: {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'},
-		//headers: {'User-Agent': 'Twitterbot/1.0'},
-		headers: {
-			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
-			'X-Forwarded-For': '66.249.66.1',
-		},
-	};
-
-	console.log('Parse request for:', url);
-	request(requestOptions, requestCallback(url, res));
-});
+app.get('/', (req, res) => res.send(simple.FORM));
+app.post('/', (req, res) => simple.scrape(req, res));
+app.post('/details', (req, res) => simple.details(req, res));
+// app.post('/browser', (req, res) => browser.scrape(req, res));
+// app.post('/browser/details', (req, res) => browser.details(req, res));

 app.listen(port, () => {
 	console.log(`Example app listening on port ${port}!`);

readerserver/simple.js (new file; the extract() calls below use req.body.url, since no bare url is in scope in the handlers)

@@ -0,0 +1,43 @@
+const request = require('request');
+const JSDOM = require('jsdom').JSDOM;
+const { Readability } = require('readability');
+
+const options = url => ({
+	url: url,
+	headers: {
+		'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+		'X-Forwarded-For': '66.249.66.1',
+	},
+});
+
+const extract = (url, body) => {
+	const doc = new JSDOM(body, { url: url });
+	const reader = new Readability(doc.window.document);
+	return reader.parse();
+};
+
+module.exports.FORM = '<form method="POST" action="/" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>';
+
+module.exports.scrape = (req, res) => request(options(req.body.url), (error, response, body) => {
+	if (error || response.statusCode != 200) {
+		console.log('Response error:', error ? error.toString() : response.statusCode);
+		return res.sendStatus(response ? response.statusCode : 404);
+	}
+	const article = extract(req.body.url, body);
+	if (article && article.content) {
+		return res.send(article.content);
+	}
+	return res.sendStatus(404);
+});
+
+module.exports.details = (req, res) => request(options(req.body.url), (error, response, body) => {
+	if (error || response.statusCode != 200) {
+		console.log('Response error:', error ? error.toString() : response.statusCode);
+		return res.sendStatus(response ? response.statusCode : 404);
+	}
+	const article = extract(req.body.url, body);
+	if (article) {
+		return res.send(article);
+	}
+	return res.sendStatus(404);
+});
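The new /details route returns the whole Readability parse as JSON rather than bare HTML, which is exactly what apiserver/scrapers/local.py consumes. A quick manual check against a locally running reader server (URL illustrative):

# Assumes the reader server is running locally on port 33843, per main.js.
import requests

r = requests.post('http://127.0.0.1:33843/details',
                  data={'url': 'https://example.com/some-article'},
                  timeout=20)
if r.status_code == 200:
    article = r.json()   # full Readability result: title, content, etc.
    print(article.get('title'))
    print(len(article.get('content', '')))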

webclient/src/Comments.js

@@ -72,7 +72,7 @@ class Article extends React.Component {
 	}

 	displayComment(story, c, level) {
-		const cid = c.author+c.date;
+		const cid = c.author + c.date;
 		const collapsed = this.state.collapsed.includes(cid);
 		const expanded = this.state.expanded.includes(cid);

@@ -85,19 +85,22 @@ class Article extends React.Component {
 				<div className='info'>
 					<p>
 						{c.author === story.author ? '[OP]' : ''} {c.author || '[Deleted]'}
-						{' '} | <HashLink to={'#'+cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
-						{hidden || hasChildren &&
-							<span className='collapser pointer' onClick={() => this.collapseComment(cid)}>–</span>
-						}
+						{' '} | <HashLink to={'#' + cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
+						{hasChildren && (
+							hidden ?
+								<span className='collapser expander pointer' onClick={() => this.expandComment(cid)}>+</span>
+								:
+								<span className='collapser pointer' onClick={() => this.collapseComment(cid)}>–</span>
+						)}
 					</p>
 				</div>

 				<div className={collapsed ? 'text hidden' : 'text'} dangerouslySetInnerHTML={{ __html: c.text }} />
 				{hidden && hasChildren ?
-					<div className='comment lined info pointer' onClick={() => this.expandComment(cid)}>[show {this.countComments(c)-1} more]</div>
+					<div className='comment lined info pointer' onClick={() => this.expandComment(cid)}>[show {this.countComments(c) - 1} more]</div>
 					:
 					c.comments.map(i => this.displayComment(story, i, level + 1))
 				}
 			</div>

@@ -130,7 +133,7 @@ class Article extends React.Component {
 							{story.comments.map(c => this.displayComment(story, c, 0))}
 						</div>
 					</div>
-				:
+					:
 					<p>loading...</p>
 				}
 				<ToggleDot id={id} article={true} />

webclient/src/Feed.js

@@ -50,10 +50,6 @@ class Feed extends React.Component {
 		const stories = this.state.stories;
 		const error = this.state.error;

-		if (stories) {
-			stories.sort((a, b) => b.date - a.date);
-		}
-
 		return (
 			<div className='container'>
 				<Helmet>

@@ -62,15 +58,15 @@ class Feed extends React.Component {
 				{error && <p>Connection error?</p>}
 				{stories ?
 					<div>
-						{stories.map((x, i) =>
-							<div className='item' key={i}>
+						{stories.map(x =>
+							<div className='item' key={x.id}>
 								<div className='title'>
 									<Link className='link' to={'/' + x.id}>
-										<img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
+										<img className='source-logo' src={logos[x.source] || logos[x.source.split(' ')[0]]} alt='source logo' /> {x.title}
 									</Link>

 									<span className='source'>
-										&#8203;({sourceLink(x)})
+										({sourceLink(x)})
 									</span>
 								</div>

webclient/src/Results.js

@@ -64,15 +64,15 @@ class Results extends React.Component {
 					<p>Search results:</p>
 					<div className='comment lined'>
 						{stories.length ?
-							stories.map((x, i) =>
-								<div className='item' key={i}>
+							stories.map(x =>
+								<div className='item' key={x.id}>
 									<div className='title'>
 										<Link className='link' to={'/' + x.id}>
 											<img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
 										</Link>

 										<span className='source'>
-											&#8203;({sourceLink(x)})
+											({sourceLink(x)})
 										</span>
 									</div>

webclient/src/utils.js

File diff suppressed because one or more lines are too long