Compare commits


No commits in common. "bfa4108a8e532e704c445e2684ae7cbc3003c362" and "9f4ff4acf08e33194509018726b7ddbeaf651656" have entirely different histories.

17 changed files with 216 additions and 376 deletions

View File

@@ -4,7 +4,6 @@ from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError
-from sqlalchemy.types import JSON

 engine = create_engine('sqlite:///data/qotnews.sqlite')
 Session = sessionmaker(bind=engine)
@@ -16,8 +15,8 @@ class Story(Base):
     sid = Column(String(16), primary_key=True)
     ref = Column(String(16), unique=True)
-    meta = Column(JSON)
-    data = Column(JSON)
+    meta_json = Column(String)
+    full_json = Column(String)
     title = Column(String)

 class Reflist(Base):
@@ -37,21 +36,19 @@ def get_story(sid):
 def put_story(story):
     story = story.copy()
-    data = {}
-    data.update(story)
-    meta = {}
-    meta.update(story)
-    meta.pop('text', None)
-    meta.pop('comments', None)
+    full_json = json.dumps(story)
+    story.pop('text', None)
+    story.pop('comments', None)
+    meta_json = json.dumps(story)

     try:
         session = Session()
         s = Story(
             sid=story['id'],
             ref=story['ref'],
-            data=data,
-            meta=meta,
+            full_json=full_json,
+            meta_json=meta_json,
             title=story.get('title', None),
         )
         session.merge(s)
@@ -73,10 +70,10 @@ def get_reflist(amount):
 def get_stories(amount):
     session = Session()
-    q = session.query(Reflist, Story.meta).\
+    q = session.query(Reflist, Story.meta_json).\
+        order_by(Reflist.rid.desc()).\
         join(Story).\
         filter(Story.title != None).\
-        order_by(Story.meta['date'].desc()).\
         limit(amount)
     return [x[1] for x in q]
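Note: on the right-hand side of this file, stories are kept as pre-serialised JSON strings rather than native JSON columns, so writers dump and readers load explicitly. A minimal sketch of that round trip at the dict level (the sample story values are invented for illustration):

    import json

    story = {'id': 'abc123', 'ref': '999', 'title': 'Example', 'text': '<p>body</p>', 'comments': []}

    # What put_story() does on this side: full_json keeps everything,
    # meta_json drops the bulky fields before serialising.
    full_json = json.dumps(story)
    meta = {k: v for k, v in story.items() if k not in ('text', 'comments')}
    meta_json = json.dumps(meta)

    # Readers reverse it with json.loads, e.g. json.loads(row.full_json).
    assert json.loads(full_json)['title'] == 'Example'
    assert 'text' not in json.loads(meta_json)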

View File

@@ -9,23 +9,22 @@ from bs4 import BeautifulSoup
 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
-from scrapers import outline, declutter, local

-ONE_HOUR = 60*60
-ONE_DAY = 24*ONE_HOUR
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+READ_API = 'http://127.0.0.1:33843'
+
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
-MAX_AGE_IN_DAYS = 3*ONE_DAY
+TWO_DAYS = 60*60*24*2

 substacks = {}
 for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'], value.get('tz'))
+    categories[key] = news.Category(value['url'])
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
+    sitemaps[key] = news.Sitemap(value['url'])

 def list():
     feed = []
@@ -46,49 +45,53 @@ def list():
         feed += [(x, key) for x in publication.feed()[:count]]

     for key, sites in categories.items():
-        count = settings.CATEGORY[key].get('count') or 0
-        excludes = settings.CATEGORY[key].get('excludes')
-        tz = settings.CATEGORY[key].get('tz')
-        feed += [(x, key) for x in sites.feed(excludes)[:count]]
+        count = settings.CATEGORY[key]['count']
+        feed += [(x, key) for x in sites.feed()[:count]]

     for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key].get('count') or 0
-        excludes = settings.SITEMAP[key].get('excludes')
-        feed += [(x, key) for x in sites.feed(excludes)[:count]]
+        count = settings.SITEMAP[key]['count']
+        feed += [(x, key) for x in sites.feed()[:count]]

     return feed

 def get_article(url):
-    scrapers = {
-        'declutter': declutter,
-        'outline': outline,
-        'local': local,
-    }
-    available = settings.SCRAPERS or ['local']
-    if 'local' not in available:
-        available += ['local']
-
-    for scraper in available:
-        if scraper not in scrapers.keys():
-            continue
-        try:
-            html = scrapers[scraper].get_html(url)
-            if html:
-                return html
-        except KeyboardInterrupt:
-            raise
-        except:
-            pass
+    try:
+        params = {'source_url': url}
+        headers = {'Referer': 'https://outline.com/'}
+        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
+        if r.status_code == 429:
+            logging.info('Rate limited by outline, sleeping 30s and skipping...')
+            time.sleep(30)
+            return ''
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        html = r.json()['data']['html']
+        if 'URL is not supported by Outline' in html:
+            raise Exception('URL not supported by Outline')
+        return html
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem outlining article: {}'.format(str(e)))
+
+    logging.info('Trying our server instead...')
+
+    try:
+        r = requests.post(READ_API, data=dict(url=url), timeout=20)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.text
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting article: {}'.format(str(e)))
+
     return ''

 def get_content_type(url):
     try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
-            'X-Forwarded-For': '66.249.66.1',
-        }
-        return requests.get(url, headers=headers, timeout=5).headers['content-type']
+        headers = {'User-Agent': 'Twitterbot/1.0'}
+        return requests.get(url, headers=headers, timeout=2).headers['content-type']
     except:
         pass
@@ -124,7 +127,7 @@ def update_story(story, is_manual=False):
         logging.info('Story not ready yet')
         return False

-    if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time():
+    if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
         logging.info('Story too old, removing')
         return False
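Note: the left-hand get_article() resolves scraper modules by name from settings.SCRAPERS and always keeps 'local' as a last resort. A toy sketch of that dispatch pattern, with a stub class and return values invented purely for illustration:

    # Stubs standing in for the real declutter/outline/local scraper modules.
    class _Stub:
        def __init__(self, html):
            self._html = html
        def get_html(self, url):
            return self._html

    scrapers = {'declutter': _Stub(''), 'outline': _Stub(''), 'local': _Stub('<p>article</p>')}

    def get_html_with_fallback(url, preferred):
        available = list(preferred) or ['local']
        if 'local' not in available:
            available.append('local')  # guarantee the local reader runs last
        for name in available:
            if name not in scrapers:
                continue
            html = scrapers[name].get_html(url)
            if html:
                return html
        return ''

    # The first two stubs return nothing, so the local fallback wins.
    print(get_html_with_fallback('https://example.com', ['declutter', 'outline']))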

View File

@@ -10,27 +10,29 @@ if __name__ == '__main__':
 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
-from scrapers import declutter
-import dateutil.parser
 import extruct
-import pytz

 from utils import clean

+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
+#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

-def unix(date_str, tz=None):
-    try:
-        dt = dateutil.parser.parse(date_str)
-        if tz:
-            dt = pytz.timezone(tz).localize(dt)
-        return int(dt.timestamp())
-    except:
-        pass
-    return 0
+def unix(date_str):
+    date_tzfix = date_str
+    if ":" == date_tzfix[-3]:
+        date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
+    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
+    for f in formats:
+        try:
+            return int(datetime.strptime(date_str, f).timestamp())
+        except:
+            pass
+        try:
+            return int(datetime.strptime(date_tzfix, f).timestamp())
+        except:
+            pass
+    return 0

 def xml(route, ref=None):
     try:
         headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
@@ -44,7 +46,6 @@ def xml(route, ref=None):
         logging.error('Problem hitting URL: {}'.format(str(e)))
         return False

 def parse_extruct(s, data):
     for rdfa in data['rdfa']:
         for key, props in rdfa.items():
@@ -53,19 +54,22 @@ def parse_extruct(s, data):
                     s['title'] = values['@value']
             if 'http://ogp.me/ns/article#modified_time' in props:
                 for values in props['http://ogp.me/ns/article#modified_time']:
-                    s['date'] = values['@value']
+                    print(f"modified_time: {values['@value']}")
+                    s['date'] = unix(values['@value'])
             if 'http://ogp.me/ns/article#published_time' in props:
                 for values in props['http://ogp.me/ns/article#published_time']:
-                    s['date'] = values['@value']
+                    print(f"published_time: {values['@value']}")
+                    s['date'] = unix(values['@value'])

     for og in data['opengraph']:
         titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
         modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
         published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
         if len(modified):
-            s['date'] = modified[0]
+            s['date'] = unix(modified[0])
         if len(published):
-            s['date'] = published[0]
+            s['date'] = unix(published[0])
+        s['date'] = unix(published[0] or modified[0] or '')
         if len(titles):
             s['title'] = titles[0]
@@ -74,56 +78,35 @@ def parse_extruct(s, data):
             props = md['properties']
             s['title'] = props['headline']
             if props['dateModified']:
-                s['date'] = props['dateModified']
+                s['date'] = unix(props['dateModified'])
             if props['datePublished']:
-                s['date'] = props['datePublished']
+                s['date'] = unix(props['datePublished'])
             if 'author' in props and props['author']:
                 s['author'] = props['author']['properties']['name']

     for ld in data['json-ld']:
-        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
+        if ld['@type'] == 'Article':
             s['title'] = ld['headline']
             if ld['dateModified']:
-                s['date'] = ld['dateModified']
+                s['date'] = unix(ld['dateModified'])
             if ld['datePublished']:
-                s['date'] = ld['datePublished']
+                s['date'] = unix(ld['datePublished'])
             if 'author' in ld and ld['author']:
                 s['author'] = ld['author']['name']
-        if '@graph' in ld:
-            for gld in ld['@graph']:
-                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
-                    s['title'] = gld['headline']
-                    if gld['dateModified']:
-                        s['date'] = gld['dateModified']
-                    if gld['datePublished']:
-                        s['date'] = gld['datePublished']

     return s

-def comment(i):
-    if 'author' not in i:
-        return False
-
-    c = {}
-    c['author'] = i.get('author', '')
-    c['score'] = i.get('points', 0)
-    c['date'] = unix(i.get('date', 0))
-    c['text'] = clean(i.get('text', '') or '')
-    c['comments'] = [comment(j) for j in i['children']]
-    c['comments'] = list(filter(bool, c['comments']))
-    return c
-
-def comment_count(i):
-    alive = 1 if i['author'] else 0
-    return sum([comment_count(c) for c in i['comments']]) + alive
-
-class _Base:
-    def __init__(url, tz=None):
-        self.url = url
-        self.tz = tz
-
-    def feed(self, excludes=None):
-        return []
+class Sitemap:
+    def __init__(self, url):
+        self.sitemap_url = url
+
+    def feed(self):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return []
+        soup = BeautifulSoup(markup, features='lxml')
+        articles = soup.find('urlset').findAll('url')
+        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
+        return [x.find('loc').text for x in articles] or []

     def story(self, ref):
         markup = xml(lambda x: ref)
@@ -141,58 +124,14 @@ class _Base:
         data = extruct.extract(markup)
         s = parse_extruct(s, data)
-        if s['date']:
-            s['date'] = unix(s['date'], tz=self.tz)
-
-        if 'disqus' in markup:
-            try:
-                s['comments'] = declutter.get_comments(ref)
-                c['comments'] = list(filter(bool, c['comments']))
-                s['num_comments'] = comment_count(s['comments'])
-            except KeyboardInterrupt:
-                raise
-            except:
-                pass
-
-        if not s['date']:
-            return False
-
         return s

-def get_sitemap_date(a):
-    if a.find('lastmod'):
-        return a.find('lastmod').text
-    if a.find('news:publication_date'):
-        return a.find('news:publication_date').text
-    return ''
-
-class Sitemap(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.sitemap_url = url
-
-    def feed(self, excludes=None):
-        markup = xml(lambda x: self.sitemap_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        sitemap = soup.find('urlset').findAll('url')
-        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
-        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
-        links = [x.find('loc').text for x in links] or []
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-class Category(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
+class Category:
+    def __init__(self, url):
         self.category_url = url
         self.base_url = '/'.join(url.split('/')[:3])

-    def feed(self, excludes=None):
+    def feed(self):
         markup = xml(lambda x: self.category_url)
         if not markup: return []
         soup = BeautifulSoup(markup, features='html.parser')
@@ -200,30 +139,42 @@ class Category(_Base):
         links = [link.get('href') for link in links]
         links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
         links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
-        links = list(filter(None, [link if link != self.category_url else None for link in links]))
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
         return links

+    def story(self, ref):
+        markup = xml(lambda x: ref)
+        if not markup:
+            return False
+
+        s = {}
+        s['author_link'] = ''
+        s['score'] = 0
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = 0
+
+        data = extruct.extract(markup)
+        s = parse_extruct(s, data)
+        return s
+
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Sitemap: Stuff")
-    site = Sitemap("https://www.stuff.co.nz/sitemap/news/sitemap.xml")
+    site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
     posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
-
-    print("Sitemap: NZ Herald")
-    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
-    posts = site.feed()
-    print(posts[:1])
+    print(posts[:1])
     print(site.story(posts[0]))

     print("Category: RadioNZ Te Ao Māori")
     site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
     posts = site.feed()
-    print(posts[:5])
+    print(posts[:1])
     print(site.story(posts[0]))
-
-    print("Sitemap: Newsroom")
-    site = Sitemap("https://www.newsroom.co.nz/sitemap.xml")
-    posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
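Note: the replacement unix() drops dateutil and pytz for a fixed list of strptime formats, and first strips the colon out of UTC offsets, presumably because %z only accepts '+1300'-style offsets before Python 3.7. A small standalone sketch of the same approach (to_unix is an illustrative name, not the project's):

    from datetime import datetime

    FORMATS = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z',
               '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']

    def to_unix(date_str):
        # Try each format with the string as-is, then with the ':' removed
        # from the offset, mirroring the date_tzfix logic in the diff above.
        candidates = [date_str]
        if len(date_str) > 2 and date_str[-3] == ':':
            candidates.append(date_str[:-3] + date_str[-2:])
        for fmt in FORMATS:
            for candidate in candidates:
                try:
                    return int(datetime.strptime(candidate, fmt).timestamp())
                except ValueError:
                    pass
        return 0

    print(to_unix('2020-11-02T13:00:00+13:00'))  # epoch seconds, 0 if unparseable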

View File

@@ -12,7 +12,6 @@ from datetime import datetime
 from utils import clean

-SUBSTACK_REFERER = 'https://substack.com'
 SUBSTACK_API_TOP_POSTS = lambda x: "https://substack.com/api/v1/reader/top-posts"

 def author_link(author_id, base_url):
@@ -25,10 +24,9 @@ def api_stories(x, base_url):
 def unix(date_str):
     return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())

-def api(route, ref=None, referer=None):
-    headers = {'Referer': referer} if referer else None
+def api(route, ref=None):
     try:
-        r = requests.get(route(ref), headers=headers, timeout=10)
+        r = requests.get(route(ref), timeout=5)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
@@ -38,7 +36,7 @@ def api(route, ref=None, referer=None):
         logging.error('Problem hitting Substack API: {}, trying again'.format(str(e)))

     try:
-        r = requests.get(route(ref), headers=headers, timeout=20)
+        r = requests.get(route(ref), timeout=15)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
@@ -67,14 +65,12 @@ class Publication:
         self.BASE_DOMAIN = domain

     def feed(self):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
-        if not stories: return []
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         return [str(i.get("id")) for i in stories or []]

     def story(self, ref):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
-        if not stories: return False
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
@@ -94,7 +90,7 @@ class Publication:
         s['title'] = r.get('title', '')
         s['link'] = r.get('canonical_url', '')
         s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN)
+        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'))
         s['comments'] = [comment(i) for i in comments.get('comments')]
         s['comments'] = list(filter(bool, s['comments']))
         s['num_comments'] = r.get('comment_count', 0)
@@ -117,14 +113,12 @@ class Publication:
 class Top:
     def feed(self):
-        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
-        if not stories: return []
+        stories = api(SUBSTACK_API_TOP_POSTS)
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         return [str(i.get("id")) for i in stories or []]

     def story(self, ref):
-        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
-        if not stories: return False
+        stories = api(SUBSTACK_API_TOP_POSTS)
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
@@ -146,7 +140,7 @@ class Top:
         s['title'] = r.get('title', '')
         s['link'] = r.get('canonical_url', '')
         s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER)
+        comments = api(lambda x: api_comments(x, base_url), r.get('id'))
         s['comments'] = [comment(i) for i in comments.get('comments')]
         s['comments'] = list(filter(bool, s['comments']))
         s['num_comments'] = r.get('comment_count', 0)
@@ -162,4 +156,5 @@ if __name__ == '__main__':
     webworm = Publication("https://www.webworm.co/")
     posts = webworm.feed()
+    print(posts[:1])
     print(webworm.story(posts[0]))

View File

@@ -18,7 +18,6 @@ packaging==20.4
 praw==6.4.0
 prawcore==1.4.0
 pyparsing==2.4.7
-pytz==2020.4
 requests==2.24.0
 six==1.15.0
 soupsieve==2.0.1
@@ -30,4 +29,3 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 zope.event==4.4
 zope.interface==5.1.0
-python-dateutil==2.8.1

View File

@@ -1,41 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-import requests
-
-DECLUTTER_API = 'https://declutter.1j.nz/details'
-DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/comments'
-TIMEOUT = 30
-
-def get_html(url):
-    logging.info(f"Declutter Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem decluttering article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None

View File

@@ -1,27 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-import requests
-
-READ_API = 'http://127.0.0.1:33843/details'
-TIMEOUT = 20
-
-def get_html(url):
-    logging.info(f"Local Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-        return None

View File

@@ -1,37 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-import requests
-
-OUTLINE_REFERER = 'https://outline.com/'
-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-TIMEOUT = 20
-
-def get_html(url):
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['html']
-
-def get_details(url):
-    try:
-        logging.info(f"Outline Scraper: {url}")
-        params = {'source_url': url}
-        headers = {'Referer': OUTLINE_REFERER}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return None
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        data = r.json()['data']
-        if 'URL is not supported by Outline' in data['html']:
-            raise Exception('URL not supported by Outline')
-        return data
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
-        return None

View File

@@ -39,7 +39,10 @@ def update_attributes():
         r = requests.post(MEILI_URL + 'indexes/qotnews/settings/searchable-attributes', json=json, timeout=2)
         if r.status_code != 202:
             raise Exception('Bad response code ' + str(r.status_code))
-        requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
+        return r.json()
+        r = requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
+        if r.status_code != 202:
+            raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
     except KeyboardInterrupt:
         raise

View File

@@ -43,7 +43,8 @@ cors = CORS(flask_app)
 @flask_app.route('/api')
 def api():
     stories = database.get_stories(FEED_LENGTH)
-    res = Response(json.dumps({"stories": stories}))
+    # hacky nested json
+    res = Response('{"stories":[' + ','.join(stories) + ']}')
     res.headers['content-type'] = 'application/json'
     return res
@@ -101,7 +102,8 @@ def submit():
 def story(sid):
     story = database.get_story(sid)
     if story:
-        res = Response(json.dumps({"story": story.data}))
+        # hacky nested json
+        res = Response('{"story":' + story.full_json + '}')
         res.headers['content-type'] = 'application/json'
         return res
     else:
@@ -125,7 +127,7 @@ def static_story(sid):
     story = database.get_story(sid)
     if not story: return abort(404)
-    story = story.data
+    story = json.loads(story.full_json)

     score = story['score']
     num_comments = story['num_comments']
@@ -168,7 +170,8 @@ def feed_thread():
             item = ref_list[news_index]

             try:
-                story = database.get_story(item['sid']).data
+                story_json = database.get_story(item['sid']).full_json
+                story = json.loads(story_json)
             except AttributeError:
                 story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
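Note: the "hacky nested json" routes splice pre-serialised JSON strings straight into the response body instead of re-parsing and re-dumping them. A small worked example of why that concatenation still yields valid JSON (the sample rows are made up):

    import json

    meta_json_rows = ['{"id": "abc", "title": "One"}', '{"id": "def", "title": "Two"}']

    # Joining already-serialised objects inside an array literal produces the
    # same document as dumping the parsed structures would.
    body = '{"stories":[' + ','.join(meta_json_rows) + ']}'
    assert json.loads(body) == {"stories": [json.loads(s) for s in meta_json_rows]}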

View File

@@ -9,18 +9,19 @@ NUM_REDDIT = 10
 NUM_TILDES = 5
 NUM_SUBSTACK = 10

-SITEMAP = {}
-# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
-# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
+# SITEMAP = {
+# 'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
+# 'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
+# }

-SUBSTACK = {}
-# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
-# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},
+# SUBSTACK = {
+# 'webworm': { 'url': "https://www.webworm.co", 'count': 10},
+# 'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
+# }

-CATEGORY = {}
-# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
+# CATEGORY = {
+# 'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
+# }

-SCRAPERS = ['declutter', 'outline', 'local']

 # Reddit account info
 # leave blank if not using Reddit
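Note: in the left-hand settings style, enabling a feed is just a matter of adding a dict entry keyed by source name. A hypothetical example (the values are illustrative; 'tz' and 'excludes' are the optional keys that feed.py reads with .get() on that side):

    SITEMAP = {}
    SITEMAP['stuff'] = {'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10}

    CATEGORY = {}
    CATEGORY['rnz national'] = {'url': "https://www.rnz.co.nz/news/national", 'count': 10,
                                'tz': 'Pacific/Auckland', 'excludes': ['/sport/']}

    # Scrapers are tried in order; 'local' is always appended as a fallback.
    SCRAPERS = ['declutter', 'outline', 'local']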

View File

@@ -1,14 +1,52 @@
-const port = 33843;
 const express = require('express');
 const app = express();
-const simple = require('./simple');
+const port = 33843;
+const request = require('request');
+const JSDOM = require('jsdom').JSDOM;
+const { Readability } = require('readability');

 app.use(express.urlencoded({ extended: true }));

-app.get('/', (req, res) => res.send(simple.FORM));
-app.post('/', (req, res) => simple.scrape(req, res));
-app.post('/details', (req, res) => simple.details(req, res));
-// app.post('/browser', (req, res) => browser.scrape(req, res));
-// app.post('/browser/details', (req, res) => browser.details(req, res));
+app.get('/', (req, res) => {
+  res.send('<form method="POST" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>');
+});
+
+const requestCallback = (url, res) => (error, response, body) => {
+  if (!error && response.statusCode == 200) {
+    console.log('Response OK.');
+    const doc = new JSDOM(body, {url: url});
+    const reader = new Readability(doc.window.document);
+    const article = reader.parse();
+    if (article && article.content) {
+      res.send(article.content);
+    } else {
+      res.sendStatus(404);
+    }
+  } else {
+    console.log('Response error:', error ? error.toString() : response.statusCode);
+    res.sendStatus(response ? response.statusCode : 404);
+  }
+};
+
+app.post('/', (req, res) => {
+  const url = req.body.url;
+  const requestOptions = {
+    url: url,
+    //headers: {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'},
+    //headers: {'User-Agent': 'Twitterbot/1.0'},
+    headers: {
+      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
+      'X-Forwarded-For': '66.249.66.1',
+    },
+  };
+
+  console.log('Parse request for:', url);
+  request(requestOptions, requestCallback(url, res));
+});

 app.listen(port, () => {
   console.log(`Example app listening on port ${port}!`);
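Note: after this change the reader server exposes a single POST / route that takes a form-encoded url field and replies with the Readability-extracted HTML, which is how READ_API is used from the Python side. A minimal client sketch against the port shown above (the function name is illustrative):

    import requests

    READ_API = 'http://127.0.0.1:33843'

    def read_article(url):
        # Form-encoded POST {'url': ...}; the server answers with extracted
        # HTML, or a non-200 status if Readability finds no content.
        r = requests.post(READ_API, data={'url': url}, timeout=20)
        r.raise_for_status()
        return r.text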

View File

@@ -1,43 +0,0 @@
-const request = require('request');
-const JSDOM = require('jsdom').JSDOM;
-const { Readability } = require('readability');
-
-const options = url => ({
-  url: url,
-  headers: {
-    'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
-    'X-Forwarded-For': '66.249.66.1',
-  },
-});
-
-const extract = (url, body) => {
-  const doc = new JSDOM(body, { url: url });
-  const reader = new Readability(doc.window.document);
-  return reader.parse();
-};
-
-module.exports.FORM = '<form method="POST" action="/" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>';
-
-module.exports.scrape = (req, res) => request(options(req.body.url), (error, response, body) => {
-  if (error || response.statusCode != 200) {
-    console.log('Response error:', error ? error.toString() : response.statusCode);
-    return res.sendStatus(response ? response.statusCode : 404);
-  }
-  const article = extract(url, body);
-  if (article && article.content) {
-    return res.send(article.content);
-  }
-  return res.sendStatus(404);
-});
-
-module.exports.details = (req, res) => request(options(req.body.url), (error, response, body) => {
-  if (error || response.statusCode != 200) {
-    console.log('Response error:', error ? error.toString() : response.statusCode);
-    return res.sendStatus(response ? response.statusCode : 404);
-  }
-  const article = extract(url, body);
-  if (article) {
-    return res.send(article);
-  }
-  return res.sendStatus(404);
-});

View File

@@ -87,12 +87,9 @@ class Article extends React.Component {
                 {c.author === story.author ? '[OP]' : ''} {c.author || '[Deleted]'}
                 {' '} | <HashLink to={'#'+cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
-                {hasChildren && (
-                  hidden ?
-                  <span className='collapser expander pointer' onClick={() => this.expandComment(cid)}>+</span>
-                  :
+                {hidden || hasChildren &&
                   <span className='collapser pointer' onClick={() => this.collapseComment(cid)}></span>
-                )}
+                }
               </p>
             </div>

View File

@@ -50,6 +50,10 @@ class Feed extends React.Component {
     const stories = this.state.stories;
     const error = this.state.error;

+    if (stories) {
+      stories.sort((a, b) => b.date - a.date);
+    }
+
     return (
       <div className='container'>
         <Helmet>
@@ -58,15 +62,15 @@ class Feed extends React.Component {
           {error && <p>Connection error?</p>}
           {stories ?
             <div>
-              {stories.map(x =>
-                <div className='item' key={x.id}>
+              {stories.map((x, i) =>
+                <div className='item' key={i}>
                   <div className='title'>
                     <Link className='link' to={'/' + x.id}>
-                      <img className='source-logo' src={logos[x.source] || logos[x.source.split(' ')[0]]} alt='source logo' /> {x.title}
+                      <img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
                     </Link>
                     <span className='source'>
-                      ({sourceLink(x)})
+                      &#8203;({sourceLink(x)})
                     </span>
                   </div>

View File

@@ -64,15 +64,15 @@ class Results extends React.Component {
         <p>Search results:</p>
         <div className='comment lined'>
           {stories.length ?
-            stories.map(x =>
-              <div className='item' key={x.id}>
+            stories.map((x, i) =>
+              <div className='item' key={i}>
                 <div className='title'>
                   <Link className='link' to={'/' + x.id}>
                     <img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
                   </Link>
                   <span className='source'>
-                    ({sourceLink(x)})
+                    &#8203;({sourceLink(x)})
                   </span>
                 </div>

File diff suppressed because one or more lines are too long