Compare commits


33 Commits

Author | SHA1 | Message | Date
Jason Schwarzenberger | bfa4108a8e | Merge remote-tracking branch 'tanner/master' | 2020-11-09 16:08:28 +13:00
Jason Schwarzenberger | 0bd0d40a31 | use json type in sqlite. | 2020-11-09 15:45:10 +13:00
Jason Schwarzenberger | 4e04595415 | fix search. | 2020-11-09 15:44:44 +13:00
Jason | 006db2960c | change to 3 days | 2020-11-09 01:36:51 +00:00
Jason Schwarzenberger | 1f063f0dac | undo log level change | 2020-11-06 11:20:34 +13:00
Jason Schwarzenberger | 1658346aa9 | fix news.py feed. | 2020-11-06 10:37:43 +13:00
Jason Schwarzenberger | 2dbc702b40 | switch to python-dateutil for parser, reverse sort xml feeds. | 2020-11-06 10:02:39 +13:00
Jason Schwarzenberger | 1c4764e67d | sort sitemap feed by lastmod time. | 2020-11-06 09:30:15 +13:00
Jason | ee49d2021e | newsroom | 2020-11-05 20:28:55 +00:00
Jason | c391c50ab1 | use localize | 2020-11-05 04:15:31 +00:00
Jason Schwarzenberger | 095f0d549a | use replace. | 2020-11-05 16:57:08 +13:00
Jason Schwarzenberger | c21c71667e | fix date issue. | 2020-11-05 16:41:15 +13:00
Jason Schwarzenberger | c3a2c91a11 | update requirements.txt | 2020-11-05 16:33:50 +13:00
Jason Schwarzenberger | 0f39446a61 | tz aware for use in settings. | 2020-11-05 16:30:55 +13:00
Jason Schwarzenberger | 351059aab1 | fix excludes. | 2020-11-05 15:59:13 +13:00
Jason Schwarzenberger | 4488e2c292 | add an excludes list of substrings for urls in the settings for sitemap/category. | 2020-11-05 15:51:59 +13:00
Jason Schwarzenberger | afda5b635c | disqus test. | 2020-11-05 14:23:51 +13:00
Jason Schwarzenberger | 0fc1a44d2b | fix issue in substack. | 2020-11-04 17:40:29 +13:00
Jason Schwarzenberger | 9fff1b9e46 | avoid duplicate articles listed on the category page | 2020-11-04 17:14:42 +13:00
Jason Schwarzenberger | 16b59f6c67 | try stop bad pages. | 2020-11-04 16:34:31 +13:00
Jason Schwarzenberger | 939f4775a7 | better settings example. | 2020-11-04 15:52:34 +13:00
Jason Schwarzenberger | 9bfc6fc6fa | scraper settings, ordering and loop. | 2020-11-04 15:47:12 +13:00
Jason Schwarzenberger | 6ea9844d00 | remove useless try blocks. | 2020-11-04 15:37:19 +13:00
Jason Schwarzenberger | 1318259d3d | imply referrer is substack. | 2020-11-04 15:21:07 +13:00
Jason Schwarzenberger | 98a0c2257c | increase declutter timeout. | 2020-11-04 15:15:00 +13:00
Jason Schwarzenberger | e6976db25d | fix tabs | 2020-11-04 15:04:20 +13:00
Jason Schwarzenberger | 9edc8b7cca | move scraping for article content to files. | 2020-11-04 15:00:58 +13:00
Jason Schwarzenberger | 33e21e7f30 | fix mistake. | 2020-11-04 12:45:01 +13:00
Jason Schwarzenberger | 892a99eca6 | add + expander in place of collapser. | 2020-11-04 12:43:15 +13:00
Jason Schwarzenberger | d718d05a04 | fix dates for newsroom. | 2020-11-04 11:53:16 +13:00
Jason Schwarzenberger | d1795eb1b8 | add radionz and newsroom logos. | 2020-11-04 11:30:56 +13:00
 | 9a279d44b1 | Add header to get content type | 2020-11-03 20:27:43 +00:00
 | e506804666 | Clean code up | 2020-11-03 03:45:56 +00:00
17 changed files with 376 additions and 216 deletions

View File

@@ -4,6 +4,7 @@ from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError
+from sqlalchemy.types import JSON

 engine = create_engine('sqlite:///data/qotnews.sqlite')
 Session = sessionmaker(bind=engine)
@@ -15,8 +16,8 @@ class Story(Base):
     sid = Column(String(16), primary_key=True)
     ref = Column(String(16), unique=True)
-    meta_json = Column(String)
-    full_json = Column(String)
+    meta = Column(JSON)
+    data = Column(JSON)
     title = Column(String)

 class Reflist(Base):
@@ -36,19 +37,21 @@ def get_story(sid):
 def put_story(story):
     story = story.copy()
-    full_json = json.dumps(story)
+    data = {}
+    data.update(story)

-    story.pop('text', None)
-    story.pop('comments', None)
-    meta_json = json.dumps(story)
+    meta = {}
+    meta.update(story)
+    meta.pop('text', None)
+    meta.pop('comments', None)

     try:
         session = Session()
         s = Story(
             sid=story['id'],
             ref=story['ref'],
-            full_json=full_json,
-            meta_json=meta_json,
+            data=data,
+            meta=meta,
             title=story.get('title', None),
         )
         session.merge(s)
@@ -70,10 +73,10 @@ def get_reflist(amount):
 def get_stories(amount):
     session = Session()
-    q = session.query(Reflist, Story.meta_json).\
-        order_by(Reflist.rid.desc()).\
+    q = session.query(Reflist, Story.meta).\
         join(Story).\
         filter(Story.title != None).\
+        order_by(Story.meta['date'].desc()).\
         limit(amount)
     return [x[1] for x in q]
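
The schema change above swaps the serialized meta_json / full_json string columns for SQLAlchemy JSON columns, so stories are stored and returned as plain dicts and the feed query can order on a JSON field directly. A minimal standalone sketch of that pattern, with an illustrative model rather than the project's actual schema:

    # Sketch: dicts stored in a SQLAlchemy JSON column on SQLite, ordered by a
    # JSON field the way the new get_stories() orders by Story.meta['date'].
    # The Item model and its fields are illustrative only.
    from sqlalchemy import create_engine, Column, String
    from sqlalchemy.types import JSON
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    Base = declarative_base()

    class Item(Base):
        __tablename__ = 'items'
        sid = Column(String(16), primary_key=True)
        meta = Column(JSON)   # persisted as JSON text, read back as a dict

    engine = create_engine('sqlite:///:memory:')
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()

    session.add(Item(sid='abc', meta={'title': 'older', 'date': 1604460000}))
    session.add(Item(sid='def', meta={'title': 'newer', 'date': 1604546400}))
    session.commit()

    # JSON fields can be used in queries; on SQLite this compiles to json_extract().
    rows = session.query(Item.meta).order_by(Item.meta['date'].desc()).all()
    print([r[0]['title'] for r in rows])  # ['newer', 'older']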

View File

@@ -9,22 +9,23 @@ from bs4 import BeautifulSoup
 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
+from scrapers import outline, declutter, local

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-READ_API = 'http://127.0.0.1:33843'
+ONE_HOUR = 60*60
+ONE_DAY = 24*ONE_HOUR

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
-TWO_DAYS = 60*60*24*2
+MAX_AGE_IN_DAYS = 3*ONE_DAY

 substacks = {}
 for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'])
+    categories[key] = news.Category(value['url'], value.get('tz'))
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'])
+    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))

 def list():
     feed = []
@@ -45,53 +46,49 @@ def list():
         feed += [(x, key) for x in publication.feed()[:count]]

     for key, sites in categories.items():
-        count = settings.CATEGORY[key]['count']
-        feed += [(x, key) for x in sites.feed()[:count]]
+        count = settings.CATEGORY[key].get('count') or 0
+        excludes = settings.CATEGORY[key].get('excludes')
+        tz = settings.CATEGORY[key].get('tz')
+        feed += [(x, key) for x in sites.feed(excludes)[:count]]

     for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key]['count']
-        feed += [(x, key) for x in sites.feed()[:count]]
+        count = settings.SITEMAP[key].get('count') or 0
+        excludes = settings.SITEMAP[key].get('excludes')
+        feed += [(x, key) for x in sites.feed(excludes)[:count]]

     return feed

 def get_article(url):
-    try:
-        params = {'source_url': url}
-        headers = {'Referer': 'https://outline.com/'}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return ''
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        html = r.json()['data']['html']
-        if 'URL is not supported by Outline' in html:
-            raise Exception('URL not supported by Outline')
-        return html
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
-
-    logging.info('Trying our server instead...')
-
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=20)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-
+    scrapers = {
+        'declutter': declutter,
+        'outline': outline,
+        'local': local,
+    }
+    available = settings.SCRAPERS or ['local']
+    if 'local' not in available:
+        available += ['local']
+
+    for scraper in available:
+        if scraper not in scrapers.keys():
+            continue
+        try:
+            html = scrapers[scraper].get_html(url)
+            if html:
+                return html
+        except KeyboardInterrupt:
+            raise
+        except:
+            pass
     return ''

 def get_content_type(url):
     try:
-        headers = {'User-Agent': 'Twitterbot/1.0'}
-        return requests.get(url, headers=headers, timeout=2).headers['content-type']
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+            'X-Forwarded-For': '66.249.66.1',
+        }
+        return requests.get(url, headers=headers, timeout=5).headers['content-type']
     except:
         pass
@@ -127,7 +124,7 @@ def update_story(story, is_manual=False):
         logging.info('Story not ready yet')
         return False

-    if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
+    if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time():
         logging.info('Story too old, removing')
         return False
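
get_article() now walks an ordered list of scraper backends taken from settings.SCRAPERS, returns the first backend that yields any HTML, and always keeps 'local' as the final fallback. A self-contained sketch of that dispatch pattern, with stub objects standing in for the real scrapers.declutter / scrapers.outline / scrapers.local modules:

    # Sketch of the ordered-fallback dispatch used by the new get_article().
    # The stubs below only imitate the get_html(url) interface of the real
    # scraper modules; names and return values are made up for the demo.
    class StubScraper:
        def __init__(self, name, result=''):
            self.name = name
            self.result = result
        def get_html(self, url):
            print('trying', self.name, 'for', url)
            return self.result

    scrapers = {
        'declutter': StubScraper('declutter', ''),   # pretend this one fails
        'outline': StubScraper('outline', '<article>body</article>'),
        'local': StubScraper('local', '<p>readability output</p>'),
    }

    def get_article(url, preferred=None):
        available = preferred or ['local']
        if 'local' not in available:
            available += ['local']          # local is always the last resort
        for name in available:
            if name not in scrapers:
                continue
            try:
                html = scrapers[name].get_html(url)
                if html:
                    return html
            except KeyboardInterrupt:
                raise
            except Exception:
                pass                        # fall through to the next backend
        return ''

    print(get_article('https://example.com', ['declutter', 'outline', 'local']))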

View File

@@ -10,29 +10,27 @@ if __name__ == '__main__':
 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
+from scrapers import declutter
+import dateutil.parser
 import extruct
+import pytz

 from utils import clean

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
+#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

-def unix(date_str):
-    date_tzfix = date_str
-    if ":" == date_tzfix[-3]:
-        date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
-    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
-    for f in formats:
-        try:
-            return int(datetime.strptime(date_str, f).timestamp())
-        except:
-            pass
-        try:
-            return int(datetime.strptime(date_tzfix, f).timestamp())
-        except:
-            pass
+def unix(date_str, tz=None):
+    try:
+        dt = dateutil.parser.parse(date_str)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    except:
+        pass
     return 0

 def xml(route, ref=None):
     try:
         headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
@@ -46,6 +44,7 @@ def xml(route, ref=None):
         logging.error('Problem hitting URL: {}'.format(str(e)))
         return False

 def parse_extruct(s, data):
     for rdfa in data['rdfa']:
         for key, props in rdfa.items():
@@ -54,22 +53,19 @@ def parse_extruct(s, data):
                     s['title'] = values['@value']
             if 'http://ogp.me/ns/article#modified_time' in props:
                 for values in props['http://ogp.me/ns/article#modified_time']:
-                    print(f"modified_time: {values['@value']}")
-                    s['date'] = unix(values['@value'])
+                    s['date'] = values['@value']
             if 'http://ogp.me/ns/article#published_time' in props:
                 for values in props['http://ogp.me/ns/article#published_time']:
-                    print(f"published_time: {values['@value']}")
-                    s['date'] = unix(values['@value'])
+                    s['date'] = values['@value']

     for og in data['opengraph']:
         titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
         modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
         published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
         if len(modified):
-            s['date'] = unix(modified[0])
+            s['date'] = modified[0]
         if len(published):
-            s['date'] = unix(published[0])
-        s['date'] = unix(published[0] or modified[0] or '')
+            s['date'] = published[0]
         if len(titles):
             s['title'] = titles[0]
@@ -78,35 +74,56 @@ def parse_extruct(s, data):
         props = md['properties']
         s['title'] = props['headline']
         if props['dateModified']:
-            s['date'] = unix(props['dateModified'])
+            s['date'] = props['dateModified']
         if props['datePublished']:
-            s['date'] = unix(props['datePublished'])
+            s['date'] = props['datePublished']
         if 'author' in props and props['author']:
             s['author'] = props['author']['properties']['name']

     for ld in data['json-ld']:
-        if ld['@type'] == 'Article':
+        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
             s['title'] = ld['headline']
             if ld['dateModified']:
-                s['date'] = unix(ld['dateModified'])
+                s['date'] = ld['dateModified']
             if ld['datePublished']:
-                s['date'] = unix(ld['datePublished'])
+                s['date'] = ld['datePublished']
             if 'author' in ld and ld['author']:
                 s['author'] = ld['author']['name']
+        if '@graph' in ld:
+            for gld in ld['@graph']:
+                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
+                    s['title'] = gld['headline']
+                    if gld['dateModified']:
+                        s['date'] = gld['dateModified']
+                    if gld['datePublished']:
+                        s['date'] = gld['datePublished']

     return s

-class Sitemap:
-    def __init__(self, url):
-        self.sitemap_url = url
-
-    def feed(self):
-        markup = xml(lambda x: self.sitemap_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        return [x.find('loc').text for x in articles] or []
+def comment(i):
+    if 'author' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('author', '')
+    c['score'] = i.get('points', 0)
+    c['date'] = unix(i.get('date', 0))
+    c['text'] = clean(i.get('text', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    alive = 1 if i['author'] else 0
+    return sum([comment_count(c) for c in i['comments']]) + alive
+
+class _Base:
+    def __init__(url, tz=None):
+        self.url = url
+        self.tz = tz
+
+    def feed(self, excludes=None):
+        return []

     def story(self, ref):
         markup = xml(lambda x: ref)
@@ -124,14 +141,58 @@ class Sitemap:
         data = extruct.extract(markup)
         s = parse_extruct(s, data)
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            try:
+                s['comments'] = declutter.get_comments(ref)
+                c['comments'] = list(filter(bool, c['comments']))
+                s['num_comments'] = comment_count(s['comments'])
+            except KeyboardInterrupt:
+                raise
+            except:
+                pass
+
+        if not s['date']:
+            return False
         return s

-class Category:
-    def __init__(self, url):
+def get_sitemap_date(a):
+    if a.find('lastmod'):
+        return a.find('lastmod').text
+    if a.find('news:publication_date'):
+        return a.find('news:publication_date').text
+    return ''
+
+class Sitemap(_Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.sitemap_url = url
+
+    def feed(self, excludes=None):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return []
+        soup = BeautifulSoup(markup, features='lxml')
+        sitemap = soup.find('urlset').findAll('url')
+        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
+        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
+        links = [x.find('loc').text for x in links] or []
+        links = list(set(links))
+        if excludes:
+            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+        return links
+
+class Category(_Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
         self.category_url = url
         self.base_url = '/'.join(url.split('/')[:3])

-    def feed(self):
+    def feed(self, excludes=None):
         markup = xml(lambda x: self.category_url)
         if not markup: return []
         soup = BeautifulSoup(markup, features='html.parser')
@@ -139,42 +200,30 @@ class Category:
         links = [link.get('href') for link in links]
         links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
         links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
+        links = list(filter(None, [link if link != self.category_url else None for link in links]))
+        links = list(set(links))
+        if excludes:
+            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
         return links

-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        return s
-
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Sitemap: Stuff")
-    site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
+    site = Sitemap("https://www.stuff.co.nz/sitemap/news/sitemap.xml")
     posts = site.feed()
-    print(posts[:1])
-    print(site.story(posts[0]))
-
-    print("Sitemap: NZ Herald")
-    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
-    posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))

     print("Category: RadioNZ Te Ao Māori")
     site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
     posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))
+
+    print("Sitemap: Newsroom")
+    site = Sitemap("https://www.newsroom.co.nz/sitemap.xml")
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
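
The hand-rolled strptime format list in unix() is replaced by dateutil parsing plus an optional pytz localization, for feeds that publish naive local timestamps; the tz string comes from the site's settings entry. The same logic exercised on its own (the sample dates below are made up):

    # dateutil parses most ISO-ish date strings; pytz localizes naive ones to
    # the configured zone (e.g. 'Pacific/Auckland') before taking a timestamp.
    import dateutil.parser
    import pytz

    def unix(date_str, tz=None):
        try:
            dt = dateutil.parser.parse(date_str)
            if tz:
                dt = pytz.timezone(tz).localize(dt)
            return int(dt.timestamp())
        except Exception:
            return 0

    print(unix('2020-11-05T16:30:55+13:00'))                   # offset in the string
    print(unix('2020-11-05 16:30:55', tz='Pacific/Auckland'))  # naive, localized
    print(unix('not a date'))                                  # falls back to 0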

View File

@@ -12,6 +12,7 @@ from datetime import datetime
 from utils import clean

+SUBSTACK_REFERER = 'https://substack.com'
 SUBSTACK_API_TOP_POSTS = lambda x: "https://substack.com/api/v1/reader/top-posts"

 def author_link(author_id, base_url):
@@ -24,9 +25,10 @@ def api_stories(x, base_url):
 def unix(date_str):
     return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())

-def api(route, ref=None):
+def api(route, ref=None, referer=None):
+    headers = {'Referer': referer} if referer else None
     try:
-        r = requests.get(route(ref), timeout=5)
+        r = requests.get(route(ref), headers=headers, timeout=10)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
@@ -36,7 +38,7 @@ def api(route, ref=None):
         logging.error('Problem hitting Substack API: {}, trying again'.format(str(e)))

     try:
-        r = requests.get(route(ref), timeout=15)
+        r = requests.get(route(ref), headers=headers, timeout=20)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
@@ -65,12 +67,14 @@ class Publication:
         self.BASE_DOMAIN = domain

     def feed(self):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        if not stories: return []
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         return [str(i.get("id")) for i in stories or []]

     def story(self, ref):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        if not stories: return False
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
@@ -90,7 +94,7 @@ class Publication:
         s['title'] = r.get('title', '')
         s['link'] = r.get('canonical_url', '')
         s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'))
+        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN)
         s['comments'] = [comment(i) for i in comments.get('comments')]
         s['comments'] = list(filter(bool, s['comments']))
         s['num_comments'] = r.get('comment_count', 0)
@@ -113,12 +117,14 @@ class Publication:

 class Top:
     def feed(self):
-        stories = api(SUBSTACK_API_TOP_POSTS)
+        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        if not stories: return []
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         return [str(i.get("id")) for i in stories or []]

     def story(self, ref):
-        stories = api(SUBSTACK_API_TOP_POSTS)
+        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        if not stories: return False
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
@@ -140,7 +146,7 @@ class Top:
         s['title'] = r.get('title', '')
         s['link'] = r.get('canonical_url', '')
         s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, base_url), r.get('id'))
+        comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER)
         s['comments'] = [comment(i) for i in comments.get('comments')]
         s['comments'] = list(filter(bool, s['comments']))
         s['num_comments'] = r.get('comment_count', 0)
@@ -156,5 +162,4 @@ if __name__ == '__main__':
     webworm = Publication("https://www.webworm.co/")
     posts = webworm.feed()
-    print(posts[:1])
     print(webworm.story(posts[0]))
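
The api() helper above now sends a Referer header (the publication's own domain, or substack.com for the Top feed) and retries once with a longer timeout. A condensed sketch of that call pattern; the loop stands in for the two try blocks, and the httpbin URL is only a placeholder endpoint:

    # Sketch: optional Referer header plus one retry with a longer timeout.
    import logging
    import requests

    def api(route, ref=None, referer=None):
        headers = {'Referer': referer} if referer else None
        for timeout in (10, 20):            # first attempt, then a single retry
            try:
                r = requests.get(route(ref), headers=headers, timeout=timeout)
                if r.status_code != 200:
                    raise Exception('Bad response code ' + str(r.status_code))
                return r.json()
            except KeyboardInterrupt:
                raise
            except BaseException as e:
                logging.error('Problem hitting API: {}'.format(str(e)))
        return False

    data = api(lambda x: 'https://httpbin.org/json', referer='https://substack.com')
    print(bool(data))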

View File

@@ -18,6 +18,7 @@ packaging==20.4
 praw==6.4.0
 prawcore==1.4.0
 pyparsing==2.4.7
+pytz==2020.4
 requests==2.24.0
 six==1.15.0
 soupsieve==2.0.1
@@ -29,3 +30,4 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 zope.event==4.4
 zope.interface==5.1.0
+python-dateutil==2.8.1

View File

@@ -0,0 +1,41 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+DECLUTTER_API = 'https://declutter.1j.nz/details'
+DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/comments'
+TIMEOUT = 30
+
+def get_html(url):
+    logging.info(f"Declutter Scraper: {url}")
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['content']
+
+def get_details(url):
+    try:
+        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem decluttering article: {}'.format(str(e)))
+        return None
+
+def get_comments(url):
+    try:
+        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting comments for article: {}'.format(str(e)))
+        return None

View File

@@ -0,0 +1,27 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+READ_API = 'http://127.0.0.1:33843/details'
+TIMEOUT = 20
+
+def get_html(url):
+    logging.info(f"Local Scraper: {url}")
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['content']
+
+def get_details(url):
+    try:
+        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting article: {}'.format(str(e)))
+        return None

View File

@@ -0,0 +1,37 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+OUTLINE_REFERER = 'https://outline.com/'
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+TIMEOUT = 20
+
+def get_html(url):
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['html']
+
+def get_details(url):
+    try:
+        logging.info(f"Outline Scraper: {url}")
+        params = {'source_url': url}
+        headers = {'Referer': OUTLINE_REFERER}
+        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
+        if r.status_code == 429:
+            logging.info('Rate limited by outline, sleeping 30s and skipping...')
+            time.sleep(30)
+            return None
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        data = r.json()['data']
+        if 'URL is not supported by Outline' in data['html']:
+            raise Exception('URL not supported by Outline')
+        return data
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem outlining article: {}'.format(str(e)))
+        return None

View File

@@ -39,10 +39,7 @@ def update_attributes():
         r = requests.post(MEILI_URL + 'indexes/qotnews/settings/searchable-attributes', json=json, timeout=2)
         if r.status_code != 202:
             raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-        r = requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
-        if r.status_code != 202:
-            raise Exception('Bad response code ' + str(r.status_code))
+        requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
         return r.json()
     except KeyboardInterrupt:
         raise

View File

@@ -43,8 +43,7 @@ cors = CORS(flask_app)
 @flask_app.route('/api')
 def api():
     stories = database.get_stories(FEED_LENGTH)
-    # hacky nested json
-    res = Response('{"stories":[' + ','.join(stories) + ']}')
+    res = Response(json.dumps({"stories": stories}))
     res.headers['content-type'] = 'application/json'
     return res
@@ -102,8 +101,7 @@ def submit():
 def story(sid):
     story = database.get_story(sid)
     if story:
-        # hacky nested json
-        res = Response('{"story":' + story.full_json + '}')
+        res = Response(json.dumps({"story": story.data}))
         res.headers['content-type'] = 'application/json'
         return res
     else:
@@ -127,7 +125,7 @@ def static_story(sid):
     story = database.get_story(sid)
     if not story: return abort(404)
-    story = json.loads(story.full_json)
+    story = story.data

     score = story['score']
     num_comments = story['num_comments']
@@ -170,8 +168,7 @@ def feed_thread():
             item = ref_list[news_index]

             try:
-                story_json = database.get_story(item['sid']).full_json
-                story = json.loads(story_json)
+                story = database.get_story(item['sid']).data
             except AttributeError:
                 story = dict(id=item['sid'], ref=item['ref'], source=item['source'])

View File

@@ -9,19 +9,18 @@ NUM_REDDIT = 10
 NUM_TILDES = 5
 NUM_SUBSTACK = 10

-# SITEMAP = {
-#     'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
-#     'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
-# }
+SITEMAP = {}
+# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
+# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},

-# SUBSTACK = {
-#     'webworm': { 'url': "https://www.webworm.co", 'count': 10},
-#     'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
-# }
+SUBSTACK = {}
+# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
+# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},

-# CATEGORY = {
-#     'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
-# }
+CATEGORY = {}
+# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
+
+SCRAPERS = ['declutter', 'outline', 'local']

 # Reddit account info
 # leave blank if not using Reddit
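
With this settings layout, each SITEMAP / CATEGORY entry can carry count, excludes and tz alongside url, and SCRAPERS controls the order in which get_article() tries backends. A hypothetical filled-in example; the URLs, exclude substrings and timezone below are illustrative only:

    SITEMAP = {}
    SITEMAP['stuff'] = {
        'url': "https://www.stuff.co.nz/sitemap/news/sitemap.xml",
        'count': 10,
        'excludes': ['/sponsored-content/', '/puzzles/'],   # skip URLs containing these
        'tz': 'Pacific/Auckland',                           # for naive feed dates
    }

    CATEGORY = {}
    CATEGORY['rnz national'] = {
        'url': "https://www.rnz.co.nz/news/national",
        'count': 10,
        'tz': 'Pacific/Auckland',
    }

    # Tried in order; 'local' is appended as a fallback if it is missing.
    SCRAPERS = ['declutter', 'outline', 'local']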

View File

@@ -1,52 +1,14 @@
+const port = 33843;
 const express = require('express');
 const app = express();
-const port = 33843;
-const request = require('request');
-const JSDOM = require('jsdom').JSDOM;
-const { Readability } = require('readability');
+const simple = require('./simple');

 app.use(express.urlencoded({ extended: true }));

-app.get('/', (req, res) => {
-  res.send('<form method="POST" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>');
-});
-
-const requestCallback = (url, res) => (error, response, body) => {
-  if (!error && response.statusCode == 200) {
-    console.log('Response OK.');
-    const doc = new JSDOM(body, {url: url});
-    const reader = new Readability(doc.window.document);
-    const article = reader.parse();
-    if (article && article.content) {
-      res.send(article.content);
-    } else {
-      res.sendStatus(404);
-    }
-  } else {
-    console.log('Response error:', error ? error.toString() : response.statusCode);
-    res.sendStatus(response ? response.statusCode : 404);
-  }
-};
-
-app.post('/', (req, res) => {
-  const url = req.body.url;
-  const requestOptions = {
-    url: url,
-    //headers: {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'},
-    //headers: {'User-Agent': 'Twitterbot/1.0'},
-    headers: {
-      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
-      'X-Forwarded-For': '66.249.66.1',
-    },
-  };
-  console.log('Parse request for:', url);
-  request(requestOptions, requestCallback(url, res));
-});
+app.get('/', (req, res) => res.send(simple.FORM));
+app.post('/', (req, res) => simple.scrape(req, res));
+app.post('/details', (req, res) => simple.details(req, res));
+// app.post('/browser', (req, res) => browser.scrape(req, res));
+// app.post('/browser/details', (req, res) => browser.details(req, res));

 app.listen(port, () => {
   console.log(`Example app listening on port ${port}!`);

readerserver/simple.js (new file, 43 lines)
View File

@@ -0,0 +1,43 @@
+const request = require('request');
+const JSDOM = require('jsdom').JSDOM;
+const { Readability } = require('readability');
+
+const options = url => ({
+  url: url,
+  headers: {
+    'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+    'X-Forwarded-For': '66.249.66.1',
+  },
+});
+
+const extract = (url, body) => {
+  const doc = new JSDOM(body, { url: url });
+  const reader = new Readability(doc.window.document);
+  return reader.parse();
+};
+
+module.exports.FORM = '<form method="POST" action="/" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>';
+
+module.exports.scrape = (req, res) => request(options(req.body.url), (error, response, body) => {
+  if (error || response.statusCode != 200) {
+    console.log('Response error:', error ? error.toString() : response.statusCode);
+    return res.sendStatus(response ? response.statusCode : 404);
+  }
+  const article = extract(url, body);
+  if (article && article.content) {
+    return res.send(article.content);
+  }
+  return res.sendStatus(404);
+});
+
+module.exports.details = (req, res) => request(options(req.body.url), (error, response, body) => {
+  if (error || response.statusCode != 200) {
+    console.log('Response error:', error ? error.toString() : response.statusCode);
+    return res.sendStatus(response ? response.statusCode : 404);
+  }
+  const article = extract(url, body);
+  if (article) {
+    return res.send(article);
+  }
+  return res.sendStatus(404);
+});

View File

@@ -87,9 +87,12 @@ class Article extends React.Component {
         {c.author === story.author ? '[OP]' : ''} {c.author || '[Deleted]'}
         {' '} | <HashLink to={'#' + cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
-        {hidden || hasChildren &&
+        {hasChildren && (
+          hidden ?
+            <span className='collapser expander pointer' onClick={() => this.expandComment(cid)}>+</span>
+            :
             <span className='collapser pointer' onClick={() => this.collapseComment(cid)}></span>
-        }
+        )}
       </p>
     </div>

View File

@@ -50,10 +50,6 @@ class Feed extends React.Component {
     const stories = this.state.stories;
     const error = this.state.error;

-    if (stories) {
-      stories.sort((a, b) => b.date - a.date);
-    }
-
     return (
       <div className='container'>
         <Helmet>
@@ -62,15 +58,15 @@ class Feed extends React.Component {
         {error && <p>Connection error?</p>}
         {stories ?
           <div>
-            {stories.map((x, i) =>
-              <div className='item' key={i}>
+            {stories.map(x =>
+              <div className='item' key={x.id}>
                 <div className='title'>
                   <Link className='link' to={'/' + x.id}>
-                    <img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
+                    <img className='source-logo' src={logos[x.source] || logos[x.source.split(' ')[0]]} alt='source logo' /> {x.title}
                   </Link>
                   <span className='source'>
-                    &#8203;({sourceLink(x)})
+                    ({sourceLink(x)})
                   </span>
                 </div>

View File

@@ -64,15 +64,15 @@ class Results extends React.Component {
           <p>Search results:</p>
           <div className='comment lined'>
             {stories.length ?
-              stories.map((x, i) =>
-                <div className='item' key={i}>
+              stories.map(x =>
+                <div className='item' key={x.id}>
                   <div className='title'>
                     <Link className='link' to={'/' + x.id}>
                       <img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
                     </Link>
                     <span className='source'>
-                      &#8203;({sourceLink(x)})
+                      ({sourceLink(x)})
                     </span>
                   </div>

File diff suppressed because one or more lines are too long