forked from tanner/qotnews
Compare commits
33 Commits
9f4ff4acf0 ... bfa4108a8e
Commits (SHA1, newest first; author and date columns omitted):
bfa4108a8e, 0bd0d40a31, 4e04595415, 006db2960c, 1f063f0dac,
1658346aa9, 2dbc702b40, 1c4764e67d, ee49d2021e, c391c50ab1,
095f0d549a, c21c71667e, c3a2c91a11, 0f39446a61, 351059aab1,
4488e2c292, afda5b635c, 0fc1a44d2b, 9fff1b9e46, 16b59f6c67,
939f4775a7, 9bfc6fc6fa, 6ea9844d00, 1318259d3d, 98a0c2257c,
e6976db25d, 9edc8b7cca, 33e21e7f30, 892a99eca6, d718d05a04,
d1795eb1b8, 9a279d44b1, e506804666
apiserver/database.py
@@ -4,6 +4,7 @@ from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError
+from sqlalchemy.types import JSON

 engine = create_engine('sqlite:///data/qotnews.sqlite')
 Session = sessionmaker(bind=engine)
@@ -15,8 +16,8 @@ class Story(Base):

     sid = Column(String(16), primary_key=True)
     ref = Column(String(16), unique=True)
-    meta_json = Column(String)
-    full_json = Column(String)
+    meta = Column(JSON)
+    data = Column(JSON)
     title = Column(String)

 class Reflist(Base):
@@ -36,19 +37,21 @@ def get_story(sid):

 def put_story(story):
     story = story.copy()
-    full_json = json.dumps(story)
+    data = {}
+    data.update(story)

-    story.pop('text', None)
-    story.pop('comments', None)
-    meta_json = json.dumps(story)
+    meta = {}
+    meta.update(story)
+    meta.pop('text', None)
+    meta.pop('comments', None)

     try:
         session = Session()
         s = Story(
             sid=story['id'],
             ref=story['ref'],
-            full_json=full_json,
-            meta_json=meta_json,
+            data=data,
+            meta=meta,
             title=story.get('title', None),
         )
         session.merge(s)
@@ -70,10 +73,10 @@ def get_reflist(amount):

 def get_stories(amount):
     session = Session()
-    q = session.query(Reflist, Story.meta_json).\
-        order_by(Reflist.rid.desc()).\
+    q = session.query(Reflist, Story.meta).\
         join(Story).\
         filter(Story.title != None).\
+        order_by(Story.meta['date'].desc()).\
         limit(amount)
     return [x[1] for x in q]
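The meta_json/full_json String columns become native JSON columns, so the ORM handles serialisation both ways and queries can index into the document, which is what order_by(Story.meta['date'].desc()) relies on above. A minimal round-trip sketch, not from the repo (the Item model and in-memory engine are illustrative):

# Minimal JSON-column sketch: dicts in, dicts out, indexed access in queries.
# Requires SQLite's JSON1 extension, bundled with modern Python builds.
from sqlalchemy import create_engine, Column, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.types import JSON

Base = declarative_base()

class Item(Base):  # illustrative model, not qotnews's Story
    __tablename__ = 'items'
    sid = Column(String(16), primary_key=True)
    meta = Column(JSON)

engine = create_engine('sqlite://')  # throwaway in-memory database
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

session.add(Item(sid='a', meta={'date': 1600000000, 'title': 'older'}))
session.add(Item(sid='b', meta={'date': 1700000000, 'title': 'newer'}))
session.commit()

newest = session.query(Item).order_by(Item.meta['date'].desc()).first()
print(newest.meta['title'])  # -> 'newer'; no json.dumps/loads anywhere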
apiserver/feed.py
@@ -9,22 +9,23 @@ from bs4 import BeautifulSoup

 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
+from scrapers import outline, declutter, local

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-READ_API = 'http://127.0.0.1:33843'
+ONE_HOUR = 60*60
+ONE_DAY = 24*ONE_HOUR

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
-TWO_DAYS = 60*60*24*2
+MAX_AGE_IN_DAYS = 3*ONE_DAY

 substacks = {}
 for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'])
+    categories[key] = news.Category(value['url'], value.get('tz'))
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'])
+    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))

 def list():
     feed = []
@@ -45,53 +46,49 @@ def list():
         feed += [(x, key) for x in publication.feed()[:count]]

     for key, sites in categories.items():
-        count = settings.CATEGORY[key]['count']
-        feed += [(x, key) for x in sites.feed()[:count]]
+        count = settings.CATEGORY[key].get('count') or 0
+        excludes = settings.CATEGORY[key].get('excludes')
+        tz = settings.CATEGORY[key].get('tz')
+        feed += [(x, key) for x in sites.feed(excludes)[:count]]

     for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key]['count']
-        feed += [(x, key) for x in sites.feed()[:count]]
+        count = settings.SITEMAP[key].get('count') or 0
+        excludes = settings.SITEMAP[key].get('excludes')
+        feed += [(x, key) for x in sites.feed(excludes)[:count]]

     return feed

 def get_article(url):
-    try:
-        params = {'source_url': url}
-        headers = {'Referer': 'https://outline.com/'}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return ''
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        html = r.json()['data']['html']
-        if 'URL is not supported by Outline' in html:
-            raise Exception('URL not supported by Outline')
-        return html
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
-
-    logging.info('Trying our server instead...')
-
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=20)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
+    scrapers = {
+        'declutter': declutter,
+        'outline': outline,
+        'local': local,
+    }
+    available = settings.SCRAPERS or ['local']
+    if 'local' not in available:
+        available += ['local']
+
+    for scraper in available:
+        if scraper not in scrapers.keys():
+            continue
+        try:
+            html = scrapers[scraper].get_html(url)
+            if html:
+                return html
+        except KeyboardInterrupt:
+            raise
+        except:
+            pass
     return ''

 def get_content_type(url):
     try:
-        headers = {'User-Agent': 'Twitterbot/1.0'}
-        return requests.get(url, headers=headers, timeout=2).headers['content-type']
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+            'X-Forwarded-For': '66.249.66.1',
+        }
+        return requests.get(url, headers=headers, timeout=5).headers['content-type']
     except:
         pass
@@ -127,7 +124,7 @@ def update_story(story, is_manual=False):
         logging.info('Story not ready yet')
         return False

-    if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
+    if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time():
         logging.info('Story too old, removing')
         return False
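get_article() now walks the scrapers listed in settings.SCRAPERS and falls through to the next backend on any failure, always keeping 'local' as a last resort. The pattern in isolation (the stub fetchers below are illustrative stand-ins for the real scrapers.* modules, each of which exposes get_html(url)):

# Fallback-chain sketch: try each backend in order, return the first
# non-empty result. Stub functions stand in for the real scraper modules.
def declutter_get_html(url): raise TimeoutError('service down')
def outline_get_html(url): return ''            # empty result: keep trying
def local_get_html(url): return '<p>article</p>'

SCRAPERS = {
    'declutter': declutter_get_html,
    'outline': outline_get_html,
    'local': local_get_html,
}

def get_article(url, available=('declutter', 'outline', 'local')):
    for name in available:
        fetch = SCRAPERS.get(name)
        if not fetch:
            continue  # unknown name in settings: skip, as feed.py does
        try:
            html = fetch(url)
            if html:
                return html
        except KeyboardInterrupt:
            raise
        except Exception:
            pass      # any scraper error just advances the chain
    return ''

print(get_article('https://example.com'))  # -> '<p>article</p>' via 'local'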
apiserver/feeds/news.py
@@ -10,29 +10,27 @@ if __name__ == '__main__':
 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
+from scrapers import declutter
+import dateutil.parser
 import extruct
+import pytz

 from utils import clean

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
+#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

-def unix(date_str):
-    date_tzfix = date_str
-    if ":" == date_tzfix[-3]:
-        date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
-    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
-    for f in formats:
-        try:
-            return int(datetime.strptime(date_str, f).timestamp())
-        except:
-            pass
-        try:
-            return int(datetime.strptime(date_tzfix, f).timestamp())
-        except:
-            pass
+def unix(date_str, tz=None):
+    try:
+        dt = dateutil.parser.parse(date_str)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    except:
+        pass
     return 0

 def xml(route, ref=None):
     try:
         headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
@@ -46,6 +44,7 @@ def xml(route, ref=None):
         logging.error('Problem hitting URL: {}'.format(str(e)))
         return False

+
 def parse_extruct(s, data):
     for rdfa in data['rdfa']:
         for key, props in rdfa.items():
@@ -54,22 +53,19 @@ def parse_extruct(s, data):
                     s['title'] = values['@value']
             if 'http://ogp.me/ns/article#modified_time' in props:
                 for values in props['http://ogp.me/ns/article#modified_time']:
-                    print(f"modified_time: {values['@value']}")
-                    s['date'] = unix(values['@value'])
+                    s['date'] = values['@value']
             if 'http://ogp.me/ns/article#published_time' in props:
                 for values in props['http://ogp.me/ns/article#published_time']:
-                    print(f"published_time: {values['@value']}")
-                    s['date'] = unix(values['@value'])
+                    s['date'] = values['@value']

     for og in data['opengraph']:
         titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
         modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
         published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
         if len(modified):
-            s['date'] = unix(modified[0])
+            s['date'] = modified[0]
         if len(published):
-            s['date'] = unix(published[0])
-        s['date'] = unix(published[0] or modified[0] or '')
+            s['date'] = published[0]
         if len(titles):
             s['title'] = titles[0]
@@ -78,35 +74,56 @@ def parse_extruct(s, data):
         props = md['properties']
         s['title'] = props['headline']
         if props['dateModified']:
-            s['date'] = unix(props['dateModified'])
+            s['date'] = props['dateModified']
         if props['datePublished']:
-            s['date'] = unix(props['datePublished'])
+            s['date'] = props['datePublished']
         if 'author' in props and props['author']:
             s['author'] = props['author']['properties']['name']

     for ld in data['json-ld']:
-        if ld['@type'] == 'Article':
+        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
             s['title'] = ld['headline']
             if ld['dateModified']:
-                s['date'] = unix(ld['dateModified'])
+                s['date'] = ld['dateModified']
             if ld['datePublished']:
-                s['date'] = unix(ld['datePublished'])
+                s['date'] = ld['datePublished']
             if 'author' in ld and ld['author']:
                 s['author'] = ld['author']['name']
+        if '@graph' in ld:
+            for gld in ld['@graph']:
+                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
+                    s['title'] = gld['headline']
+                    if gld['dateModified']:
+                        s['date'] = gld['dateModified']
+                    if gld['datePublished']:
+                        s['date'] = gld['datePublished']

     return s

-class Sitemap:
-    def __init__(self, url):
-        self.sitemap_url = url
-
-    def feed(self):
-        markup = xml(lambda x: self.sitemap_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        return [x.find('loc').text for x in articles] or []
+def comment(i):
+    if 'author' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('author', '')
+    c['score'] = i.get('points', 0)
+    c['date'] = unix(i.get('date', 0))
+    c['text'] = clean(i.get('text', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    alive = 1 if i['author'] else 0
+    return sum([comment_count(c) for c in i['comments']]) + alive
+
+class _Base:
+    def __init__(url, tz=None):
+        self.url = url
+        self.tz = tz
+
+    def feed(self, excludes=None):
+        return []

     def story(self, ref):
         markup = xml(lambda x: ref)
@@ -124,14 +141,58 @@ class Sitemap:

         data = extruct.extract(markup)
         s = parse_extruct(s, data)
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            try:
+                s['comments'] = declutter.get_comments(ref)
+                c['comments'] = list(filter(bool, c['comments']))
+                s['num_comments'] = comment_count(s['comments'])
+            except KeyboardInterrupt:
+                raise
+            except:
+                pass
+
+        if not s['date']:
+            return False
         return s

-class Category:
-    def __init__(self, url):
+def get_sitemap_date(a):
+    if a.find('lastmod'):
+        return a.find('lastmod').text
+    if a.find('news:publication_date'):
+        return a.find('news:publication_date').text
+    return ''
+
+class Sitemap(_Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.sitemap_url = url
+
+    def feed(self, excludes=None):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return []
+        soup = BeautifulSoup(markup, features='lxml')
+        sitemap = soup.find('urlset').findAll('url')
+
+        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
+        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
+        links = [x.find('loc').text for x in links] or []
+        links = list(set(links))
+        if excludes:
+            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+        return links
+
+class Category(_Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
         self.category_url = url
         self.base_url = '/'.join(url.split('/')[:3])

-    def feed(self):
+    def feed(self, excludes=None):
         markup = xml(lambda x: self.category_url)
         if not markup: return []
         soup = BeautifulSoup(markup, features='html.parser')
@@ -139,42 +200,30 @@ class Category:
         links = [link.get('href') for link in links]
         links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
         links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
+        links = list(filter(None, [link if link != self.category_url else None for link in links]))
+        links = list(set(links))
+        if excludes:
+            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
         return links

-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        return s
-
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Sitemap: Stuff")
-    site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
+    site = Sitemap("https://www.stuff.co.nz/sitemap/news/sitemap.xml")
     posts = site.feed()
-    print(posts[:1])
-    print(site.story(posts[0]))
-
-    print("Sitemap: NZ Herald")
-    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
-    posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))

     print("Category: RadioNZ Te Ao Māori")
     site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
     posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))
+
+    print("Sitemap: Newsroom")
+    site = Sitemap("https://www.newsroom.co.nz/sitemap.xml")
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
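The hand-rolled strptime format list in unix() is replaced by dateutil parsing plus optional pytz localisation, which handles sitemap dates that arrive without a UTC offset. A standalone sketch of that behaviour (example values are illustrative):

# dateutil copes with most ISO-ish formats; pytz.localize() pins naive
# timestamps (common in sitemaps that omit an offset) to the site's zone.
import dateutil.parser
import pytz

def unix(date_str, tz=None):
    try:
        dt = dateutil.parser.parse(date_str)
        if tz:
            dt = pytz.timezone(tz).localize(dt)  # raises if dt already aware
        return int(dt.timestamp())
    except (ValueError, OverflowError):
        return 0

print(unix('2020-11-02T13:00:00+13:00'))                   # offset respected
print(unix('2020-11-02T13:00:00', tz='Pacific/Auckland'))  # naive -> localised
print(unix('not a date'))                                  # -> 0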
apiserver/feeds/substack.py
@@ -12,6 +12,7 @@ from datetime import datetime

 from utils import clean

+SUBSTACK_REFERER = 'https://substack.com'
 SUBSTACK_API_TOP_POSTS = lambda x: "https://substack.com/api/v1/reader/top-posts"

 def author_link(author_id, base_url):
@@ -24,9 +25,10 @@ def api_stories(x, base_url):
 def unix(date_str):
     return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())

-def api(route, ref=None):
+def api(route, ref=None, referer=None):
+    headers = {'Referer': referer} if referer else None
     try:
-        r = requests.get(route(ref), timeout=5)
+        r = requests.get(route(ref), headers=headers, timeout=10)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
@@ -36,7 +38,7 @@ def api(route, ref=None):
     logging.error('Problem hitting Substack API: {}, trying again'.format(str(e)))

     try:
-        r = requests.get(route(ref), timeout=15)
+        r = requests.get(route(ref), headers=headers, timeout=20)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
@@ -65,12 +67,14 @@ class Publication:
         self.BASE_DOMAIN = domain

     def feed(self):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        if not stories: return []
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         return [str(i.get("id")) for i in stories or []]

     def story(self, ref):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        if not stories: return False
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
@@ -90,7 +94,7 @@ class Publication:
         s['title'] = r.get('title', '')
         s['link'] = r.get('canonical_url', '')
         s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'))
+        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN)
         s['comments'] = [comment(i) for i in comments.get('comments')]
         s['comments'] = list(filter(bool, s['comments']))
         s['num_comments'] = r.get('comment_count', 0)
@@ -113,12 +117,14 @@ class Publication:

 class Top:
     def feed(self):
-        stories = api(SUBSTACK_API_TOP_POSTS)
+        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        if not stories: return []
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         return [str(i.get("id")) for i in stories or []]

     def story(self, ref):
-        stories = api(SUBSTACK_API_TOP_POSTS)
+        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        if not stories: return False
         stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
         stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
@@ -140,7 +146,7 @@ class Top:
         s['title'] = r.get('title', '')
         s['link'] = r.get('canonical_url', '')
         s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, base_url), r.get('id'))
+        comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER)
         s['comments'] = [comment(i) for i in comments.get('comments')]
         s['comments'] = list(filter(bool, s['comments']))
         s['num_comments'] = r.get('comment_count', 0)
@@ -156,5 +162,4 @@ if __name__ == '__main__':

     webworm = Publication("https://www.webworm.co/")
     posts = webworm.feed()
-    print(posts[:1])
     print(webworm.story(posts[0]))
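The api() helper now threads an optional Referer header through both attempts and lengthens the timeouts. A reduced sketch of that retry shape (the endpoint mirrors the constants above; the loop is a simplification of the diff's two separate try blocks):

# One quick attempt, then a slower retry, both carrying the same
# optional Referer header. Error handling is simplified for brevity.
import requests

def api(url, referer=None):
    headers = {'Referer': referer} if referer else None
    for timeout in (10, 20):  # the second attempt gets a longer timeout
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            if r.status_code != 200:
                raise Exception('Bad response code ' + str(r.status_code))
            return r.json()
        except Exception as e:
            print('Problem hitting Substack API:', e)
    return False

# api('https://substack.com/api/v1/reader/top-posts', referer='https://substack.com')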
apiserver/requirements.txt
@@ -18,6 +18,7 @@ packaging==20.4
 praw==6.4.0
 prawcore==1.4.0
 pyparsing==2.4.7
+pytz==2020.4
 requests==2.24.0
 six==1.15.0
 soupsieve==2.0.1
@@ -29,3 +30,4 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 zope.event==4.4
 zope.interface==5.1.0
+python-dateutil==2.8.1
apiserver/scrapers/declutter.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+DECLUTTER_API = 'https://declutter.1j.nz/details'
+DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/comments'
+TIMEOUT = 30
+
+def get_html(url):
+    logging.info(f"Declutter Scraper: {url}")
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['content']
+
+def get_details(url):
+    try:
+        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem decluttering article: {}'.format(str(e)))
+        return None
+
+def get_comments(url):
+    try:
+        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting comments for article: {}'.format(str(e)))
+        return None
apiserver/scrapers/local.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+READ_API = 'http://127.0.0.1:33843/details'
+TIMEOUT = 20
+
+def get_html(url):
+    logging.info(f"Local Scraper: {url}")
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['content']
+
+def get_details(url):
+    try:
+        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting article: {}'.format(str(e)))
+        return None
apiserver/scrapers/outline.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+OUTLINE_REFERER = 'https://outline.com/'
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+TIMEOUT = 20
+
+def get_html(url):
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['html']
+
+def get_details(url):
+    try:
+        logging.info(f"Outline Scraper: {url}")
+        params = {'source_url': url}
+        headers = {'Referer': OUTLINE_REFERER}
+        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
+        if r.status_code == 429:
+            logging.info('Rate limited by outline, sleeping 30s and skipping...')
+            time.sleep(30)
+            return None
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        data = r.json()['data']
+        if 'URL is not supported by Outline' in data['html']:
+            raise Exception('URL not supported by Outline')
+        return data
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem outlining article: {}'.format(str(e)))
+        return None
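All three new scraper modules expose the same two functions: get_html(url) returning '' on failure, and get_details(url) returning a dict or None. That duck-typed contract is the only thing feed.get_article() depends on, so backends can be reordered or swapped from settings alone. A sketch of driving them interchangeably (assumes the apiserver package layout above; the URL is a placeholder):

from scrapers import declutter, outline, local

for module in (declutter, outline, local):
    html = module.get_html('https://example.com/article')
    if html:
        print(module.__name__, 'returned', len(html), 'characters')
        break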
apiserver/search.py
@@ -39,10 +39,7 @@ def update_attributes():
         r = requests.post(MEILI_URL + 'indexes/qotnews/settings/searchable-attributes', json=json, timeout=2)
         if r.status_code != 202:
             raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-        r = requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
-        if r.status_code != 202:
-            raise Exception('Bad response code ' + str(r.status_code))
+        requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
         return r.json()
     except KeyboardInterrupt:
         raise
apiserver/server.py
@@ -43,8 +43,7 @@ cors = CORS(flask_app)
 @flask_app.route('/api')
 def api():
     stories = database.get_stories(FEED_LENGTH)
-    # hacky nested json
-    res = Response('{"stories":[' + ','.join(stories) + ']}')
+    res = Response(json.dumps({"stories": stories}))
     res.headers['content-type'] = 'application/json'
     return res
@@ -102,8 +101,7 @@ def submit():
 def story(sid):
     story = database.get_story(sid)
     if story:
-        # hacky nested json
-        res = Response('{"story":' + story.full_json + '}')
+        res = Response(json.dumps({"story": story.data}))
         res.headers['content-type'] = 'application/json'
         return res
     else:
@@ -127,7 +125,7 @@ def static_story(sid):

     story = database.get_story(sid)
     if not story: return abort(404)
-    story = json.loads(story.full_json)
+    story = story.data

     score = story['score']
     num_comments = story['num_comments']
@@ -170,8 +168,7 @@ def feed_thread():
             item = ref_list[news_index]

             try:
-                story_json = database.get_story(item['sid']).full_json
-                story = json.loads(story_json)
+                story = database.get_story(item['sid']).data
             except AttributeError:
                 story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
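The "hacky nested json" concatenation was only needed while get_stories() returned pre-serialised strings; with the JSON columns it returns plain dicts, so one json.dumps() builds the whole payload. A small illustration of the equivalence (values are made up):

import json

# before: each element was already a serialised JSON string
stories_as_strings = ['{"id": "abc", "title": "one"}', '{"id": "def", "title": "two"}']
body_old = '{"stories":[' + ','.join(stories_as_strings) + ']}'

# after: plain dicts straight from the ORM
stories_as_dicts = [{'id': 'abc', 'title': 'one'}, {'id': 'def', 'title': 'two'}]
body_new = json.dumps({"stories": stories_as_dicts})

assert json.loads(body_old) == json.loads(body_new)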
apiserver/settings.py
@@ -9,19 +9,18 @@ NUM_REDDIT = 10
 NUM_TILDES = 5
 NUM_SUBSTACK = 10

-# SITEMAP = {
-# 'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
-# 'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
-# }
+SITEMAP = {}
+# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
+# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},

-# SUBSTACK = {
-# 'webworm': { 'url': "https://www.webworm.co", 'count': 10},
-# 'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
-# }
+SUBSTACK = {}
+# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
+# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},

-# CATEGORY = {
-# 'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
-# }
+CATEGORY = {}
+# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},

+SCRAPERS = ['declutter', 'outline', 'local']

 # Reddit account info
 # leave blank if not using Reddit
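feed.py now reads optional 'count', 'excludes', and 'tz' keys from each source dict. A hypothetical enabled entry showing all three (the URL comes from the commented examples above; the excludes substrings are invented):

# settings.py sketch: one enabled sitemap source. 'excludes' substrings
# filter links out of the feed; 'tz' localises naive sitemap dates.
SITEMAP = {}
SITEMAP['nzherald'] = {
    'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
    'count': 10,
    'excludes': ['/sport/', '/entertainment/'],  # hypothetical filters
    'tz': 'Pacific/Auckland',
}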
readerserver/main.js
@@ -1,52 +1,14 @@
+const port = 33843;
 const express = require('express');
 const app = express();
-const port = 33843;
-
-const request = require('request');
-const JSDOM = require('jsdom').JSDOM;
-const { Readability } = require('readability');
+const simple = require('./simple');

 app.use(express.urlencoded({ extended: true }));
-
-app.get('/', (req, res) => {
-    res.send('<form method="POST" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>');
-});
-
-const requestCallback = (url, res) => (error, response, body) => {
-    if (!error && response.statusCode == 200) {
-        console.log('Response OK.');
-
-        const doc = new JSDOM(body, {url: url});
-        const reader = new Readability(doc.window.document);
-        const article = reader.parse();
-
-        if (article && article.content) {
-            res.send(article.content);
-        } else {
-            res.sendStatus(404);
-        }
-    } else {
-        console.log('Response error:', error ? error.toString() : response.statusCode);
-        res.sendStatus(response ? response.statusCode : 404);
-    }
-};
-
-app.post('/', (req, res) => {
-    const url = req.body.url;
-    const requestOptions = {
-        url: url,
-        //headers: {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'},
-        //headers: {'User-Agent': 'Twitterbot/1.0'},
-        headers: {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
-            'X-Forwarded-For': '66.249.66.1',
-        },
-    };
-
-    console.log('Parse request for:', url);
-
-    request(requestOptions, requestCallback(url, res));
-});
+app.get('/', (req, res) => res.send(simple.FORM));
+app.post('/', (req, res) => simple.scrape(req, res));
+app.post('/details', (req, res) => simple.details(req, res));
+// app.post('/browser', (req, res) => browser.scrape(req, res));
+// app.post('/browser/details', (req, res) => browser.details(req, res));

 app.listen(port, () => {
     console.log(`Example app listening on port ${port}!`);
readerserver/simple.js (new file, 43 lines)
@@ -0,0 +1,43 @@
+const request = require('request');
+const JSDOM = require('jsdom').JSDOM;
+const { Readability } = require('readability');
+
+const options = url => ({
+    url: url,
+    headers: {
+        'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+        'X-Forwarded-For': '66.249.66.1',
+    },
+});
+
+const extract = (url, body) => {
+    const doc = new JSDOM(body, { url: url });
+    const reader = new Readability(doc.window.document);
+    return reader.parse();
+};
+
+module.exports.FORM = '<form method="POST" action="/" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>';
+
+module.exports.scrape = (req, res) => request(options(req.body.url), (error, response, body) => {
+    if (error || response.statusCode != 200) {
+        console.log('Response error:', error ? error.toString() : response.statusCode);
+        return res.sendStatus(response ? response.statusCode : 404);
+    }
+    const article = extract(url, body);
+    if (article && article.content) {
+        return res.send(article.content);
+    }
+    return res.sendStatus(404);
+});
+
+module.exports.details = (req, res) => request(options(req.body.url), (error, response, body) => {
+    if (error || response.statusCode != 200) {
+        console.log('Response error:', error ? error.toString() : response.statusCode);
+        return res.sendStatus(response ? response.statusCode : 404);
+    }
+    const article = extract(url, body);
+    if (article) {
+        return res.send(article);
+    }
+    return res.sendStatus(404);
+});
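After the split, the reader server keeps POST / returning extracted article HTML and adds POST /details returning Readability's whole parse object as JSON, which apiserver/scrapers/local.py consumes. A quick manual check against a locally running instance (the target URL is a placeholder):

import requests

html = requests.post('http://127.0.0.1:33843/',
                     data={'url': 'https://example.com'}, timeout=20)
print(html.text[:200])              # extracted article HTML

details = requests.post('http://127.0.0.1:33843/details',
                        data={'url': 'https://example.com'}, timeout=20)
print(details.json().get('title'))  # Readability's parsed fields as JSON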
webclient/src/Article.js
@@ -87,9 +87,12 @@ class Article extends React.Component {
             {c.author === story.author ? '[OP]' : ''} {c.author || '[Deleted]'}
             {' '} | <HashLink to={'#' + cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
-            {hidden || hasChildren &&
+            {hasChildren && (
+                hidden ?
+                    <span className='collapser expander pointer' onClick={() => this.expandComment(cid)}>+</span>
+                    :
                     <span className='collapser pointer' onClick={() => this.collapseComment(cid)}>–</span>
-            }
+            )}
         </p>
     </div>
webclient/src/Feed.js
@@ -50,10 +50,6 @@ class Feed extends React.Component {
         const stories = this.state.stories;
         const error = this.state.error;

-        if (stories) {
-            stories.sort((a, b) => b.date - a.date);
-        }
-
         return (
             <div className='container'>
                 <Helmet>
@@ -62,15 +58,15 @@ class Feed extends React.Component {
                 {error && <p>Connection error?</p>}
                 {stories ?
                     <div>
-                        {stories.map((x, i) =>
-                            <div className='item' key={i}>
+                        {stories.map(x =>
+                            <div className='item' key={x.id}>
                                 <div className='title'>
                                     <Link className='link' to={'/' + x.id}>
-                                        <img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
+                                        <img className='source-logo' src={logos[x.source] || logos[x.source.split(' ')[0]]} alt='source logo' /> {x.title}
                                     </Link>

                                     <span className='source'>
-                                        ​({sourceLink(x)})
+                                        ({sourceLink(x)})
                                     </span>
                                 </div>
webclient/src/Results.js
@@ -64,15 +64,15 @@ class Results extends React.Component {
                 <p>Search results:</p>
                 <div className='comment lined'>
                     {stories.length ?
-                        stories.map((x, i) =>
-                            <div className='item' key={i}>
+                        stories.map(x =>
+                            <div className='item' key={x.id}>
                                 <div className='title'>
                                     <Link className='link' to={'/' + x.id}>
                                         <img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
                                     </Link>

                                     <span className='source'>
-                                        ​({sourceLink(x)})
+                                        ({sourceLink(x)})
                                     </span>
                                 </div>
File diff suppressed because one or more lines are too long