Merge remote-tracking branch 'tanner/master'

use json type in sqlite.
fix search.
2020-11-09 16:08:28 +13:00 · 2020-11-09 15:45:10 +13:00 · 2020-11-09 15:44:44 +13:00 · 2020-11-09 01:36:51 +00:00 · 2020-11-06 11:20:34 +13:00 · 2020-11-06 10:37:43 +13:00
17 changed files with 376 additions and 216 deletions
@@ -4,6 +4,7 @@ from sqlalchemy import create_engine, Column, String, ForeignKey, Integer
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError
+from sqlalchemy.types import JSON

 engine = create_engine('sqlite:///data/qotnews.sqlite')
 Session = sessionmaker(bind=engine)
@@ -15,8 +16,8 @@ class Story(Base):

    sid = Column(String(16), primary_key=True)
    ref = Column(String(16), unique=True)
-    meta_json = Column(String)
-    full_json = Column(String)
+    meta = Column(JSON)
+    data = Column(JSON)
    title = Column(String)

 class Reflist(Base):
@@ -36,19 +37,21 @@ def get_story(sid):

 def put_story(story):
    story = story.copy()
-    full_json = json.dumps(story)
+    data = {}
+    data.update(story)

-    story.pop('text', None)
-    story.pop('comments', None)
-    meta_json = json.dumps(story)
+    meta = {}
+    meta.update(story)
+    meta.pop('text', None)
+    meta.pop('comments', None)

    try:
        session = Session()
        s = Story(
            sid=story['id'],
            ref=story['ref'],
-            full_json=full_json,
-            meta_json=meta_json,
+            data=data,
+            meta=meta,
            title=story.get('title', None),
        )
        session.merge(s)
@@ -70,10 +73,10 @@ def get_reflist(amount):

 def get_stories(amount):
    session = Session()
-    q = session.query(Reflist, Story.meta_json).\
-            order_by(Reflist.rid.desc()).\
+    q = session.query(Reflist, Story.meta).\
            join(Story).\
            filter(Story.title != None).\
+            order_by(Story.meta['date'].desc()).\
            limit(amount)
    return [x[1] for x in q]

@@ -9,22 +9,23 @@ from bs4 import BeautifulSoup

 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
+from scrapers import outline, declutter, local

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-READ_API = 'http://127.0.0.1:33843'
+ONE_HOUR = 60*60
+ONE_DAY = 24*ONE_HOUR

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
-TWO_DAYS = 60*60*24*2
+MAX_AGE_IN_DAYS = 3*ONE_DAY  # NOTE: despite the name this is a duration in seconds (ONE_DAY = 24*60*60)

 substacks = {}
 for key, value in settings.SUBSTACK.items():
    substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'])
+    categories[key] = news.Category(value['url'], value.get('tz'))
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'])
+    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))

 def list():
    feed = []
@@ -45,53 +46,49 @@ def list():
        feed += [(x, key) for x in publication.feed()[:count]]

    for key, sites in categories.items():
-        count = settings.CATEGORY[key]['count']
-        feed += [(x, key) for x in sites.feed()[:count]]
+        count = settings.CATEGORY[key].get('count') or 0
+        excludes = settings.CATEGORY[key].get('excludes')
+        tz = settings.CATEGORY[key].get('tz')
+        feed += [(x, key) for x in sites.feed(excludes)[:count]]

    for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key]['count']
-        feed += [(x, key) for x in sites.feed()[:count]]
+        count = settings.SITEMAP[key].get('count') or 0
+        excludes = settings.SITEMAP[key].get('excludes')
+        feed += [(x, key) for x in sites.feed(excludes)[:count]]


    return feed

 def get_article(url):
+    scrapers = {
+        'declutter': declutter,
+        'outline': outline,
+        'local': local,
+    }
+    available = settings.SCRAPERS or ['local']
+    if 'local' not in available:
+        available += ['local']
+
+    for scraper in available:
+        if scraper not in scrapers.keys():
+            continue
        try:
-        params = {'source_url': url}
-        headers = {'Referer': 'https://outline.com/'}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return ''
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        html = r.json()['data']['html']
-        if 'URL is not supported by Outline' in html:
-            raise Exception('URL not supported by Outline')
+            html = scrapers[scraper].get_html(url)
+            if html:
                return html
        except KeyboardInterrupt:
            raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
-
-    logging.info('Trying our server instead...')
-
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=20)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
+        except:
+            pass
    return ''

 def get_content_type(url):
    try:
-        headers = {'User-Agent': 'Twitterbot/1.0'}
-        return requests.get(url, headers=headers, timeout=2).headers['content-type']
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+            'X-Forwarded-For': '66.249.66.1',
+        }
+        return requests.get(url, headers=headers, timeout=5).headers['content-type']
    except:
        pass

@@ -127,7 +124,7 @@ def update_story(story, is_manual=False):
        logging.info('Story not ready yet')
        return False

-    if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
+    if story['date'] and not is_manual and story['date'] + MAX_AGE_IN_DAYS < time.time():
        logging.info('Story too old, removing')
        return False

@@ -10,29 +10,27 @@ if __name__ == '__main__':
 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
+from scrapers import declutter
+import dateutil.parser
 import extruct
+import pytz

 from utils import clean

-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
+#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

-def unix(date_str):
-    date_tzfix = date_str
-    if ":" == date_tzfix[-3]:
-        date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
-    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
-    for f in formats:
+def unix(date_str, tz=None):
    try:
-            return int(datetime.strptime(date_str, f).timestamp())
-        except:
-            pass
-        try:
-            return int(datetime.strptime(date_tzfix, f).timestamp())
+        dt = dateutil.parser.parse(date_str)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
    except:
        pass
    return 0

+
 def xml(route, ref=None):
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
@@ -46,6 +44,7 @@ def xml(route, ref=None):
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False

+
 def parse_extruct(s, data):
    for rdfa in data['rdfa']:
            for key, props in rdfa.items():
@@ -54,22 +53,19 @@ def parse_extruct(s, data):
                        s['title'] = values['@value']
                if 'http://ogp.me/ns/article#modified_time' in props:
                    for values in props['http://ogp.me/ns/article#modified_time']:
-                        print(f"modified_time: {values['@value']}")
-                        s['date'] = unix(values['@value'])
+                        s['date'] = values['@value']
                if 'http://ogp.me/ns/article#published_time' in props:
                    for values in props['http://ogp.me/ns/article#published_time']:
-                        print(f"published_time: {values['@value']}")
-                        s['date'] = unix(values['@value'])
+                        s['date'] = values['@value']

    for og in data['opengraph']:
        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
        if len(modified):
-            s['date'] = unix(modified[0])
+            s['date'] = modified[0]
        if len(published):
-            s['date'] = unix(published[0])
-            s['date'] = unix(published[0] or modified[0] or '')
+            s['date'] = published[0]
        if len(titles):
            s['title'] = titles[0]

@@ -78,35 +74,56 @@ def parse_extruct(s, data):
            props = md['properties']
            s['title'] = props['headline']
            if props['dateModified']:
-                s['date'] = unix(props['dateModified'])
+                s['date'] = props['dateModified']
            if props['datePublished']:
-                s['date'] = unix(props['datePublished'])
+                s['date'] = props['datePublished']
            if 'author' in props and props['author']:
                s['author'] = props['author']['properties']['name']

    for ld in data['json-ld']:
-        if ld['@type'] == 'Article':
+        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
            s['title'] = ld['headline']
            if ld['dateModified']:
-                s['date'] = unix(ld['dateModified'])
+                s['date'] = ld['dateModified']
            if ld['datePublished']:
-                s['date'] = unix(ld['datePublished'])
+                s['date'] = ld['datePublished']
            if 'author' in ld and ld['author']:
                s['author'] = ld['author']['name']
+        if '@graph' in ld:
+            for gld in ld['@graph']:
+                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
+                    s['title'] = gld['headline']
+                    if gld['dateModified']:
+                        s['date'] = gld['dateModified']
+                    if gld['datePublished']:
+                        s['date'] = gld['datePublished']

    return s

-class Sitemap:
-    def __init__(self, url):
-        self.sitemap_url = url
+def comment(i):
+    if 'author' not in i:
+        return False

-    def feed(self):
-        markup = xml(lambda x: self.sitemap_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        return [x.find('loc').text for x in articles] or []
+    c = {}
+    c['author'] = i.get('author', '')
+    c['score'] = i.get('points', 0)
+    c['date'] = unix(i.get('date', 0))
+    c['text'] = clean(i.get('text', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    """Count the comments in the tree rooted at `i` whose 'author' is truthy (`i` itself included)."""
+    return int(bool(i['author'])) + sum(comment_count(child) for child in i['comments'])
+
+class _Base:
+    def __init__(self, url, tz=None):  # `self` was missing from the signature, so the body raised NameError
+        self.url = url  # source URL as given by the caller
+        self.tz = tz  # optional timezone name; story() hands it to unix() when converting dates
+
+    def feed(self, excludes=None):
+        return []

    def story(self, ref):
        markup = xml(lambda x: ref)
@@ -124,14 +141,58 @@ class Sitemap:

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            try:
+                s['comments'] = declutter.get_comments(ref)
+                c['comments'] = list(filter(bool, c['comments']))
+                s['num_comments'] = comment_count(s['comments'])
+            except KeyboardInterrupt:
+                raise
+            except:
+                pass
+
+        if not s['date']:
+            return False
        return s

-class Category:
-    def __init__(self, url):
+def get_sitemap_date(a):
+    """Return the text of the entry's <lastmod>, else of its <news:publication_date>, else ''."""
+    for tag_name in ('lastmod', 'news:publication_date'):
+        if a.find(tag_name):
+            return a.find(tag_name).text
+    return ''
+
+class Sitemap(_Base):
+    """News source whose story URLs are read from the <url> entries of an XML sitemap."""
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.sitemap_url = url
+
+    def feed(self, excludes=None):
+        """Return dated <loc> URLs, newest first, de-duplicated, minus any containing an `excludes` substring."""
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return []
+        urlset = BeautifulSoup(markup, features='lxml').find('urlset')
+        if urlset is None: return []  # no <urlset> element (e.g. a sitemap index): nothing to list
+        entries = [a for a in urlset.findAll('url') if a.find('loc') and get_sitemap_date(a)]
+        entries.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
+        # dict.fromkeys() de-duplicates while keeping the newest-first order; set() scrambled it
+        links = list(dict.fromkeys(a.find('loc').text for a in entries))
+        if excludes:
+            links = [link for link in links if link and not any(e in link for e in excludes)]
+        return links
+
+
+class Category(_Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
        self.category_url = url
        self.base_url = '/'.join(url.split('/')[:3])

-    def feed(self):
+    def feed(self, excludes=None):
        markup = xml(lambda x: self.category_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='html.parser')
@@ -139,42 +200,30 @@ class Category:
        links = [link.get('href') for link in links]
        links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
        links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
+        links = list(filter(None, [link if link != self.category_url else None for link in links]))
+        links = list(set(links))
+        if excludes:
+            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
        return links

-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        return s

 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
    print("Sitemap: Stuff")
-    site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
+    site = Sitemap("https://www.stuff.co.nz/sitemap/news/sitemap.xml")
    posts = site.feed()
-    print(posts[:1])
-    print(site.story(posts[0]))
-
-    print("Sitemap: NZ Herald")
-    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
-    posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
    print(site.story(posts[0]))

    print("Category: RadioNZ Te Ao Māori")
    site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
    posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
    print(site.story(posts[0]))
+
+    print("Sitemap: Newsroom")
+    site = Sitemap("https://www.newsroom.co.nz/sitemap.xml")
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
+
@@ -12,6 +12,7 @@ from datetime import datetime

 from utils import clean

+SUBSTACK_REFERER = 'https://substack.com'
 SUBSTACK_API_TOP_POSTS = lambda x: "https://substack.com/api/v1/reader/top-posts"

 def author_link(author_id, base_url):
@@ -24,9 +25,10 @@ def api_stories(x, base_url):
 def unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())

-def api(route, ref=None):
+def api(route, ref=None, referer=None):
+    headers = {'Referer': referer} if referer else None
    try:
-        r = requests.get(route(ref), timeout=5)
+        r = requests.get(route(ref), headers=headers, timeout=10)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
@@ -36,7 +38,7 @@ def api(route, ref=None):
        logging.error('Problem hitting Substack API: {}, trying again'.format(str(e)))

    try:
-        r = requests.get(route(ref), timeout=15)
+        r = requests.get(route(ref), headers=headers, timeout=20)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
@@ -65,12 +67,14 @@ class Publication:
        self.BASE_DOMAIN = domain

    def feed(self):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        if not stories: return []
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        return [str(i.get("id")) for i in stories or []]

    def story(self, ref):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
+        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        if not stories: return False
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))

@@ -90,7 +94,7 @@ class Publication:
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'))
+        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN)
        s['comments'] = [comment(i) for i in comments.get('comments')]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)
@@ -113,12 +117,14 @@ class Publication:

 class Top:
    def feed(self):
-        stories = api(SUBSTACK_API_TOP_POSTS)
+        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        if not stories: return []
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        return [str(i.get("id")) for i in stories or []]

    def story(self, ref):
-        stories = api(SUBSTACK_API_TOP_POSTS)
+        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        if not stories: return False
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))

@@ -140,7 +146,7 @@ class Top:
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, base_url), r.get('id'))
+        comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER)
        s['comments'] = [comment(i) for i in comments.get('comments')]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)
@@ -156,5 +162,4 @@ if __name__ == '__main__':

    webworm = Publication("https://www.webworm.co/")
    posts = webworm.feed()
-    print(posts[:1])
    print(webworm.story(posts[0]))
@@ -18,6 +18,7 @@ packaging==20.4
 praw==6.4.0
 prawcore==1.4.0
 pyparsing==2.4.7
+pytz==2020.4
 requests==2.24.0
 six==1.15.0
 soupsieve==2.0.1
@@ -29,3 +30,4 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 zope.event==4.4
 zope.interface==5.1.0
+python-dateutil==2.8.1
@@ -0,0 +1,41 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+DECLUTTER_API = 'https://declutter.1j.nz/details'
+DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/comments'
+TIMEOUT = 30
+
+
+def get_html(url):
+    """Return the 'content' field of the declutter details for `url`, or '' if none came back."""
+    logging.info(f"Declutter Scraper: {url}")
+    details = get_details(url)
+    # get_details() yields None when the request fails
+    return details['content'] if details else ''
+
+def get_details(url):
+    """POST `url` to DECLUTTER_API and return the decoded JSON reply, or None on any failure."""
+    try:
+        resp = requests.post(DECLUTTER_API, data={'url': url}, timeout=TIMEOUT)
+        if resp.status_code == 200: return resp.json()
+        raise Exception('Bad response code ' + str(resp.status_code))
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem decluttering article: {}'.format(str(e)))
+    return None
+
+def get_comments(url):
+    """POST `url` to DECLUTTER_COMMENT_API and return the decoded JSON reply, or None on any failure."""
+    try:
+        resp = requests.post(DECLUTTER_COMMENT_API, data={'url': url}, timeout=TIMEOUT)
+        if resp.status_code == 200: return resp.json()
+        raise Exception('Bad response code ' + str(resp.status_code))
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting comments for article: {}'.format(str(e)))
+    return None
@@ -0,0 +1,27 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+READ_API = 'http://127.0.0.1:33843/details'
+TIMEOUT = 20
+
+def get_html(url):
+    """Return the 'content' field of the local reader's details for `url`, or '' if none came back."""
+    logging.info(f"Local Scraper: {url}")
+    details = get_details(url)
+    # get_details() yields None when the request fails
+    return details['content'] if details else ''
+
+def get_details(url):
+    """POST `url` to READ_API and return the decoded JSON reply, or None on any failure."""
+    try:
+        resp = requests.post(READ_API, data={'url': url}, timeout=TIMEOUT)
+        if resp.status_code == 200: return resp.json()
+        raise Exception('Bad response code ' + str(resp.status_code))
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting article: {}'.format(str(e)))
+    return None
@@ -0,0 +1,37 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+OUTLINE_REFERER = 'https://outline.com/'
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+TIMEOUT = 20
+
+def get_html(url):
+    """Return the 'html' field of Outline's result for `url`, or '' if nothing came back."""
+    details = get_details(url)
+    # a falsy result means get_details() gave up (it returns None on rate limit or error)
+    return details['html'] if details else ''
+
+def get_details(url):
+    """Ask Outline to parse `url`; return the reply's 'data' object, or None on rate limit or any error."""
+    import time  # function-scope: this module never imported `time`, so the 429 branch raised NameError
+    try:
+        logging.info(f"Outline Scraper: {url}")
+        r = requests.get(OUTLINE_API, params={'source_url': url}, headers={'Referer': OUTLINE_REFERER}, timeout=TIMEOUT)
+        if r.status_code == 429:
+            logging.info('Rate limited by outline, sleeping 30s and skipping...')
+            time.sleep(30)
+            return None
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        data = r.json()['data']
+        if 'URL is not supported by Outline' in data['html']:
+            raise Exception('URL not supported by Outline')
+        return data
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem outlining article: {}'.format(str(e)))
+        return None
@@ -39,10 +39,7 @@ def update_attributes():
        r = requests.post(MEILI_URL + 'indexes/qotnews/settings/searchable-attributes', json=json, timeout=2)
        if r.status_code != 202:
            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-        r = requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
-        if r.status_code != 202:
-            raise Exception('Bad response code ' + str(r.status_code))
+        requests.delete(MEILI_URL + 'indexes/qotnews/settings/displayed-attributes', timeout=2)
        return r.json()
    except KeyboardInterrupt:
        raise
@@ -43,8 +43,7 @@ cors = CORS(flask_app)
@flask_app.route('/api')
 def api():
    stories = database.get_stories(FEED_LENGTH)
-    # hacky nested json
-    res = Response('{"stories":[' + ','.join(stories) + ']}')
+    res = Response(json.dumps({"stories": stories}))
    res.headers['content-type'] = 'application/json'
    return res

@@ -102,8 +101,7 @@ def submit():
 def story(sid):
    story = database.get_story(sid)
    if story:
-        # hacky nested json
-        res = Response('{"story":' + story.full_json + '}')
+        res = Response(json.dumps({"story": story.data}))
        res.headers['content-type'] = 'application/json'
        return res
    else:
@@ -127,7 +125,7 @@ def static_story(sid):

    story = database.get_story(sid)
    if not story: return abort(404)
-    story = json.loads(story.full_json)
+    story = story.data

    score = story['score']
    num_comments = story['num_comments']
@@ -170,8 +168,7 @@ def feed_thread():
                item = ref_list[news_index]

                try:
-                    story_json = database.get_story(item['sid']).full_json
-                    story = json.loads(story_json)
+                    story = database.get_story(item['sid']).data
                except AttributeError:
                    story = dict(id=item['sid'], ref=item['ref'], source=item['source'])

@@ -9,19 +9,18 @@ NUM_REDDIT = 10
 NUM_TILDES = 5
 NUM_SUBSTACK = 10

-# SITEMAP = {
-#     'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
-#     'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
-# }
+SITEMAP = {}
+# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10}
+# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10}

-# SUBSTACK = {
-#     'webworm': { 'url': "https://www.webworm.co", 'count': 10},
-#     'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
-# }
+SUBSTACK = {}
+# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10}
+# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10}

-# CATEGORY = {
-#     'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
-# }
+CATEGORY = {}
+# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10}
+
+SCRAPERS = ['declutter', 'outline', 'local']

 # Reddit account info
 # leave blank if not using Reddit
@@ -1,52 +1,14 @@
+const port = 33843;
 const express = require('express');
 const app = express();
-const port = 33843;
-
-const request = require('request');
-const JSDOM = require('jsdom').JSDOM;
-const { Readability } = require('readability');
+const simple = require('./simple');

 app.use(express.urlencoded({ extended: true }));
-
-app.get('/', (req, res) => {
-	res.send('<form method="POST" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>');
-});
-
-const requestCallback = (url, res) => (error, response, body) => {
-	if (!error && response.statusCode == 200) {
-		console.log('Response OK.');
-
-		const doc = new JSDOM(body, {url: url});
-		const reader = new Readability(doc.window.document);
-		const article = reader.parse();
-
-		if (article && article.content) {
-			res.send(article.content);
-		} else {
-			res.sendStatus(404);
-		}
-	} else {
-		console.log('Response error:', error ? error.toString() : response.statusCode);
-		res.sendStatus(response ? response.statusCode : 404);
-	}
-};
-
-app.post('/', (req, res) => {
-	const url = req.body.url;
-	const requestOptions = {
-		url: url,
-		//headers: {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'},
-		//headers: {'User-Agent': 'Twitterbot/1.0'},
-		headers: {
-			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
-			'X-Forwarded-For': '66.249.66.1',
-		},
-	};
-
-	console.log('Parse request for:', url);
-
-	request(requestOptions, requestCallback(url, res));
-});
+app.get('/', (req, res) => res.send(simple.FORM));
+app.post('/', (req, res) => simple.scrape(req, res));
+app.post('/details', (req, res) => simple.details(req, res));
+// app.post('/browser', (req, res) => browser.scrape(req, res));
+// app.post('/browser/details', (req, res) => browser.details(req, res));

 app.listen(port, () => {
 	console.log(`Example app listening on port ${port}!`);
@@ -0,0 +1,43 @@
+const request = require('request');
+const JSDOM = require('jsdom').JSDOM;
+const { Readability } = require('readability');
+
+const options = url => ({ // options object passed to request(): the target url plus fixed Googlebot-style headers
+	url: url,
+	headers: {
+		'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+		'X-Forwarded-For': '66.249.66.1',
+	},
+});
+
+// Build a DOM from `body` (located at `url`) and return Readability's parse() result for it.
+const extract = (url, body) => {
+	const { document } = new JSDOM(body, { url }).window;
+	return new Readability(document).parse();
+};
+
+
+module.exports.FORM = '<form method="POST" action="/" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>';
+// Fetch req.body.url and reply with the extracted article HTML, else with an error status.
+module.exports.scrape = (req, res) => request(options(req.body.url), (error, response, body) => {
+	if (error || response.statusCode != 200) {
+		console.log('Response error:', error ? error.toString() : response.statusCode);
+		return res.sendStatus(response ? response.statusCode : 404);
+	}
+	// `url` was never declared in this scope (ReferenceError); the page address is req.body.url.
+	const article = extract(req.body.url, body);
+	if (article && article.content) return res.send(article.content);
+	return res.sendStatus(404);
+});
+
+// Fetch req.body.url and reply with the whole Readability article object, else with an error status.
+module.exports.details = (req, res) => request(options(req.body.url), (error, response, body) => {
+	if (error || response.statusCode != 200) {
+		console.log('Response error:', error ? error.toString() : response.statusCode);
+		return res.sendStatus(response ? response.statusCode : 404);
+	}
+	// `url` was never declared in this scope (ReferenceError); the page address is req.body.url.
+	const article = extract(req.body.url, body);
+	if (article) return res.send(article);
+	return res.sendStatus(404);
+});
@@ -72,7 +72,7 @@ class Article extends React.Component {
 	}

 	displayComment(story, c, level) {
-		const cid = c.author+c.date;
+		const cid = c.author + c.date;

 		const collapsed = this.state.collapsed.includes(cid);
 		const expanded = this.state.expanded.includes(cid);
@@ -85,18 +85,21 @@ class Article extends React.Component {
 				<div className='info'>
 					<p>
 						{c.author === story.author ? '[OP]' : ''} {c.author || '[Deleted]'}
-						{' '} | <HashLink to={'#'+cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>
+						{' '} | <HashLink to={'#' + cid} id={cid}>{moment.unix(c.date).fromNow()}</HashLink>

-						{hidden || hasChildren &&
+						{hasChildren && (
+							hidden ?
+								<span className='collapser expander pointer' onClick={() => this.expandComment(cid)}>+</span>
+								:
 								<span className='collapser pointer' onClick={() => this.collapseComment(cid)}>–</span>
-						}
+						)}
 					</p>
 				</div>

 				<div className={collapsed ? 'text hidden' : 'text'} dangerouslySetInnerHTML={{ __html: c.text }} />

 				{hidden && hasChildren ?
-					<div className='comment lined info pointer' onClick={() => this.expandComment(cid)}>[show {this.countComments(c)-1} more]</div>
+					<div className='comment lined info pointer' onClick={() => this.expandComment(cid)}>[show {this.countComments(c) - 1} more]</div>
 					:
 					c.comments.map(i => this.displayComment(story, i, level + 1))
 				}
@@ -50,10 +50,6 @@ class Feed extends React.Component {
 		const stories = this.state.stories;
 		const error = this.state.error;

-		if (stories) {
-			stories.sort((a, b) => b.date - a.date);
-		}
-
 		return (
 			<div className='container'>
 				<Helmet>
@@ -62,15 +58,15 @@ class Feed extends React.Component {
 				{error && <p>Connection error?</p>}
 				{stories ?
 					<div>
-						{stories.map((x, i) =>
-							<div className='item' key={i}>
+						{stories.map(x =>
+							<div className='item' key={x.id}>
 								<div className='title'>
 									<Link className='link' to={'/' + x.id}>
-										<img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
+										<img className='source-logo' src={logos[x.source] || logos[x.source.split(' ')[0]]} alt='source logo' /> {x.title}
 									</Link>

 									<span className='source'>
-										&#8203;({sourceLink(x)})
+										({sourceLink(x)})
 									</span>
 								</div>

@@ -64,15 +64,15 @@ class Results extends React.Component {
 						<p>Search results:</p>
 						<div className='comment lined'>
 							{stories.length ?
-								stories.map((x, i) =>
-									<div className='item' key={i}>
+								stories.map(x =>
+									<div className='item' key={x.id}>
 										<div className='title'>
 											<Link className='link' to={'/' + x.id}>
 												<img className='source-logo' src={logos[x.source]} alt='source logo' /> {x.title}
 											</Link>

 											<span className='source'>
-												&#8203;({sourceLink(x)})
+												({sourceLink(x)})
 											</span>
 										</div>
Author	SHA1	Message	Date
Jason Schwarzenberger	bfa4108a8e	Merge remote-tracking branch 'tanner/master'	2020-11-09 16:08:28 +13:00
Jason Schwarzenberger	0bd0d40a31	use json type in sqlite.	2020-11-09 15:45:10 +13:00
Jason Schwarzenberger	4e04595415	fix search.	2020-11-09 15:44:44 +13:00
Jason	006db2960c	change to 3 days	2020-11-09 01:36:51 +00:00
Jason Schwarzenberger	1f063f0dac	undo log level change	2020-11-06 11:20:34 +13:00
Jason Schwarzenberger	1658346aa9	fix news.py feed.	2020-11-06 10:37:43 +13:00
Jason Schwarzenberger	2dbc702b40	switch to python-dateutil for parser, reverse sort xml feeds.	2020-11-06 10:02:39 +13:00
Jason Schwarzenberger	1c4764e67d	sort sitemap feed by lastmod time.	2020-11-06 09:30:15 +13:00
Jason	ee49d2021e	newsroom	2020-11-05 20:28:55 +00:00
Jason	c391c50ab1	use localize	2020-11-05 04:15:31 +00:00
Jason Schwarzenberger	095f0d549a	use replace.	2020-11-05 16:57:08 +13:00
Jason Schwarzenberger	c21c71667e	fix date issue.	2020-11-05 16:41:15 +13:00
Jason Schwarzenberger	c3a2c91a11	update requirements.txt	2020-11-05 16:33:50 +13:00
Jason Schwarzenberger	0f39446a61	tz aware for use in settings.	2020-11-05 16:30:55 +13:00
Jason Schwarzenberger	351059aab1	fix excludes.	2020-11-05 15:59:13 +13:00
Jason Schwarzenberger	4488e2c292	add an `excludes` list of substrings for urls in the settings for sitemap/category.	2020-11-05 15:51:59 +13:00
Jason Schwarzenberger	afda5b635c	disqus test.	2020-11-05 14:23:51 +13:00
Jason Schwarzenberger	0fc1a44d2b	fix issue in substack.	2020-11-04 17:40:29 +13:00
Jason Schwarzenberger	9fff1b9e46	avoid duplicate articles listed on the category page	2020-11-04 17:14:42 +13:00
Jason Schwarzenberger	16b59f6c67	try stop bad pages.	2020-11-04 16:34:31 +13:00
Jason Schwarzenberger	939f4775a7	better settings example.	2020-11-04 15:52:34 +13:00
Jason Schwarzenberger	9bfc6fc6fa	scraper settings, ordering and loop.	2020-11-04 15:47:12 +13:00
Jason Schwarzenberger	6ea9844d00	remove useless try blocks.	2020-11-04 15:37:19 +13:00
Jason Schwarzenberger	1318259d3d	imply referrer is substack.	2020-11-04 15:21:07 +13:00
Jason Schwarzenberger	98a0c2257c	increase declutter timeout.	2020-11-04 15:15:00 +13:00
Jason Schwarzenberger	e6976db25d	fix tabs	2020-11-04 15:04:20 +13:00
Jason Schwarzenberger	9edc8b7cca	move scraping for article content to files.	2020-11-04 15:00:58 +13:00
Jason Schwarzenberger	33e21e7f30	fix mistake.	2020-11-04 12:45:01 +13:00
Jason Schwarzenberger	892a99eca6	add + expander in place of collapser.	2020-11-04 12:43:15 +13:00
Jason Schwarzenberger	d718d05a04	fix dates for newsroom.	2020-11-04 11:53:16 +13:00
Jason Schwarzenberger	d1795eb1b8	add radionz and newsroom logos.	2020-11-04 11:30:56 +13:00
tanner	9a279d44b1	Add header to get content type	2020-11-03 20:27:43 +00:00
tanner	e506804666	Clean code up	2020-11-03 03:45:56 +00:00