forked from tanner/qotnews

Compare commits

5 Commits

b80c1a5cb5 ... 5668fa5dbc
| Author | SHA1 | Date |
|---|---|---|
|  | 5668fa5dbc |  |
|  | b771b52501 |  |
|  | f5c7a658ba |  |
|  | f5ccd844da |  |
|  | 6a91b9402f |  |
@@ -24,6 +24,7 @@ class Reflist(Base):
     rid = Column(Integer, primary_key=True)
     ref = Column(String(16), unique=True)
+    urlref = Column(String)
     sid = Column(String, ForeignKey('stories.sid'), unique=True)
     source = Column(String(16))
 
@@ -75,7 +76,7 @@ def get_stories_by_url(url):
 def get_reflist():
     session = Session()
     q = session.query(Reflist).order_by(Reflist.rid.desc())
-    return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
+    return [dict(ref=x.ref, sid=x.sid, source=x.source, urlref=x.urlref) for x in q.all()]
 
 def get_stories(maxage=60*60*24*2):
     time = datetime.now().timestamp() - maxage
@@ -87,10 +88,10 @@ def get_stories(maxage=60*60*24*2):
             order_by(Story.meta['date'].desc())
     return [x[1] for x in q]
 
-def put_ref(ref, sid, source):
+def put_ref(ref, sid, source, urlref):
     try:
         session = Session()
-        r = Reflist(ref=ref, sid=sid, source=source)
+        r = Reflist(ref=ref, sid=sid, source=source, urlref=urlref)
         session.add(r)
         session.commit()
     except:
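A minimal sketch of how the new urlref column flows through the database layer, assuming the module's existing SQLAlchemy setup (Session, Base); the ref/sid/source values are made up for illustration:

```python
# Hypothetical usage of the extended API (values are illustrative only).
# put_ref() now persists the concrete article URL alongside the feed ref,
# and get_reflist() returns it so callers don't have to re-derive it.
import database  # assumes this module is importable as `database`, as server.py does

database.put_ref(
    ref='rnz.co.nz:443307',        # stable id derived from the link
    sid='abc123',                  # story id generated by the server
    source='radionz',              # which configured feed produced it
    urlref='https://www.rnz.co.nz/news/national/443307/example-story',
)

for item in database.get_reflist():
    # each item is now {'ref': ..., 'sid': ..., 'source': ..., 'urlref': ...}
    print(item['source'], item['urlref'])
```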
@@ -9,7 +9,9 @@ from bs4 import BeautifulSoup
 import itertools
 
 import settings
-from feeds import hackernews, reddit, tildes, substack, manual, news
+from feeds import hackernews, reddit, tildes, substack, manual
+from feeds.sitemap import Sitemap
+from feeds.category import Category
 from scrapers import outline, declutter, browser, local
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -19,40 +21,40 @@ for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'], value.get('tz'))
+    categories[key] = Category(value)
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
+    sitemaps[key] = Sitemap(value)
 
 def get_list():
     feeds = {}
 
     if settings.NUM_HACKERNEWS:
-        feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
+        feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
 
     if settings.NUM_REDDIT:
-        feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
+        feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]
 
     if settings.NUM_TILDES:
-        feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
+        feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]
 
     if settings.NUM_SUBSTACK:
-        feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
+        feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
 
     for key, publication in substacks.items():
         count = settings.SUBSTACK[key]['count']
-        feeds[key] = [(x, key) for x in publication.feed()[:count]]
+        feeds[key] = [(x, key, x) for x in publication.feed()[:count]]
 
     for key, sites in categories.items():
         count = settings.CATEGORY[key].get('count') or 0
         excludes = settings.CATEGORY[key].get('excludes')
         tz = settings.CATEGORY[key].get('tz')
-        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
 
     for key, sites in sitemaps.items():
         count = settings.SITEMAP[key].get('count') or 0
         excludes = settings.SITEMAP[key].get('excludes')
-        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
 
     values = feeds.values()
     feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
@@ -99,7 +101,7 @@ def get_content_type(url):
     except:
         return ''
 
-def update_story(story, is_manual=False):
+def update_story(story, is_manual=False, urlref=None):
     res = {}
 
     if story['source'] == 'hackernews':
@@ -111,9 +113,9 @@ def update_story(story, is_manual=False):
     elif story['source'] == 'substack':
         res = substack.top.story(story['ref'])
     elif story['source'] in categories.keys():
-        res = categories[story['source']].story(story['ref'])
+        res = categories[story['source']].story(story['ref'], urlref)
     elif story['source'] in sitemaps.keys():
-        res = sitemaps[story['source']].story(story['ref'])
+        res = sitemaps[story['source']].story(story['ref'], urlref)
     elif story['source'] in substacks.keys():
         res = substacks[story['source']].story(story['ref'])
     elif story['source'] == 'manual':
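A sketch of the tuple shape get_list() now yields, with illustrative values: sources that have no separate article URL (Hacker News, Reddit, Tildes, Substack) simply repeat the ref as the urlref, while Category and Sitemap feeds return (id, url) pairs that are unpacked into distinct ref and urlref fields:

```python
# Illustrative only: shapes of the entries produced by feed.get_list().
hn_entry = ('27158023', 'hackernews', '27158023')   # ref == urlref for API-backed sources
site_entry = ('rnz.co.nz:443307', 'radionz',        # ref derived from the link by Base.get_id()
              'https://www.rnz.co.nz/news/national/443307/example-story')

for ref, source, urlref in [hn_entry, site_entry]:
    print(source, ref, urlref)
```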
							
								
								
									
72	apiserver/feeds/category.py	Normal file
@@ -0,0 +1,72 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from bs4 import BeautifulSoup
+
+import settings
+from utils import clean
+from misc.api import xml
+from misc.news import Base
+
+def _filter_links(links, category_url, excludes=None):
+    links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
+    links = list(filter(None, [link if link != category_url else None for link in links]))
+    links = list(set(links))
+    if excludes:
+        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+    return links
+
+def _get_category(category_url, excludes=None):
+    base_url = '/'.join(category_url.split('/')[:3])
+    markup = xml(lambda x: category_url)
+    if not markup: return []
+    soup = BeautifulSoup(markup, features='html.parser')
+    links = soup.find_all('a', href=True)
+    links = [link.get('href') for link in links]
+    links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
+    links = _filter_links(links, category_url, excludes)
+    return links
+
+class Category(Base):
+    def __init__(self, config):
+        self.config = config
+        self.category_url = config.get('url')
+        self.tz = config.get('tz')
+
+    def feed(self, excludes=None):
+        links = []
+        if isinstance(self.category_url, str):
+            links += _get_category(self.category_url, excludes)
+        elif isinstance(self.category_url, list):
+            for url in self.category_url:
+                links += _get_category(url, excludes)
+        links = list(set(links))
+        return [(self.get_id(link), link) for link in links]
+
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Category: RadioNZ")
+    site = Category("https://www.rnz.co.nz/news/")
+    excludes = [
+        'rnz.co.nz/news/sport',
+        'rnz.co.nz/weather',
+        'rnz.co.nz/news/weather',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+    print("Category: Newsroom")
+    site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
+
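Note the new constructor takes a config dict, while the scratchpad at the end of the file still uses the old string-URL signature. A sketch of how feed.py presumably drives the class under the new config shape (values illustrative, pattern borrowed from the commented settings example further down):

```python
# Hypothetical config mirroring the shape feed.py passes in from settings.CATEGORY.
config = {
    'url': "https://www.rnz.co.nz/news/",
    'count': 20,
    'tz': 'Pacific/Auckland',
    'patterns': [r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'],
    'excludes': ['rnz.co.nz/news/sport', 'rnz.co.nz/weather'],
}
site = Category(config)
posts = site.feed(config.get('excludes'))   # -> [(ref, urlref), ...]
ref, urlref = posts[0]
story = site.story(ref, urlref)             # Base.story() now needs the concrete URL
```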
@@ -1,307 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-if __name__ == '__main__':
-    import sys
-    sys.path.insert(0,'.')
-
-import requests
-from datetime import datetime
-from bs4 import BeautifulSoup
-from scrapers import declutter
-import dateutil.parser
-import extruct
-import pytz
-
-from utils import clean
-import settings
-
-tzinfos = {
-    'NZDT': pytz.timezone('Pacific/Auckland'),
-    'NZST': pytz.timezone('Pacific/Auckland')
-}
-
-USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
-#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-
-def unix(date_str, tz=None):
-    try:
-        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
-        if tz:
-            dt = pytz.timezone(tz).localize(dt)
-        return int(dt.timestamp())
-    except:
-        pass
-    return 0
-
-
-def xml(route, ref=None):
-    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
-        r = requests.get(route(ref), headers=headers, timeout=5)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem hitting URL: {}'.format(str(e)))
-        return False
-
-
-def parse_extruct(s, data):
-    rdfa_keys = {
-        'title': [
-            'http://ogp.me/ns#title',
-            'https://ogp.me/ns#title',
-        ],
-        'date': [
-            'http://ogp.me/ns/article#modified_time',
-            'https://ogp.me/ns/article#modified_time',
-            'http://ogp.me/ns/article#published_time',
-            'https://ogp.me/ns/article#published_time',
-        ]
-    }
-    for rdfa in data['rdfa']:
-        for key, props in rdfa.items():
-            for attribute, properties in rdfa_keys.items():
-                for prop in properties:
-                    if prop in props:
-                        for values in props[prop]:
-                            s[attribute] = values['@value']
-
-    for og in data['opengraph']:
-        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
-        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
-        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
-        if len(modified):
-            s['date'] = modified[0]
-        if len(published):
-            s['date'] = published[0]
-        if len(titles):
-            s['title'] = titles[0]
-
-    for md in data['microdata']:
-        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
-            props = md['properties']
-            s['title'] = props['headline']
-            if props['dateModified']:
-                s['date'] = props['dateModified']
-            if props['datePublished']:
-                s['date'] = props['datePublished']
-            if 'author' in props and props['author']:
-                if 'properties' in props['author']:
-                    s['author'] = props['author']['properties']['name']
-                elif isinstance(props['author'], list):
-                    s['author'] = props['author'][0]['properties']['name']
-
-    for ld in data['json-ld']:
-        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
-            s['title'] = ld['headline']
-            if ld['dateModified']:
-                s['date'] = ld['dateModified']
-            if ld['datePublished']:
-                s['date'] = ld['datePublished']
-            if 'author' in ld and ld['author']:
-                if 'name' in ld['author']:
-                    s['author'] = ld['author']['name']
-                elif isinstance(ld['author'], list):
-                    s['author'] = ld['author'][0]['name']
-        if '@graph' in ld:
-            for gld in ld['@graph']:
-                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
-                    s['title'] = gld['headline']
-                    if gld['dateModified']:
-                        s['date'] = gld['dateModified']
-                    if gld['datePublished']:
-                        s['date'] = gld['datePublished']
-
-    return s
-
-def comment(i):
-    if 'author' not in i:
-        return False
-
-    c = {}
-    c['author'] = i.get('author', '')
-    c['score'] = i.get('points', 0)
-    c['date'] = unix(i.get('date', 0))
-    c['text'] = clean(i.get('text', '') or '')
-    c['comments'] = [comment(j) for j in i['children']]
-    c['comments'] = list(filter(bool, c['comments']))
-    return c
-
-def comment_count(i):
-    alive = 1 if i['author'] else 0
-    return sum([comment_count(c) for c in i['comments']]) + alive
-
-class _Base:
-    def __init__(url, tz=None):
-        self.url = url
-        self.tz = tz
-
-    def feed(self, excludes=None):
-        return []
-
-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        soup = BeautifulSoup(markup, features='html.parser')
-        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
-        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
-        favicon = soup.find_all('link', rel="shortcut icon", href=True)
-        others = soup.find_all('link', rel="icon", href=True)
-        icons = icon32 + icon16 + favicon + others
-        base_url = '/'.join(ref.split('/')[:3])
-        icons = list(set([i.get('href') for i in icons]))
-        icons = [i if i.startswith('http') else base_url + i for i in icons]
-
-        if icons:
-            s['icon'] = icons[0]
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        if s['date']:
-            s['date'] = unix(s['date'], tz=self.tz)
-
-        if 'disqus' in markup:
-            try:
-                s['comments'] = declutter.get_comments(ref)
-                c['comments'] = list(filter(bool, c['comments']))
-                s['num_comments'] = comment_count(s['comments'])
-            except KeyboardInterrupt:
-                raise
-            except:
-                pass
-
-        if not s['date']:
-            return False
-        return s
-
-def get_sitemap_date(a):
-    if a.find('lastmod'):
-        return a.find('lastmod').text
-    if a.find('news:publication_date'):
-        return a.find('news:publication_date').text
-    if a.find('ns2:publication_date'):
-        return a.find('ns2:publication_date').text
-    return ''
-
-class Sitemap(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.sitemap_url = url
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.sitemap_url, str):
-            links += self._get_sitemap(self.sitemap_url, excludes)
-        elif isinstance(self.sitemap_url, list):
-            for url in self.sitemap_url:
-                links += self._get_sitemap(url, excludes)
-        return list(set(links))
-
-    def _filter_links(self, links, excludes=None):
-        too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
-        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
-        links = list(filter(None, [a if unix(get_sitemap_date(a)) > too_old else None for a in links]))
-        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
-
-        links = [x.find('loc').text for x in links] or []
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-    def _get_sitemap(self, feed_url, excludes=None):
-        markup = xml(lambda x: feed_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        links = []
-        feed_urls = []
-        if soup.find('sitemapindex'):
-            sitemap = soup.find('sitemapindex').findAll('sitemap')
-            feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-        if soup.find('urlset'):
-            sitemap = soup.find('urlset').findAll('url')
-            links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-
-        feed_urls = self._filter_links(feed_urls, excludes)
-        links = self._filter_links(links, excludes)
-
-        for url in feed_urls:
-            links += self._get_sitemap(url, excludes)
-        return list(set(links))
-
-class Category(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.category_url = url
-
-    def _filter_links(self, links, category_url, excludes=None):
-        links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
-        links = list(filter(None, [link if link != category_url else None for link in links]))
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-    def _get_category(self, category_url, excludes=None):
-        base_url = '/'.join(category_url.split('/')[:3])
-        markup = xml(lambda x: category_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='html.parser')
-        links = soup.find_all('a', href=True)
-        links = [link.get('href') for link in links]
-        links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
-        links = self._filter_links(links, category_url, excludes)
-        return links
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.category_url, str):
-            links += self._get_category(self.category_url, excludes)
-        elif isinstance(self.category_url, list):
-            for url in self.category_url:
-                links += self._get_category(url, excludes)
-        return list(set(links))
-
-
-# scratchpad so I can quickly develop the parser
-if __name__ == '__main__':
-    print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
-    excludes = [
-        'thespinoff.co.nz/sitemap-misc.xml',
-        'thespinoff.co.nz/sitemap-authors.xml',
-        'thespinoff.co.nz/sitemap-tax-category.xml',
-    ]
-    posts = site.feed(excludes)
-    print(posts[:5])
-    print(site.story(posts[0]))
-
-    print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
-    posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
-    print(site.story(posts[:-1]))
-
							
								
								
									
99	apiserver/feeds/sitemap.py	Normal file
@@ -0,0 +1,99 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+import settings
+from utils import clean
+from misc.time import unix
+from misc.api import xml
+from misc.news import Base
+
+def _get_sitemap_date(a):
+    if a.find('lastmod'):
+        return a.find('lastmod').text
+    if a.find('news:publication_date'):
+        return a.find('news:publication_date').text
+    if a.find('ns2:publication_date'):
+        return a.find('ns2:publication_date').text
+    return ''
+
+def _filter_links(links, excludes=None):
+    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
+    links = list(filter(None, [a if _get_sitemap_date(a) else None for a in links]))
+    links = list(filter(None, [a if unix(_get_sitemap_date(a)) > too_old else None for a in links]))
+    links.sort(key=lambda a: unix(_get_sitemap_date(a)), reverse=True)
+
+    links = [x.find('loc').text for x in links] or []
+    links = list(set(links))
+    if excludes:
+        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+    return links
+
+def _get_sitemap(feed_url, excludes=None):
+    markup = xml(lambda x: feed_url)
+    if not markup: return []
+    soup = BeautifulSoup(markup, features='lxml')
+    links = []
+    feed_urls = []
+    if soup.find('sitemapindex'):
+        sitemap = soup.find('sitemapindex').findAll('sitemap')
+        feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+    if soup.find('urlset'):
+        sitemap = soup.find('urlset').findAll('url')
+        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+
+    feed_urls = _filter_links(feed_urls, excludes)
+    links = _filter_links(links, excludes)
+
+    for url in feed_urls:
+        links += _get_sitemap(url, excludes)
+    return list(set(links))
+
+class Sitemap(Base):
+    def __init__(self, config):
+        self.config = config
+        self.sitemap_url = config.get('url')
+        self.tz = config.get('tz')
+
+    def feed(self, excludes=None):
+        links = []
+        if isinstance(self.sitemap_url, str):
+            links += _get_sitemap(self.sitemap_url, excludes)
+        elif isinstance(self.sitemap_url, list):
+            for url in self.sitemap_url:
+                links += _get_sitemap(url, excludes)
+        links = list(set(links))
+        return [(self.get_id(link), link) for link in links]
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Sitemap: The Spinoff")
+    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
+    excludes = [
+        'thespinoff.co.nz/sitemap-misc.xml',
+        'thespinoff.co.nz/sitemap-authors.xml',
+        'thespinoff.co.nz/sitemap-tax-category.xml',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+    print("Sitemap: Newshub")
+    site = Sitemap([
+        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+    ])
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
+    print(site.story(posts[:-1]))
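For reference, a minimal sketch of the two sitemap shapes _get_sitemap() handles: a sitemap index whose child sitemaps are fetched recursively, and a urlset whose entries carry the dates _filter_links() checks. The XML below is made up purely for illustration:

```python
# Illustrative input only; _get_sitemap() would receive markup like this from xml().
SITEMAP_INDEX = """<sitemapindex>
  <sitemap><loc>https://example.co.nz/news.gnewssitemap.xml</loc><lastmod>2021-01-01</lastmod></sitemap>
</sitemapindex>"""   # each <loc> here is fetched recursively

URLSET = """<urlset>
  <url>
    <loc>https://example.co.nz/news/2021/01/some-story</loc>
    <news:publication_date>2021-01-01T10:00:00+13:00</news:publication_date>
  </url>
</urlset>"""         # each <url> becomes a candidate link, dropped if older than MAX_STORY_AGE
```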
							
								
								
									
35	apiserver/misc/api.py	Normal file
@@ -0,0 +1,35 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+FORWARD_IP = '66.249.66.1'
+
+def xml(route, ref=None):
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.text
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
+
+def json(route, ref=None):
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
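A minimal usage sketch for the two helpers: route is a callable that turns a ref into a URL, which is why callers elsewhere in this changeset pass lambda x: some_url when they already hold the full URL. The endpoints below are illustrative:

```python
from misc.api import xml, json

# When the caller already has the full URL, the ref argument is unused.
markup = xml(lambda x: 'https://www.rnz.co.nz/news/')   # -> HTML/XML text, or False on error

# When the URL is derived from a ref, the route callable builds it (illustrative endpoint).
api_route = lambda ref: f'https://hacker-news.firebaseio.com/v0/item/{ref}.json'
data = json(api_route, ref='27158023')                  # -> parsed JSON, or False on error
```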
							
								
								
									
69	apiserver/misc/metadata.py	Normal file
@@ -0,0 +1,69 @@
+
+def parse_extruct(s, data):
+    rdfa_keys = {
+        'title': [
+            'http://ogp.me/ns#title',
+            'https://ogp.me/ns#title',
+        ],
+        'date': [
+            'http://ogp.me/ns/article#modified_time',
+            'https://ogp.me/ns/article#modified_time',
+            'http://ogp.me/ns/article#published_time',
+            'https://ogp.me/ns/article#published_time',
+        ]
+    }
+    for rdfa in data['rdfa']:
+        for key, props in rdfa.items():
+            for attribute, properties in rdfa_keys.items():
+                for prop in properties:
+                    if prop in props:
+                        for values in props[prop]:
+                            s[attribute] = values['@value']
+
+    for og in data['opengraph']:
+        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
+        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
+        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
+        if len(modified):
+            s['date'] = modified[0]
+        if len(published):
+            s['date'] = published[0]
+        if len(titles):
+            s['title'] = titles[0]
+
+    for md in data['microdata']:
+        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
+            props = md['properties']
+            s['title'] = props['headline']
+            if props['dateModified']:
+                s['date'] = props['dateModified']
+            if props['datePublished']:
+                s['date'] = props['datePublished']
+            if 'author' in props and props['author']:
+                if 'properties' in props['author']:
+                    s['author'] = props['author']['properties']['name']
+                elif isinstance(props['author'], list):
+                    s['author'] = props['author'][0]['properties']['name']
+
+    for ld in data['json-ld']:
+        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
+            s['title'] = ld['headline']
+            if ld['dateModified']:
+                s['date'] = ld['dateModified']
+            if ld['datePublished']:
+                s['date'] = ld['datePublished']
+            if 'author' in ld and ld['author']:
+                if 'name' in ld['author']:
+                    s['author'] = ld['author']['name']
+                elif isinstance(ld['author'], list):
+                    s['author'] = ld['author'][0]['name']
+        if '@graph' in ld:
+            for gld in ld['@graph']:
+                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
+                    s['title'] = gld['headline']
+                    if gld['dateModified']:
+                        s['date'] = gld['dateModified']
+                    if gld['datePublished']:
+                        s['date'] = gld['datePublished']
+
+    return s
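A sketch of how parse_extruct() is fed, assuming extruct's default syntaxes (rdfa, opengraph, microdata, json-ld) are all extracted; the HTML is a made-up example and whether each extractor finds anything depends on the page's markup:

```python
import extruct
from misc.metadata import parse_extruct

html = """<html><head>
  <meta property="og:title" content="Example story" />
  <meta property="article:published_time" content="2021-01-01T10:00:00+13:00" />
</head><body></body></html>"""

s = {'title': '', 'date': 0}
data = extruct.extract(html)     # dict keyed by syntax: 'rdfa', 'opengraph', 'microdata', 'json-ld', ...
s = parse_extruct(s, data)
print(s.get('title'), s.get('date'))   # populated when the extractors find matching metadata
```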
							
								
								
									
101	apiserver/misc/news.py	Normal file
@@ -0,0 +1,101 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import re
+import requests
+from bs4 import BeautifulSoup
+from scrapers import declutter
+import extruct
+
+import settings
+from utils import clean
+from misc.metadata import parse_extruct
+from misc.time import unix
+from misc.api import xml
+
+def comment(i):
+    if 'author' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('author', '')
+    c['score'] = i.get('points', 0)
+    c['date'] = unix(i.get('date', 0))
+    c['text'] = clean(i.get('text', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    alive = 1 if i['author'] else 0
+    return sum([comment_count(c) for c in i['comments']]) + alive
+
+class Base:
+    def __init__(config):
+        self.config = config
+        self.url = config.get('url')
+        self.tz = config.get('tz')
+
+    def get_id(self, link):
+        patterns = self.config.get('patterns')
+        if not patterns:
+            return link
+        patterns = [re.compile(p) for p in patterns]
+        patterns = list(filter(None, [p.match(link) for p in patterns]))
+        patterns = list(set([':'.join(p.groups()) for p in patterns]))
+        if not patterns:
+            return link
+        return patterns[0]
+
+    def feed(self, excludes=None):
+        return []
+
+    def story(self, ref, urlref):
+        if urlref is None:
+            return False
+        markup = xml(lambda x: urlref)
+        if not markup:
+            return False
+
+        s = {}
+        s['author_link'] = ''
+        s['score'] = 0
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['link'] = urlref
+        s['url'] = urlref
+        s['date'] = 0
+
+        soup = BeautifulSoup(markup, features='html.parser')
+        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
+        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
+        favicon = soup.find_all('link', rel="shortcut icon", href=True)
+        others = soup.find_all('link', rel="icon", href=True)
+        icons = icon32 + icon16 + favicon + others
+        base_url = '/'.join(urlref.split('/')[:3])
+        icons = list(set([i.get('href') for i in icons]))
+        icons = [i if i.startswith('http') else base_url + i for i in icons]
+
+        if icons:
+            s['icon'] = icons[0]
+
+        data = extruct.extract(markup)
+        s = parse_extruct(s, data)
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            try:
+                s['comments'] = declutter.get_comments(urlref)
+                c['comments'] = list(filter(bool, c['comments']))
+                s['num_comments'] = comment_count(s['comments'])
+            except KeyboardInterrupt:
+                raise
+            except:
+                pass
+
+        if not s['date']:
+            return False
+        return s
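A sketch of what Base.get_id() does with the 'patterns' option: each regex's capture groups are joined with ':' to form a stable ref, so the ref survives slug changes in the URL. The pattern and URL below mirror the commented RNZ example in the settings diff further down and are illustrative only:

```python
import re

patterns = [r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?']
link = 'https://www.rnz.co.nz/news/national/443307/example-story-slug'

matches = list(filter(None, [re.compile(p).match(link) for p in patterns]))
ref = ':'.join(matches[0].groups()) if matches else link
print(ref)   # -> 'rnz.co.nz:443307' (falls back to the full link when nothing matches)
```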
							
								
								
									
18	apiserver/misc/time.py	Normal file
@@ -0,0 +1,18 @@
+import pytz
+import dateutil.parser
+
+
+TZINFOS = {
+    'NZDT': pytz.timezone('Pacific/Auckland'),
+    'NZST': pytz.timezone('Pacific/Auckland')
+}
+
+def unix(date_str, tz=None, tzinfos=TZINFOS):
+    try:
+        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    except:
+        pass
+    return 0
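A quick usage sketch for unix(): the tzinfos map resolves bare NZDT/NZST abbreviations, while the tz argument localizes naive timestamps such as those in sitemap lastmod entries. The values shown are illustrative:

```python
from misc.time import unix

unix('2021-01-01T10:00:00+13:00')                    # offset in the string, used as-is
unix('Fri, 01 Jan 2021 10:00:00 NZDT')               # abbreviation resolved via TZINFOS
unix('2021-01-01 10:00:00', tz='Pacific/Auckland')   # naive string localized explicitly
unix('not a date')                                   # -> 0 (unparseable input returns 0)
```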
@@ -145,12 +145,12 @@ def static_story(sid):
 http_server = WSGIServer(('', 33842), flask_app)
 
 def _add_new_refs():
-    for ref, source in feed.get_list():
+    for ref, source, urlref in feed.get_list():
         if database.get_story_by_ref(ref):
             continue
         try:
             nid = new_id()
-            database.put_ref(ref, nid, source)
+            database.put_ref(ref, nid, source, urlref)
             logging.info('Added ref ' + ref)
         except database.IntegrityError:
             continue
@@ -163,7 +163,7 @@ def _update_current_story(item):
 
     logging.info('Updating story: {}'.format(str(story['ref'])))
 
-    valid = feed.update_story(story)
+    valid = feed.update_story(story, urlref=item['urlref'])
     if valid:
         database.put_story(story)
         search.put_story(story)
@@ -13,15 +13,43 @@ NUM_TILDES = 5
 NUM_SUBSTACK = 10
 
 SITEMAP = {}
-# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
-# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
+# SITEMAP['nzherald'] = {
+#     'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
+#     'count': 20,
+#     'patterns': [
+#         r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$',
+#     ],
+#     'excludes': [
+#         'driven.co.nz',
+#         'oneroof.co.nz',
+#         'nzherald.co.nz/sponsored-stories',
+#         'nzherald.co.nz/entertainment/',
+#         'nzherald.co.nz/lifestyle/',
+#         'nzherald.co.nz/travel/',
+#         'nzherald.co.nz/sport/',
+#         'nzherald.co.nz/promotions/',
+#         'nzherald.co.nzhttp',
+#         'herald-afternoon-quiz',
+#         'herald-morning-quiz'
+#     ],
+# }
 
 SUBSTACK = {}
 # SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
 # SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},
 
 CATEGORY = {}
-# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
+# CATEGORY['radionz'] = {
+#     'url': "https://www.rnz.co.nz/news/",
+#     'count': 20,
+#     'patterns': [
+#         r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'
+#     ],
+#     'excludes': [
+#         'rnz.co.nz/news/sport',
+#         'rnz.co.nz/weather',
+#     ],
+# }
 
 SCRAPERS = ['browser', 'declutter', 'outline', 'local']
 
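To turn one of these commented examples on, you would presumably uncomment it into the live dict; a minimal sketch with illustrative values, in the same shape the new Sitemap/Category constructors read:

```python
# settings.py sketch (illustrative): 'count' caps how many refs get_list() takes,
# 'patterns' feeds Base.get_id(), and 'excludes' filters links before they become refs.
SITEMAP = {}
SITEMAP['nzherald'] = {
    'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
    'count': 20,
    'patterns': [r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$'],
    'excludes': ['driven.co.nz', 'oneroof.co.nz', 'nzherald.co.nz/sport/'],
}
```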
@@ -39,6 +39,9 @@
 	if (matchDomain(["tvnz.co.nz"])) {
 		removeSelectors([".signup-container container"]);
 	}
+	if (matchDomain(["thespinoff.co.nz"])) {
+		removeSelectors([".the-spinoff-club-interruptive", ".bulletin-signup"]);
+	}
 
 	function matchDomain(domains) {
 		const hostname = window.location.hostname;