From b771b52501657d182b9fa6f9d2bbc2ef92a12e92 Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger
Date: Tue, 17 Nov 2020 12:38:28 +1300
Subject: [PATCH] add regex to get a unique ref from each sitemap/category-based
 article url.

---
 apiserver/database.py         |  7 ++++---
 apiserver/feed.py             | 24 ++++++++++++------------
 apiserver/feeds/category.py   | 10 ++++++----
 apiserver/feeds/sitemap.py    | 10 ++++++----
 apiserver/misc/news.py        | 33 ++++++++++++++++++++++++---------
 apiserver/server.py           |  6 +++---
 apiserver/settings.py.example | 34 +++++++++++++++++++++++++++++++---
 7 files changed, 86 insertions(+), 38 deletions(-)

diff --git a/apiserver/database.py b/apiserver/database.py
index b835811..6ab5aaa 100644
--- a/apiserver/database.py
+++ b/apiserver/database.py
@@ -24,6 +24,7 @@ class Reflist(Base):
     rid = Column(Integer, primary_key=True)
     ref = Column(String(16), unique=True)
+    urlref = Column(String)
     sid = Column(String, ForeignKey('stories.sid'), unique=True)
     source = Column(String(16))
 
@@ -75,7 +76,7 @@ def get_stories_by_url(url):
 def get_reflist():
     session = Session()
     q = session.query(Reflist).order_by(Reflist.rid.desc())
-    return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
+    return [dict(ref=x.ref, sid=x.sid, source=x.source, urlref=x.urlref) for x in q.all()]
 
 def get_stories(maxage=60*60*24*2):
     time = datetime.now().timestamp() - maxage
@@ -87,10 +88,10 @@ def get_stories(maxage=60*60*24*2):
         order_by(Story.meta['date'].desc())
     return [x[1] for x in q]
 
-def put_ref(ref, sid, source):
+def put_ref(ref, sid, source, urlref):
     try:
         session = Session()
-        r = Reflist(ref=ref, sid=sid, source=source)
+        r = Reflist(ref=ref, sid=sid, source=source, urlref=urlref)
         session.add(r)
         session.commit()
     except:
diff --git a/apiserver/feed.py b/apiserver/feed.py
index 9314a9f..5a5b4c0 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -21,40 +21,40 @@ for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = Category(value['url'], value.get('tz'))
+    categories[key] = Category(value)
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = Sitemap(value['url'], value.get('tz'))
+    sitemaps[key] = Sitemap(value)
 
 def get_list():
     feeds = {}
 
     if settings.NUM_HACKERNEWS:
-        feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
+        feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
 
     if settings.NUM_REDDIT:
-        feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
+        feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]
 
     if settings.NUM_TILDES:
-        feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
+        feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]
 
     if settings.NUM_SUBSTACK:
-        feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
+        feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
 
     for key, publication in substacks.items():
         count = settings.SUBSTACK[key]['count']
-        feeds[key] = [(x, key) for x in publication.feed()[:count]]
+        feeds[key] = [(x, key, u) for x, u in publication.feed()[:count]]
 
     for key, sites in categories.items():
         count = settings.CATEGORY[key].get('count') or 0
         excludes = settings.CATEGORY[key].get('excludes')
         tz = settings.CATEGORY[key].get('tz')
-        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
 
     for key, sites in sitemaps.items():
         count = settings.SITEMAP[key].get('count') or 0
         excludes = settings.SITEMAP[key].get('excludes')
-        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
 
     values = feeds.values()
     feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
 
@@ -101,7 +101,7 @@ def get_content_type(url):
     except:
         return ''
 
-def update_story(story, is_manual=False):
+def update_story(story, is_manual=False, urlref=None):
     res = {}
 
     if story['source'] == 'hackernews':
@@ -113,9 +113,9 @@ def update_story(story, is_manual=False):
     elif story['source'] == 'substack':
         res = substack.top.story(story['ref'])
     elif story['source'] in categories.keys():
-        res = categories[story['source']].story(story['ref'])
+        res = categories[story['source']].story(story['ref'], urlref)
     elif story['source'] in sitemaps.keys():
-        res = sitemaps[story['source']].story(story['ref'])
+        res = sitemaps[story['source']].story(story['ref'], urlref)
     elif story['source'] in substacks.keys():
         res = substacks[story['source']].story(story['ref'])
     elif story['source'] == 'manual':
diff --git a/apiserver/feeds/category.py b/apiserver/feeds/category.py
index ec6aca5..23dd1f2 100644
--- a/apiserver/feeds/category.py
+++ b/apiserver/feeds/category.py
@@ -34,9 +34,10 @@ def _get_category(category_url, excludes=None):
     return links
 
 class Category(Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.category_url = url
+    def __init__(self, config):
+        self.config = config
+        self.category_url = config.get('url')
+        self.tz = config.get('tz')
 
     def feed(self, excludes=None):
         links = []
@@ -45,7 +46,8 @@ class Category(Base):
         elif isinstance(self.category_url, list):
             for url in self.category_url:
                 links += _get_category(url, excludes)
-        return list(set(links))
+        links = list(set(links))
+        return [(self.get_id(link), link) for link in links]
 
 
 # scratchpad so I can quickly develop the parser
diff --git a/apiserver/feeds/sitemap.py b/apiserver/feeds/sitemap.py
index f0f807c..4efe47d 100644
--- a/apiserver/feeds/sitemap.py
+++ b/apiserver/feeds/sitemap.py
@@ -58,9 +58,10 @@ def _get_sitemap(feed_url, excludes=None):
     return list(set(links))
 
 class Sitemap(Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.sitemap_url = url
+    def __init__(self, config):
+        self.config = config
+        self.sitemap_url = config.get('url')
+        self.tz = config.get('tz')
 
     def feed(self, excludes=None):
         links = []
@@ -69,7 +70,8 @@ class Sitemap(Base):
         elif isinstance(self.sitemap_url, list):
             for url in self.sitemap_url:
                 links += _get_sitemap(url, excludes)
-        return list(set(links))
+        links = list(set(links))
+        return [(self.get_id(link), link) for link in links]
 
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
diff --git a/apiserver/misc/news.py b/apiserver/misc/news.py
index 4d89ad8..8d32143 100644
--- a/apiserver/misc/news.py
+++ b/apiserver/misc/news.py
@@ -3,6 +3,7 @@ logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         level=logging.DEBUG)
 
+import re
 import requests
 from bs4 import BeautifulSoup
 from scrapers import declutter
@@ -32,15 +33,29 @@ def comment_count(i):
     return sum([comment_count(c) for c in i['comments']]) + alive
 
 class Base:
-    def __init__(url, tz=None):
-        self.url = url
-        self.tz = tz
+    def __init__(self, config):
+        self.config = config
+        self.url = config.get('url')
+        self.tz = config.get('tz')
+
+    def get_id(self, link):
+        patterns = self.config.get('patterns')
+        if not patterns:
+            return link
+        patterns = [re.compile(p) for p in patterns]
+        matches = list(filter(None, [p.match(link) for p in patterns]))
+        refs = list(set([':'.join(m.groups()) for m in matches]))
+        if not refs:
+            return link
+        return refs[0]
 
     def feed(self, excludes=None):
         return []
 
-    def story(self, ref):
-        markup = xml(lambda x: ref)
+    def story(self, ref, urlref):
+        if urlref is None:
+            return False
+        markup = xml(lambda x: urlref)
         if not markup:
             return False
 
@@ -49,8 +64,8 @@ class Base:
         s['score'] = 0
         s['comments'] = []
         s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
+        s['link'] = urlref
+        s['url'] = urlref
         s['date'] = 0
 
         soup = BeautifulSoup(markup, features='html.parser')
@@ -59,7 +74,7 @@ class Base:
         favicon = soup.find_all('link', rel="shortcut icon", href=True)
         others = soup.find_all('link', rel="icon", href=True)
         icons = icon32 + icon16 + favicon + others
-        base_url = '/'.join(ref.split('/')[:3])
+        base_url = '/'.join(urlref.split('/')[:3])
         icons = list(set([i.get('href') for i in icons]))
         icons = [i if i.startswith('http') else base_url + i for i in icons]
 
@@ -73,7 +88,7 @@
         if 'disqus' in markup:
             try:
-                s['comments'] = declutter.get_comments(ref)
+                s['comments'] = declutter.get_comments(urlref)
                 s['comments'] = list(filter(bool, s['comments']))
                 s['num_comments'] = comment_count(s['comments'])
             except KeyboardInterrupt:
diff --git a/apiserver/server.py b/apiserver/server.py
index 7d35ffb..ea09d62 100644
--- a/apiserver/server.py
+++ b/apiserver/server.py
@@ -145,12 +145,12 @@ def static_story(sid):
 http_server = WSGIServer(('', 33842), flask_app)
 
 def _add_new_refs():
-    for ref, source in feed.get_list():
+    for ref, source, urlref in feed.get_list():
         if database.get_story_by_ref(ref):
             continue
         try:
             nid = new_id()
-            database.put_ref(ref, nid, source)
+            database.put_ref(ref, nid, source, urlref)
             logging.info('Added ref ' + ref)
         except database.IntegrityError:
             continue
@@ -163,7 +163,7 @@ def _update_current_story(item):
 
     logging.info('Updating story: {}'.format(str(story['ref'])))
 
-    valid = feed.update_story(story)
+    valid = feed.update_story(story, urlref=item['urlref'])
     if valid:
         database.put_story(story)
         search.put_story(story)
diff --git a/apiserver/settings.py.example b/apiserver/settings.py.example
index ade6040..87d608d 100644
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@@ -13,15 +13,43 @@ NUM_TILDES = 5
 NUM_SUBSTACK = 10
 
 SITEMAP = {}
-# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
-# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
+# SITEMAP['nzherald'] = {
+#     'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
+#     'count': 20,
+#     'patterns': [
+#         r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$',
+#     ],
+#     'excludes': [
+#         'driven.co.nz',
+#         'oneroof.co.nz',
+#         'nzherald.co.nz/sponsored-stories',
+#         'nzherald.co.nz/entertainment/',
+#         'nzherald.co.nz/lifestyle/',
+#         'nzherald.co.nz/travel/',
+#         'nzherald.co.nz/sport/',
+#         'nzherald.co.nz/promotions/',
+#         'nzherald.co.nzhttp',
+#         'herald-afternoon-quiz',
+#         'herald-morning-quiz'
+#     ],
+# }
 
 SUBSTACK = {}
 # SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
 # SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},
 
 CATEGORY = {}
-# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
+# CATEGORY['radionz'] = {
+#     'url': "https://www.rnz.co.nz/news/",
+#     'count': 20,
+#     'patterns': [
+#         r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'
+#     ],
+#     'excludes': [
+#         'rnz.co.nz/news/sport',
+#         'rnz.co.nz/weather',
+#     ],
+# }
 
 SCRAPERS = ['browser', 'declutter', 'outline', 'local']
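
Note on the new Base.get_id: each configured pattern is matched against the
article url, and the capture groups of a matching pattern, joined with ':',
become the ref; urls that match no pattern fall back to using the whole url
as the ref. A minimal standalone sketch (the helper function and the article
url are hypothetical, reusing the commented-out 'radionz' pattern above):

    import re

    # pattern from the CATEGORY['radionz'] example in settings.py.example
    patterns = [r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?']

    def get_id(link, patterns):
        # mirrors Base.get_id: join the capture groups of any matching
        # pattern to form the ref, falling back to the full url
        compiled = [re.compile(p) for p in patterns]
        matches = list(filter(None, [p.match(link) for p in compiled]))
        refs = list(set([':'.join(m.groups()) for m in matches]))
        return refs[0] if refs else link

    print(get_id('https://www.rnz.co.nz/news/national/430843/some-headline', patterns))
    # -> 'rnz.co.nz:430843'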
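
The point of deriving the ref with a regex rather than using the url itself
is deduplication: Reflist.ref carries a unique constraint, so if the same
story surfaces under more than one section url (an assumption about these
sites, not something the patch itself states), the shared ref keeps put_ref
from inserting it twice. A self-contained sketch with hypothetical urls:

    import re

    pattern = re.compile(r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?')

    # two hypothetical urls for the same story (id 430843) in different sections
    links = [
        'https://www.rnz.co.nz/news/national/430843/some-headline',
        'https://www.rnz.co.nz/news/political/430843/some-headline',
    ]

    refs = {':'.join(m.groups()) for m in map(pattern.match, links) if m}
    print(refs)  # {'rnz.co.nz:430843'} -- one ref, stored once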