From 9bfc6fc6fa0502c9af68da3cd9cdf22835a066b8 Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger
Date: Wed, 4 Nov 2020 15:47:12 +1300
Subject: [PATCH] scraper settings, ordering and loop.

---
 apiserver/feed.py             | 39 +++++++++++++++++------------------
 apiserver/settings.py.example |  2 ++
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/apiserver/feed.py b/apiserver/feed.py
index 0f2a770..08bfe4f 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -54,27 +54,26 @@ def list():
     return feed
 
 def get_article(url):
-    try:
-        return declutter.get_html(url)
-    except KeyboardInterrupt:
-        raise
-    except:
-        pass
-
-    try:
-        return outline.get_html(url)
-    except KeyboardInterrupt:
-        raise
-    except:
-        pass
-
-    try:
-        return local.get_html(url)
-    except KeyboardInterrupt:
-        raise
-    except:
-        pass
+    scrapers = {
+        'declutter': declutter,
+        'outline': outline,
+        'local': local,
+    }
+    available = settings.SCRAPERS or ['local']
+    if 'local' not in available:
+        available += ['local']
+    for scraper in available:
+        if scraper not in scrapers.keys():
+            continue
+        try:
+            html = scrapers[scraper].get_html(url)
+            if html:
+                return html
+        except KeyboardInterrupt:
+            raise
+        except:
+            pass
 
     return ''
 
 def get_content_type(url):
diff --git a/apiserver/settings.py.example b/apiserver/settings.py.example
index ea6c42f..d119818 100644
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@@ -23,6 +23,8 @@ NUM_SUBSTACK = 10
 # 'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
 # }
 
+SCRAPERS = ['declutter', 'outline', 'local']
+
 # Reddit account info
 # leave blank if not using Reddit
 REDDIT_CLIENT_ID = ''
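
Note: the patched get_article resolves the scraper order from settings.SCRAPERS (appending 'local' as a final fallback) and moves on to the next scraper whenever one returns nothing or raises. A minimal standalone sketch of that fallback loop is below; the stub scraper objects are hypothetical stand-ins for the real declutter/outline/local modules and their behaviour here is invented purely for illustration.

    # Sketch only: stubs stand in for the real scraper modules used in apiserver/feed.py.
    class _StubScraper:
        def __init__(self, html):
            self._html = html

        def get_html(self, url):
            return self._html

    scrapers = {
        'declutter': _StubScraper(''),             # pretend declutter finds nothing
        'outline': _StubScraper('<p>outline</p>'), # pretend outline succeeds
        'local': _StubScraper('<p>local</p>'),
    }

    SCRAPERS = ['declutter', 'outline', 'local']   # as in settings.py.example

    def get_article(url):
        available = SCRAPERS or ['local']
        if 'local' not in available:
            available = available + ['local']      # always keep the local fallback
        for name in available:
            if name not in scrapers:
                continue
            try:
                html = scrapers[name].get_html(url)
                if html:
                    return html                    # first non-empty result wins
            except KeyboardInterrupt:
                raise
            except Exception:
                pass                               # any failure: try the next scraper
        return ''

    print(get_article('https://example.com'))      # -> '<p>outline</p>' (declutter stub returned nothing)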