import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

import requests
import time
import itertools
from bs4 import BeautifulSoup

import settings
from feeds import hackernews, reddit, tildes, substack, manual
from feeds.sitemap import Sitemap
from feeds.category import Category
from scrapers import outline, declutter, headless, simple

# Domains that are never scraped (video-only or hard-paywalled sites).
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']

# Build feed readers once, at import time, from the settings file.
substacks = {}
for key, value in settings.SUBSTACK.items():
    substacks[key] = substack.Publication(value['url'])

categories = {}
for key, value in settings.CATEGORY.items():
    categories[key] = Category(value)

sitemaps = {}
for key, value in settings.SITEMAP.items():
    sitemaps[key] = Sitemap(value)


def get_list():
    """Collect (ref, source, urlref) tuples from every configured feed."""
    feeds = {}

    if settings.NUM_HACKERNEWS:
        feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]

    if settings.NUM_REDDIT:
        feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]

    if settings.NUM_TILDES:
        feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]

    if settings.NUM_SUBSTACK:
        feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]

    for key, publication in substacks.items():
        count = settings.SUBSTACK[key]['count']
        feeds[key] = [(x, key, x) for x in publication.feed()[:count]]

    for key, sites in categories.items():
        count = settings.CATEGORY[key].get('count') or 0
        excludes = settings.CATEGORY[key].get('excludes')
        tz = settings.CATEGORY[key].get('tz')
        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]

    for key, sites in sitemaps.items():
        count = settings.SITEMAP[key].get('count') or 0
        excludes = settings.SITEMAP[key].get('excludes')
        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]

    # Interleave the sources round-robin so no single feed dominates the list,
    # then drop the None padding added by zip_longest.
    values = feeds.values()
    feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
    feed = list(filter(None, feed))
    return feed


def get_article(url):
    """Try each configured scraper in turn and return the first non-empty HTML."""
    scrapers = {
        'headless': headless,
        'simple': simple,
        'outline': outline,
        'declutter': declutter,
    }
    available = settings.SCRAPERS or ['headless', 'simple']
    if 'simple' not in available:
        # Copy rather than += so the list in settings is not mutated.
        available = available + ['simple']

    for scraper in available:
        if scraper not in scrapers.keys():
            continue
        try:
            html = scrapers[scraper].get_html(url)
            if html:
                return html
        except KeyboardInterrupt:
            raise
        except:
            pass
    return ''


def get_content_type(url):
    """Return the Content-Type of a URL, trying a Googlebot UA before a browser UA."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
            'X-Forwarded-For': '66.249.66.1',
        }
        return requests.get(url, headers=headers, timeout=5).headers['content-type']
    except:
        pass

    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'}
        return requests.get(url, headers=headers, timeout=10).headers['content-type']
    except:
        return ''


def update_story(story, is_manual=False, urlref=None):
    """Fill in a story dict from its source; return False if it should be dropped."""
    res = {}

    if story['source'] == 'hackernews':
        res = hackernews.story(story['ref'])
    elif story['source'] == 'reddit':
        res = reddit.story(story['ref'])
    elif story['source'] == 'tildes':
        res = tildes.story(story['ref'])
    elif story['source'] == 'substack':
        res = substack.top.story(story['ref'])
    elif story['source'] in categories.keys():
        res = categories[story['source']].story(story['ref'], urlref)
    elif story['source'] in sitemaps.keys():
        res = sitemaps[story['source']].story(story['ref'], urlref)
    elif story['source'] in substacks.keys():
        res = substacks[story['source']].story(story['ref'])
    elif story['source'] == 'manual':
        res = manual.story(story['ref'])

    if res:
        story.update(res) # join dicts
    else:
        logging.info('Story not ready yet')
        return False

    if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
        logging.info('Story too old, removing')
        return False

    if story.get('url', '') and not story.get('text', ''):
        if not get_content_type(story['url']).startswith('text/'):
            logging.info('URL invalid file type / content type:')
            logging.info(story['url'])
            return False

        if any([domain in story['url'] for domain in INVALID_DOMAINS]):
            logging.info('URL invalid domain:')
            logging.info(story['url'])
            return False

        logging.info('Getting article ' + story['url'])
        story['text'] = get_article(story['url'])
        if not story['text']:
            return False

    return True


if __name__ == '__main__':
    #test_news_cache = {}
    #nid = 'jean'
    #ref = 20802050
    #source = 'hackernews'
    #test_news_cache[nid] = dict(id=nid, ref=ref, source=source)
    #news_story = test_news_cache[nid]
    #update_story(news_story)

    #print(get_article('https://www.bloomberg.com/news/articles/2019-09-23/xi-s-communists-under-pressure-as-high-prices-hit-china-workers'))

    a = get_article('https://blog.joinmastodon.org/2019/10/mastodon-3.0/')
    print(a)

    print('done')
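
    # Illustrative sketch (commented out): update a single Hacker News story
    # end-to-end, reusing the test ref from the stub above. update_story()
    # fills the dict in place and returns True once the story passes the
    # filters and has article text.
    #story = dict(id='test', ref=20802050, source='hackernews')
    #if update_story(story):
    #    print(story.get('title'), story.get('url'))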