You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

187 lines
6.4 KiB

import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
import time
from bs4 import BeautifulSoup
import itertools
import settings
from feeds import hackernews, reddit, tildes, substack, manual, lobsters
from feeds.sitemap import Sitemap
from feeds.category import Category
from scrapers import outline
from scrapers.declutter import declutter, headless, simple
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
substacks = {}
for key, value in settings.SUBSTACK.items():
substacks[key] = substack.Publication(value['url'])
categories = {}
for key, value in settings.CATEGORY.items():
categories[key] = Category(value)
sitemaps = {}
for key, value in settings.SITEMAP.items():
sitemaps[key] = Sitemap(value)
def get_list():
feeds = {}
if settings.NUM_HACKERNEWS:
feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_LOBSTERS:
feed += [(x, 'lobsters', x) for x in lobsters.feed()[:settings.NUM_LOBSTERS]]
if settings.NUM_REDDIT:
feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]
if settings.NUM_TILDES:
feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK:
feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count']
feeds[key] = [(x, key, x) for x in publication.feed()[:count]]
for key, sites in categories.items():
count = settings.CATEGORY[key].get('count') or 0
excludes = settings.CATEGORY[key].get('excludes')
tz = settings.CATEGORY[key].get('tz')
feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
for key, sites in sitemaps.items():
count = settings.SITEMAP[key].get('count') or 0
excludes = settings.SITEMAP[key].get('excludes')
feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
values = feeds.values()
feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
feed = list(filter(None, feed))
return feed
def get_article(url):
scrapers = {
'headless': headless,
'simple': simple,
'outline': outline,
'declutter': declutter,
}
available = settings.SCRAPERS or ['headless', 'simple']
if 'simple' not in available:
available += ['simple']
for scraper in available:
if scraper not in scrapers.keys():
continue
try:
details = scrapers[scraper].get_details(url)
if details and details.get('content'):
return details, scraper
except KeyboardInterrupt:
raise
except:
pass
return None, None
def get_content_type(url):
try:
headers = {
'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'X-Forwarded-For': '66.249.66.1',
}
return requests.get(url, headers=headers, timeout=5).headers['content-type']
except:
pass
try:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'}
return requests.get(url, headers=headers, timeout=10).headers['content-type']
except:
return ''
def update_story(story, is_manual=False, urlref=None):
res = {}
if story['source'] == 'hackernews':
res = hackernews.story(story['ref'])
elif story['source'] == 'lobsters':
res = lobsters.story(story['ref'])
elif story['source'] == 'reddit':
res = reddit.story(story['ref'])
elif story['source'] == 'tildes':
res = tildes.story(story['ref'])
elif story['source'] == 'substack':
res = substack.top.story(story['ref'])
elif story['source'] in categories.keys():
res = categories[story['source']].story(story['ref'], urlref)
elif story['source'] in sitemaps.keys():
res = sitemaps[story['source']].story(story['ref'], urlref)
elif story['source'] in substacks.keys():
res = substacks[story['source']].story(story['ref'])
elif story['source'] == 'manual':
res = manual.story(story['ref'])
if res:
story.update(res) # join dicts
else:
logging.info('Story not ready yet')
return False
if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
logging.info('Story too old, removing')
return False
has_url = story.get('url') or False
has_text = story.get('text') or False
#is_simple = story.get('scaper', '') == 'simple'
if has_url and not has_text:
if not get_content_type(story['url']).startswith('text/'):
logging.info('URL invalid file type / content type:')
logging.info(story['url'])
return False
if any([domain in story['url'] for domain in INVALID_DOMAINS]):
logging.info('URL invalid domain:')
logging.info(story['url'])
return False
logging.info('Getting article ' + story['url'])
details, scraper = get_article(story['url'])
if not details: return False
story['scraper'] = scraper
story['text'] = details.get('content', '')
if not story['text']: return False
story['last_update'] = time.time()
story['excerpt'] = details.get('excerpt', '')
story['scraper_link'] = details.get('scraper_link', '')
meta = details.get('meta')
if meta:
og = meta.get('og')
story['image'] = meta.get('image', '')
if og:
story['image'] = og.get('og:image', meta.get('image', ''))
return True
if __name__ == '__main__':
#test_news_cache = {}
#nid = 'jean'
#ref = 20802050
#source = 'hackernews'
#test_news_cache[nid] = dict(id=nid, ref=ref, source=source)
#news_story = test_news_cache[nid]
#update_story(news_story)
#print(get_article('https://www.bloomberg.com/news/articles/2019-09-23/xi-s-communists-under-pressure-as-high-prices-hit-china-workers'))
a = get_article('https://blog.joinmastodon.org/2019/10/mastodon-3.0/')
print(a)
print('done')