# qotnews/apiserver/feed.py
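"""Feed collection and story updating for the qotnews API server.

Pulls story references from the configured sources (Hacker News, Reddit,
Tildes, Substack publications, plus any category pages and sitemaps defined
in settings), interleaves them into a single list, and fills in story
details and article text on demand.
"""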

import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)

import requests
import time
from bs4 import BeautifulSoup
import itertools
import settings
from feeds import hackernews, reddit, tildes, substack, manual
from feeds.sitemap import Sitemap
from feeds.category import Category
from scrapers import outline, declutter, headless, simple
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
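
# The maps below are built from settings.py. A rough sketch of what the config
# is assumed to look like, based on the keys read in this file ('url' for
# CATEGORY/SITEMAP is an assumption, consumed by the Category/Sitemap classes);
# names and URLs are illustrative only:
#   SUBSTACK = {'mynewsletter': {'url': 'https://example.substack.com', 'count': 10}}
#   CATEGORY = {'mysite': {'url': 'https://example.com/news/', 'count': 15, 'excludes': [], 'tz': 'UTC'}}
#   SITEMAP  = {'mysite': {'url': 'https://example.com/sitemap.xml', 'count': 15, 'excludes': []}}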
substacks = {}
for key, value in settings.SUBSTACK.items():
    substacks[key] = substack.Publication(value['url'])

categories = {}
for key, value in settings.CATEGORY.items():
    categories[key] = Category(value)

sitemaps = {}
for key, value in settings.SITEMAP.items():
    sitemaps[key] = Sitemap(value)

def get_list():
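    """Collect story refs from every configured source.

    Returns a list of (ref, source, urlref) tuples, interleaved round-robin
    across sources so no single feed dominates the top of the list.
    """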
    feeds = {}

    if settings.NUM_HACKERNEWS:
        feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]

    if settings.NUM_REDDIT:
        feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]

    if settings.NUM_TILDES:
        feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]

    if settings.NUM_SUBSTACK:
        feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]

    for key, publication in substacks.items():
        count = settings.SUBSTACK[key]['count']
        feeds[key] = [(x, key, x) for x in publication.feed()[:count]]

    for key, sites in categories.items():
        count = settings.CATEGORY[key].get('count') or 0
        excludes = settings.CATEGORY[key].get('excludes')
        tz = settings.CATEGORY[key].get('tz')  # read from config but not used in this function
        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]

    for key, sites in sitemaps.items():
        count = settings.SITEMAP[key].get('count') or 0
        excludes = settings.SITEMAP[key].get('excludes')
        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]

    # interleave the sources round-robin, then drop the None padding added by zip_longest
    values = feeds.values()
    feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
    feed = list(filter(None, feed))
    return feed

def get_article(url):
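    """Fetch article HTML for url, trying each configured scraper in turn.

    Scrapers run in the order given by settings.SCRAPERS (defaulting to
    headless then simple, with simple always kept as a fallback); the first
    one that returns HTML wins. Returns '' if every scraper fails.
    """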
    scrapers = {
        'headless': headless,
        'simple': simple,
        'outline': outline,
        'declutter': declutter,
    }
    available = settings.SCRAPERS or ['headless', 'simple']
    if 'simple' not in available:
        available += ['simple']

    for scraper in available:
        if scraper not in scrapers.keys():
            continue
        try:
            html = scrapers[scraper].get_html(url)
            if html:
                return html
        except KeyboardInterrupt:
            raise
        except:
            pass
    return ''

def get_content_type(url):
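    """Return the Content-Type of url, or '' if it cannot be fetched."""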
    try:
        # first attempt: pretend to be Googlebot (spoofed User-Agent and X-Forwarded-For)
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
            'X-Forwarded-For': '66.249.66.1',
        }
        return requests.get(url, headers=headers, timeout=5).headers['content-type']
    except:
        pass

    try:
        # fall back to a regular desktop browser User-Agent
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'}
        return requests.get(url, headers=headers, timeout=10).headers['content-type']
    except:
        return ''

def update_story(story, is_manual=False, urlref=None):
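    """Fill in story details from its source and fetch the article text.

    Returns True if the story was updated, False if it is not ready yet,
    too old (per settings.MAX_STORY_AGE), or has an unusable URL.
    """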
    res = {}

    if story['source'] == 'hackernews':
        res = hackernews.story(story['ref'])
    elif story['source'] == 'reddit':
        res = reddit.story(story['ref'])
    elif story['source'] == 'tildes':
        res = tildes.story(story['ref'])
    elif story['source'] == 'substack':
        res = substack.top.story(story['ref'])
    elif story['source'] in categories.keys():
        res = categories[story['source']].story(story['ref'], urlref)
    elif story['source'] in sitemaps.keys():
        res = sitemaps[story['source']].story(story['ref'], urlref)
    elif story['source'] in substacks.keys():
        res = substacks[story['source']].story(story['ref'])
    elif story['source'] == 'manual':
        res = manual.story(story['ref'])

    if res:
        story.update(res) # join dicts
    else:
        logging.info('Story not ready yet')
        return False

    if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
        logging.info('Story too old, removing')
        return False

    if story.get('url', '') and not story.get('text', ''):
        if not get_content_type(story['url']).startswith('text/'):
            logging.info('URL invalid file type / content type:')
            logging.info(story['url'])
            return False

        if any([domain in story['url'] for domain in INVALID_DOMAINS]):
            logging.info('URL invalid domain:')
            logging.info(story['url'])
            return False

        logging.info('Getting article ' + story['url'])
        story['text'] = get_article(story['url'])
        if not story['text']: return False

    return True

if __name__ == '__main__':
    #test_news_cache = {}
    #nid = 'jean'
    #ref = 20802050
    #source = 'hackernews'
    #test_news_cache[nid] = dict(id=nid, ref=ref, source=source)
    #news_story = test_news_cache[nid]
    #update_story(news_story)

    #print(get_article('https://www.bloomberg.com/news/articles/2019-09-23/xi-s-communists-under-pressure-as-high-prices-hit-china-workers'))

    a = get_article('https://blog.joinmastodon.org/2019/10/mastodon-3.0/')
    print(a)

    print('done')