sitemap-based feed.

Jason Schwarzenberger 2020-11-03 16:00:03 +13:00
parent de80389ed0
commit 76f1d57702
3 changed files with 120 additions and 1 deletions


@@ -7,7 +7,7 @@ import requests
 import time
 from bs4 import BeautifulSoup
-from feeds import hackernews, reddit, tildes, substack, manual
+from feeds import hackernews, reddit, tildes, substack, manual, sitemap

 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 READ_API = 'http://127.0.0.1:33843'
@@ -17,11 +17,15 @@ TWO_DAYS = 60*60*24*2
 webworm = substack.Publication("https://www.webworm.co")
 bulletin = substack.Publication("https://thespinoff.substack.com")
+stuff = sitemap.Sitemap("https://www.stuff.co.nz/sitemap.xml")
+nzherald = sitemap.Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")

 def list():
     feed = []
     feed += [(x, 'hackernews') for x in hackernews.feed()[:10]]
     feed += [(x, 'tildes') for x in tildes.feed()[:10]]
+    feed += [(x, 'stuff') for x in stuff.feed()[:10]]
+    feed += [(x, 'nzherald') for x in nzherald.feed()[:10]]
     feed += [(x, 'substack') for x in substack.top.feed()[:15]]
     feed += [(x, 'reddit') for x in reddit.feed()[:15]]
     feed += [(x, 'webworm') for x in webworm.feed()[:15]]
@@ -89,6 +93,10 @@ def update_story(story, is_manual=False):
         res = bulletin.story(story['ref'])
     elif story['source'] == 'substack':
         res = substack.top.story(story['ref'])
+    elif story['source'] == 'stuff':
+        res = stuff.story(story['ref'])
+    elif story['source'] == 'nzherald':
+        res = nzherald.story(story['ref'])
     elif story['source'] == 'manual':
         res = manual.story(story['ref'])

apiserver/feeds/sitemap.py Normal file

@@ -0,0 +1,110 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')
import requests
import time  # used by the rate-limit back-off in get_article_details()
from datetime import datetime
from bs4 import BeautifulSoup

from utils import clean
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
# parse a W3C datetime like '2020-11-03T02:00:03Z'; strptime returns a naive
# datetime, so timestamp() assumes the host clock is set to UTC
def unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())
def xml(route, ref=None):
    try:
        # the X-Forwarded-For value is a Googlebot address, presumably because
        # some news sites treat crawler traffic more leniently
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False
def get_article_details(url):
    try:
        params = {'source_url': url}
        headers = {'Referer': 'https://outline.com/'}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            # return the same (data, method) shape as the error path so
            # callers can always unpack the result
            return (None, None)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return (data, "outline")
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return (None, None)
class Sitemap:
    def __init__(self, url):
        self.sitemap_url = url

    def feed(self):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        # only keep entries that carry a <lastmod> date
        articles = [a for a in articles if a.find('lastmod') is not None]
        return [x.find('loc').text for x in articles] or []
    def story(self, ref):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return False
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        articles = [a for a in articles if a.find('lastmod') is not None]
        # find the sitemap entry whose <loc> matches the requested ref
        articles = [a for a in articles if a.find('loc').text == ref]
        if len(articles) == 0:
            return False

        r = articles[0]
        if not r:
            return False

        (data, method) = get_article_details(ref)

        if not data:
            return False
        if 'outline' not in method:
            return False

        s = {}
        s['author'] = data['author']
        s['author_link'] = ''
        s['date'] = unix(r.find('lastmod').text)
        s['score'] = 0
        s['title'] = data['title']
        s['link'] = data['article_url']
        s['url'] = data['article_url']
        s['comments'] = []
        s['num_comments'] = 0
        s['text'] = data['html']

        return s
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    #site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
    posts = site.feed()
    print(posts[:1])
    print(site.story(posts[0]))
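
For reference, feed() above assumes the standard sitemaps.org <urlset> layout and drops entries without a <lastmod>. A minimal self-contained sketch of that selection step against an inline sample document (the URLs and date are illustrative, not taken from either site):

from bs4 import BeautifulSoup

SAMPLE = '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/story-with-date</loc>
    <lastmod>2020-11-03T02:00:03Z</lastmod>
  </url>
  <url>
    <loc>https://example.com/story-without-date</loc>
  </url>
</urlset>'''

soup = BeautifulSoup(SAMPLE, features='lxml')
# same selection as Sitemap.feed(): keep only entries with a <lastmod>
articles = [u for u in soup.find('urlset').findAll('url') if u.find('lastmod') is not None]
print([u.find('loc').text for u in articles])
# -> ['https://example.com/story-with-date']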


@@ -11,6 +11,7 @@ greenlet==0.4.16
 idna==2.10
 itsdangerous==1.1.0
 Jinja2==2.11.2
+lxml==4.6.1
 MarkupSafe==1.1.1
 packaging==20.4
 praw==6.4.0
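
Taken together, the changes give the new source the same two-call contract the other feeds use: feed() returns refs (here, article URLs) and story(ref) resolves one ref into a story dict. A hedged sketch of a caller (the stuff.co.nz URL comes from the diff above; network access and a reachable Outline API are assumed):

from feeds import sitemap

site = sitemap.Sitemap("https://www.stuff.co.nz/sitemap.xml")
refs = site.feed()               # article URLs from the sitemap
if refs:
    story = site.story(refs[0])  # dict with title/date/text, or False on failure
    if story:
        print(story['title'], story['date'])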