forked from tanner/qotnews
Compare commits
8 Commits
de80389ed0
...
736cdc8576
Author | SHA1 | Date |
---|---|---|
Jason Schwarzenberger | 736cdc8576 | 4 years ago |
Jason Schwarzenberger | 244d416f6e | 4 years ago |
Jason Schwarzenberger | 5f98a2e76a | 4 years ago |
Jason Schwarzenberger | 0567cdfd9b | 4 years ago |
Jason Schwarzenberger | 4f90671cec | 4 years ago |
Jason Schwarzenberger | e63a1456a5 | 4 years ago |
Jason Schwarzenberger | 76f1d57702 | 4 years ago |
Tanner Collin | ca78a6d7a9 | 4 years ago |
10 changed files with 230 additions and 41 deletions
@ -0,0 +1,110 @@ |
|||||||
|
import logging |
||||||
|
logging.basicConfig( |
||||||
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', |
||||||
|
level=logging.DEBUG) |
||||||
|
|
||||||
|
if __name__ == '__main__': |
||||||
|
import sys |
||||||
|
sys.path.insert(0,'.') |
||||||
|
|
||||||
|
import requests |
||||||
|
from datetime import datetime |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
|
||||||
|
from utils import clean |
||||||
|
|
||||||
|
OUTLINE_API = 'https://api.outline.com/v3/parse_article' |
||||||
|
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0' |
||||||
|
|
||||||
|
def unix(date_str):
    """Convert a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp string to Unix epoch seconds.

    The trailing ``Z`` designates UTC, so the parsed (naive) datetime is pinned
    to UTC before conversion. Without that, ``timestamp()`` interprets a naive
    datetime in the machine's local timezone and the result is skewed by the
    local UTC offset.

    Raises:
        ValueError: if ``date_str`` does not match the expected format.
    """
    # Local import: only ``datetime`` itself is imported at file level.
    from datetime import timezone

    parsed = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')
    return int(parsed.replace(tzinfo=timezone.utc).timestamp())
||||||
|
|
||||||
|
def xml(route, ref=None):
    """Fetch the document at ``route(ref)`` and return its body as text.

    Args:
        route: callable that takes ``ref`` and returns the URL to request.
        ref: value passed through to ``route``; defaults to ``None``.

    Returns:
        The response text on HTTP 200, or ``False`` when the request fails or
        the server answers with any other status code (the problem is logged).

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate to the caller.
    """
    try:
        # Sent with a browser User-Agent and a fixed X-Forwarded-For header.
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except Exception as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False
||||||
|
|
||||||
|
def get_article_details(url):
    """Ask the Outline API to parse the article at ``url``.

    Args:
        url: address of the article, sent as the ``source_url`` parameter.

    Returns:
        ``(data, "outline")`` on success, where ``data`` is the ``data`` object
        of the JSON response. ``(None, None)`` on every failure, including
        rate limiting, so callers can always unpack a 2-tuple.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate to the caller.
    """
    # Local import: ``time`` is not imported at file level, which previously
    # made the rate-limit branch fail with a NameError instead of sleeping.
    import time

    try:
        params = {'source_url': url}
        headers = {'Referer': 'https://outline.com/'}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            # Same shape as every other failure path; a bare '' cannot be
            # unpacked into (data, method) by the caller.
            return (None, None)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return (data, "outline")
    except Exception as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return (None, None)
||||||
|
|
||||||
|
|
||||||
|
class Sitemap:
    """Story source that lists article URLs from an XML sitemap.

    Article content is obtained through ``get_article_details``; the sitemap
    itself only supplies the URL (``<loc>``) and the date (``<lastmod>``).
    """

    def __init__(self, url):
        """Remember the sitemap URL; nothing is fetched until it is needed."""
        self.sitemap_url = url

    def _dated_entries(self):
        """Download the sitemap and return its usable ``<url>`` entries.

        An entry is usable when it has both a ``<loc>`` and a ``<lastmod>``
        child. Returns an empty list when the download fails or the document
        has no ``<urlset>`` root.
        """
        markup = xml(lambda x: self.sitemap_url)
        if not markup:
            return []
        soup = BeautifulSoup(markup, features='lxml')
        urlset = soup.find('urlset')
        if urlset is None:
            # Not a plain sitemap (or an error page): nothing to list.
            return []
        return [
            entry for entry in urlset.findAll('url')
            if entry.find('loc') is not None and entry.find('lastmod') is not None
        ]

    def feed(self):
        """Return the list of article URLs in the sitemap (possibly empty)."""
        return [entry.find('loc').text for entry in self._dated_entries()]

    def story(self, ref):
        """Build the story dict for the article whose URL is ``ref``.

        Returns ``False`` when the sitemap cannot be read, when ``ref`` is not
        listed in it, or when the article details cannot be retrieved.
        """
        entry = next(
            (e for e in self._dated_entries() if e.find('loc').text == ref),
            None,
        )
        if entry is None:
            return False

        (data, method) = get_article_details(ref)
        if not data:
            return False
        if 'outline' not in method:
            return False

        return {
            'author': data['author'],
            'author_link': '',
            'date': unix(entry.find('lastmod').text),
            'score': 0,
            'title': data['title'],
            'link': data['article_url'],
            'url': data['article_url'],
            'comments': [],
            'num_comments': 0,
            'text': data['html'],
        }
||||||
|
|
||||||
|
|
||||||
|
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    # site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
    posts = site.feed()
    print(posts[:1])
    # feed() returns an empty list when the sitemap cannot be fetched, so only
    # look up the first story when there is one.
    if posts:
        print(site.story(posts[0]))
@ -1,4 +0,0 @@ |
|||||||
[bot] |
|
||||||
client_id= |
|
||||||
client_secret= |
|
||||||
user_agent= |
|
@ -0,0 +1,51 @@ |
|||||||
|
# QotNews settings
# Edit this file and save it as settings.py

# Feed lengths
# Number of top items to pull from each site.
# Set a value to 0 to disable that site.
NUM_HACKERNEWS = 15
NUM_REDDIT = 10
NUM_TILDES = 5
NUM_SUBSTACK = 10

# Example sitemap sources, keyed by site name; each entry gives the sitemap
# 'url' and a 'count'.
# NOTE(review): presumably uncommented to enable these sources — confirm
# against the code that reads this setting.
# SITEMAP = {
#     'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
#     'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
# }

# Example Substack publications, keyed by publication name; same entry shape
# as the sitemap example above ('url' and 'count').
# SUBSTACK = {
#     'webworm': { 'url': "https://www.webworm.co", 'count': 10},
#     'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
# }

# Reddit account info
# Leave blank if not using Reddit.
REDDIT_CLIENT_ID = ''
REDDIT_CLIENT_SECRET = ''
REDDIT_USER_AGENT = ''

# Subreddit names (without the "r/" prefix).
SUBREDDITS = [
    'Economics',
    'AcademicPhilosophy',
    'DepthHub',
    'Foodforthought',
    'HistoryofIdeas',
    'LaymanJournals',
    'PhilosophyofScience',
    'PoliticsPDFs',
    'Scholar',
    'StateOfTheUnion',
    'TheAgora',
    'TrueFilm',
    'TrueReddit',
    'UniversityofReddit',
    'culturalstudies',
    'hardscience',
    'indepthsports',
    'indepthstories',
    'ludology',
    'neurophilosophy',
    'resilientcommunities',
    'worldevents',
]
File diff suppressed because one or more lines are too long
Loading…
Reference in new issue