Compare commits

..

8 commits

Author  SHA1  Message  Date

Jason Schwarzenberger  736cdc8576  fix mistake.  2020-11-03 17:04:46 +13:00
Jason Schwarzenberger  244d416f6e  settings config of sitemap/substack publications.  2020-11-03 17:01:29 +13:00
Jason Schwarzenberger  5f98a2e76a  Merge remote-tracking branch 'tanner/master' into master / And adding relevant setings.py.example/etc.  2020-11-03 16:44:02 +13:00
Jason Schwarzenberger  0567cdfd9b  move sort to render.  2020-11-03 16:30:22 +13:00
Jason Schwarzenberger  4f90671cec  order feed by reverse chronological  2020-11-03 16:21:23 +13:00
Jason Schwarzenberger  e63a1456a5  add logos.  2020-11-03 16:07:07 +13:00
Jason Schwarzenberger  76f1d57702  sitemap based feed.  2020-11-03 16:00:03 +13:00
ca78a6d7a9  Move feed and Praw config to settings.py  2020-11-02 02:26:54 +00:00
10 changed files with 230 additions and 41 deletions

View file

@@ -35,7 +35,7 @@ $ source env/bin/activate
 (env) $ pip install -r requirements.txt
 ```
-Configure Praw for your Reddit account:
+Configure Praw for your Reddit account (optional):
 * Go to https://www.reddit.com/prefs/apps
 * Click "Create app"
@@ -44,16 +44,14 @@ Configure Praw for your Reddit account:
 * Description: blank
 * About URL: blank
 * Redirect URL: your GitHub profile
-* Submit, copy the client ID and client secret into `praw.ini`:
+* Submit, copy the client ID and client secret into `settings.py` below
 ```text
-(env) $ vim praw.ini
-[bot]
-client_id=paste here
-client_secret=paste here
-user_agent=script by github/your-username-here
+(env) $ vim settings.py.example
 ```
+Edit it and save it as `settings.py`.
 Now you can run the server:
 ```text
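The removed `praw.ini` block maps one-to-one onto module-level constants in the new `settings.py` (shown in full as `settings.py.example` later in this diff). A minimal sketch of the Reddit-related fields, reusing the placeholder values from the old README text:

```python
# settings.py (saved from settings.py.example) -- placeholder values, substitute your own
REDDIT_CLIENT_ID = 'paste here'                            # client ID from https://www.reddit.com/prefs/apps
REDDIT_CLIENT_SECRET = 'paste here'                        # client secret from the same page
REDDIT_USER_AGENT = 'script by github/your-username-here'  # any descriptive user agent string

NUM_REDDIT = 10  # number of Reddit stories to pull; 0 disables Reddit entirely
```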

View file

@@ -105,7 +105,7 @@ ENV/
 # DB
 db.sqlite3
-praw.ini
+settings.py
 data.db
 data.db.bak
 data/archive/*

View file

@@ -7,7 +7,8 @@ import requests
 import time
 from bs4 import BeautifulSoup
-from feeds import hackernews, reddit, tildes, substack, manual
+import settings
+from feeds import hackernews, reddit, tildes, substack, manual, sitemap
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 READ_API = 'http://127.0.0.1:33843'
@@ -15,17 +16,36 @@ READ_API = 'http://127.0.0.1:33843'
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 TWO_DAYS = 60*60*24*2
-webworm = substack.Publication("https://www.webworm.co")
-bulletin = substack.Publication("https://thespinoff.substack.com")
+substacks = {}
+for key, value in settings.SUBSTACK.items():
+    substacks[key] = substack.Publication(value['url'])
+sitemaps = {}
+for key, value in settings.SITEMAP.items():
+    sitemaps[key] = sitemap.Sitemap(value['url'])
 def list():
     feed = []
-    feed += [(x, 'hackernews') for x in hackernews.feed()[:10]]
-    feed += [(x, 'tildes') for x in tildes.feed()[:10]]
-    feed += [(x, 'substack') for x in substack.top.feed()[:15]]
-    feed += [(x, 'reddit') for x in reddit.feed()[:15]]
-    feed += [(x, 'webworm') for x in webworm.feed()[:15]]
-    feed += [(x, 'the bulletin') for x in bulletin.feed()[:15]]
+    if settings.NUM_HACKERNEWS:
+        feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
+    if settings.NUM_REDDIT:
+        feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
+    if settings.NUM_TILDES:
+        feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
+    if settings.NUM_SUBSTACK:
+        feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
+    for key, publication in substacks.items():
+        count = settings.SUBSTACK[key]['count']
+        feed += [(x, key) for x in publication.feed()[:count]]
+    for key, sites in sitemaps.items():
+        count = settings.SITEMAP[key]['count']
+        feed += [(x, key) for x in sites.feed()[:count]]
     return feed
 def get_article(url):
@@ -83,12 +103,12 @@ def update_story(story, is_manual=False):
         res = reddit.story(story['ref'])
     elif story['source'] == 'tildes':
         res = tildes.story(story['ref'])
-    elif story['source'] == 'webworm':
-        res = webworm.story(story['ref'])
-    elif story['source'] == 'the bulletin':
-        res = bulletin.story(story['ref'])
     elif story['source'] == 'substack':
         res = substack.top.story(story['ref'])
+    elif story['source'] in sitemaps.keys():
+        res = sitemaps[story['source']].story(story['ref'])
+    elif story['source'] in substacks.keys():
+        res = substacks[story['source']].story(story['ref'])
     elif story['source'] == 'manual':
         res = manual.story(story['ref'])
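In place of one hard-coded variable per publication, the feed list is now driven entirely by `settings.py`: each `NUM_*` value gates and sizes its source, while the `SUBSTACK` and `SITEMAP` dicts become keyed `Publication`/`Sitemap` objects whose keys double as the story's `source` label. A minimal sketch of that pattern with a stubbed source class (the `Stub` class and `SETTINGS` dict here are illustrative stand-ins, not code from the repository):

```python
# Illustrative sketch of the settings-driven aggregation in feed.py.
class Stub:
    def __init__(self, url):
        self.url = url
    def feed(self):
        # a real source would fetch refs over HTTP; here we fabricate 20 of them
        return ['{}/story-{}'.format(self.url, i) for i in range(20)]

SETTINGS = {
    'nzherald': {'url': 'https://www.nzherald.co.nz/arcio/news-sitemap/', 'count': 10},
    'stuff':    {'url': 'https://www.stuff.co.nz/sitemap.xml', 'count': 10},
}

# one feed object per configured source, keyed by its settings name
sources = {key: Stub(value['url']) for key, value in SETTINGS.items()}

def aggregate():
    feed = []
    for key, source in sources.items():
        count = SETTINGS[key]['count']                          # per-source cap from settings
        feed += [(ref, key) for ref in source.feed()[:count]]   # key doubles as the 'source' tag
    return feed

print(len(aggregate()))  # 20: ten refs from each of the two configured sources
```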

View file

@@ -12,18 +12,24 @@ from praw.exceptions import PRAWException
 from praw.models import MoreComments
 from prawcore.exceptions import PrawcoreException
+import settings
 from utils import render_md, clean
-SUBREDDITS = 'newzealand'
 SITE_LINK = lambda x : 'https://old.reddit.com{}'.format(x)
 SITE_AUTHOR_LINK = lambda x : 'https://old.reddit.com/u/{}'.format(x)
-reddit = praw.Reddit('bot')
+if settings.NUM_REDDIT:
+    reddit = praw.Reddit(
+        client_id=settings.REDDIT_CLIENT_ID,
+        client_secret=settings.REDDIT_CLIENT_SECRET,
+        user_agent=settings.REDDIT_USER_AGENT,
+    )
+subs = '+'.join(settings.SUBREDDITS)
 def feed():
     try:
-        return [x.id for x in reddit.subreddit(SUBREDDITS).hot()]
+        return [x.id for x in reddit.subreddit(subs).hot()]
     except KeyboardInterrupt:
         raise
     except PRAWException as e:
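The client is now built from `settings.py` constants instead of the `[bot]` section of `praw.ini`, and the subreddit list is collapsed with `'+'` so a single `subreddit()` call queries all of them as one multireddit. A rough equivalent with stand-in values (the real code reads these from the repository's `settings` module):

```python
# Stand-in credentials and subreddit list for illustration only.
import praw  # praw 6.x, as pinned in requirements.txt

REDDIT_CLIENT_ID = 'paste here'
REDDIT_CLIENT_SECRET = 'paste here'
REDDIT_USER_AGENT = 'script by github/your-username-here'
SUBREDDITS = ['Economics', 'TrueReddit']

reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT,
)

subs = '+'.join(SUBREDDITS)  # 'Economics+TrueReddit' is treated as one multireddit
hot_ids = [s.id for s in reddit.subreddit(subs).hot()]
print(hot_ids[:10])
```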

apiserver/feeds/sitemap.py (new file, 110 lines added)
View file

@@ -0,0 +1,110 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

import time
import requests
from datetime import datetime
from bs4 import BeautifulSoup

from utils import clean

OUTLINE_API = 'https://api.outline.com/v3/parse_article'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'

def unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())

def xml(route, ref=None):
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False

def get_article_details(url):
    try:
        params = {'source_url': url}
        headers = {'Referer': 'https://outline.com/'}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            return ''
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return (data, "outline")
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return (None, None)

class Sitemap:
    def __init__(self, url):
        self.sitemap_url = url

    def feed(self):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
        return [x.find('loc').text for x in articles] or []

    def story(self, ref):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
        articles = list(filter(None, [a if a.find('loc').text == ref else None for a in articles]))

        if len(articles) == 0:
            return False
        r = articles[0]
        if not r:
            return False

        (data, method) = get_article_details(ref)
        if not data:
            return False
        if 'outline' not in method:
            return False

        s = {}
        s['author'] = data['author']
        s['author_link'] = ''
        s['date'] = unix(r.find('lastmod').text)
        s['score'] = 0
        s['title'] = data['title']
        s['link'] = data['article_url']
        s['url'] = data['article_url']
        s['comments'] = []
        s['num_comments'] = 0
        s['text'] = data['html']
        return s

# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    # site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
    posts = site.feed()
    print(posts[:1])
    print(site.story(posts[0]))
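`Sitemap.feed()` assumes the standard sitemap layout: a `<urlset>` of `<url>` entries, each carrying a `<loc>` and, for the entries it keeps, a `<lastmod>`. A self-contained check of that parsing logic against a fabricated XML document (not real nzherald/stuff output):

```python
# Self-contained check of the <urlset>/<url>/<loc>/<lastmod> handling in Sitemap.feed().
from datetime import datetime
from bs4 import BeautifulSoup  # parsed via the lxml backend, hence lxml==4.6.1 in requirements.txt

SAMPLE = """
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/story-1</loc><lastmod>2020-11-03T03:00:00Z</lastmod></url>
  <url><loc>https://example.com/story-2</loc></url>
</urlset>
"""

soup = BeautifulSoup(SAMPLE, features='lxml')
articles = soup.find('urlset').findAll('url')
# entries without <lastmod> are dropped, so only story-1 survives
articles = [a for a in articles if a.find('lastmod') is not None]

refs = [a.find('loc').text for a in articles]
dates = [int(datetime.strptime(a.find('lastmod').text, '%Y-%m-%dT%H:%M:%SZ').timestamp()) for a in articles]
print(refs)   # ['https://example.com/story-1']
print(dates)  # one unix timestamp per surviving entry (naive local-time parse, as in unix() above)
```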

View file

@@ -1,4 +0,0 @@
[bot]
client_id=
client_secret=
user_agent=

View file

@@ -11,6 +11,7 @@ greenlet==0.4.16
 idna==2.10
 itsdangerous==1.1.0
 Jinja2==2.11.2
+lxml==4.6.1
 MarkupSafe==1.1.1
 packaging==20.4
 praw==6.4.0

apiserver/settings.py.example (new file, 51 lines added)
View file

@@ -0,0 +1,51 @@
# QotNews settings
# edit this file and save it as settings.py

# Feed Lengths
# Number of top items from each site to pull
# set to 0 to disable that site
NUM_HACKERNEWS = 15
NUM_REDDIT = 10
NUM_TILDES = 5
NUM_SUBSTACK = 10

# SITEMAP = {
#     'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
#     'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
# }

# SUBSTACK = {
#     'webworm': { 'url': "https://www.webworm.co", 'count': 10},
#     'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
# }

# Reddit account info
# leave blank if not using Reddit
REDDIT_CLIENT_ID = ''
REDDIT_CLIENT_SECRET = ''
REDDIT_USER_AGENT = ''

SUBREDDITS = [
    'Economics',
    'AcademicPhilosophy',
    'DepthHub',
    'Foodforthought',
    'HistoryofIdeas',
    'LaymanJournals',
    'PhilosophyofScience',
    'PoliticsPDFs',
    'Scholar',
    'StateOfTheUnion',
    'TheAgora',
    'TrueFilm',
    'TrueReddit',
    'UniversityofReddit',
    'culturalstudies',
    'hardscience',
    'indepthsports',
    'indepthstories',
    'ludology',
    'neurophilosophy',
    'resilientcommunities',
    'worldevents',
]
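The `SITEMAP` and `SUBSTACK` examples ship commented out, but `feed.py` in this diff iterates `settings.SITEMAP` and `settings.SUBSTACK` unconditionally, so they appear to need some definition even when unused. A hedged `settings.py` fragment showing one enabled source per group, and the empty-dict fallback:

```python
# settings.py fragment: one sitemap source and one substack source enabled.
# As written in this diff, feed.py reads these dicts directly at import time,
# so leaving them undefined would presumably raise an AttributeError;
# define them as {} to disable a group instead.
SITEMAP = {
    'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
}
SUBSTACK = {
    'webworm': { 'url': "https://www.webworm.co", 'count': 10},
}

# SITEMAP = {}
# SUBSTACK = {}
```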

View file

@@ -22,20 +22,21 @@ class Feed extends React.Component {
         const updated = !this.state.stories || this.state.stories[0].id !== result.stories[0].id;
         console.log('updated:', updated);
-        this.setState({ stories: result.stories });
-        localStorage.setItem('stories', JSON.stringify(result.stories));
+        const { stories } = result;
+        this.setState({ stories });
+        localStorage.setItem('stories', JSON.stringify(stories));
         if (updated) {
           localForage.clear();
-          result.stories.forEach((x, i) => {
+          stories.forEach((x, i) => {
             fetch('/api/' + x.id)
               .then(res => res.json())
-              .then(result => {
-                localForage.setItem(x.id, result.story)
+              .then(({ story }) => {
+                localForage.setItem(x.id, story)
                   .then(console.log('preloaded', x.id, x.title));
-                this.props.updateCache(x.id, result.story);
-              }, error => {}
+                this.props.updateCache(x.id, story);
+              }, error => { }
             );
           });
         }
       },
@@ -49,6 +50,10 @@ class Feed extends React.Component {
     const stories = this.state.stories;
     const error = this.state.error;
+    if (stories) {
+      stories.sort((a, b) => b.date - a.date);
+    }
     return (
       <div className='container'>
         <Helmet>
@@ -73,7 +78,7 @@ class Feed extends React.Component {
             </div>
           )}
         </div>
-        :
+          :
         <p>loading...</p>
       }
     </div>

File diff suppressed because one or more lines are too long