Compare commits
3 Commits
0d16bec6f6
...
e59acefda9
Author | SHA1 | Date | |
---|---|---|---|
e59acefda9 | |||
cbc802b7e9 | |||
4579dfce00 |
|
@ -1,52 +0,0 @@
|
||||||
from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter
|
|
||||||
from whoosh.index import create_in, open_dir, exists_in
|
|
||||||
from whoosh.fields import *
|
|
||||||
from whoosh.qparser import QueryParser
|
|
||||||
from whoosh.support.charset import accent_map
|
|
||||||
|
|
||||||
# Analyzer for story titles: a stemming analyzer piped through a charset
# filter driven by accent_map, then an n-gram filter with minsize=3.
analyzer = (
    StemmingAnalyzer()
    | CharsetFilter(accent_map)
    | NgramFilter(minsize=3)
)

id_field = ID(unique=True, stored=True)
title_field = TEXT(analyzer=analyzer, stored=True)

# The whole story object is kept in a STORED field alongside id and title.
schema = Schema(id=id_field, title=title_field, story=STORED)

# Directory that holds the Whoosh index.
ARCHIVE_LOCATION = 'data/archive'

# Index handle; stays None until init() assigns it.
ix = None
|
|
||||||
|
|
||||||
def init():
    """Open the Whoosh index at ARCHIVE_LOCATION, creating it if needed.

    The opened (or newly created) index is assigned to the module-level
    ``ix``.
    """
    global ix

    # Imported here so the function works without a module-level import.
    import os

    # create_in() writes into an existing directory and does not create it,
    # so make sure the directory is there on a first run.
    os.makedirs(ARCHIVE_LOCATION, exist_ok=True)

    if exists_in(ARCHIVE_LOCATION):
        ix = open_dir(ARCHIVE_LOCATION)
    else:
        ix = create_in(ARCHIVE_LOCATION, schema)
|
|
||||||
|
|
||||||
def update(story):
    """Write ``story`` to the archive index via ``update_document``.

    ``story`` must provide the 'id' and 'title' keys; the whole object is
    also passed as the 'story' field.
    """
    # The writer's context manager commits on a clean exit and cancels on an
    # exception, so a failed update does not leave the write lock held.
    with ix.writer() as writer:
        writer.update_document(
            id=story['id'],
            title=story['title'],
            story=story,
        )
|
|
||||||
|
|
||||||
def get_story(sid):
    """Return the stored story whose id is ``sid``, or None if nothing matches."""
    with ix.searcher() as searcher:
        doc = searcher.document(id=sid)
        if not doc:
            return None
        return doc['story']
|
|
||||||
|
|
||||||
def search(search):
    """Parse ``search`` as a query on the 'title' field and return matching stories.

    The 'text' and 'comments' entries are removed from every returned story.
    """
    with ix.searcher() as searcher:
        parser = QueryParser('title', ix.schema)
        hits = searcher.search(parser.parse(search))
        stories = [hit['story'] for hit in hits]

    # Strip the entries that are not returned with search results.
    for story in stories:
        for key in ('text', 'comments'):
            story.pop(key, '')
    return stories
|
|
|
@ -10,7 +10,6 @@ from bs4 import BeautifulSoup
|
||||||
from feeds import hackernews, reddit, tildes, manual
|
from feeds import hackernews, reddit, tildes, manual
|
||||||
|
|
||||||
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
|
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
|
||||||
ARCHIVE_API = 'https://archive.fo/submit/'
|
|
||||||
READ_API = 'http://127.0.0.1:33843'
|
READ_API = 'http://127.0.0.1:33843'
|
||||||
|
|
||||||
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
||||||
|
@ -72,8 +71,6 @@ def get_content_type(url):
|
||||||
def update_story(story, is_manual=False):
|
def update_story(story, is_manual=False):
|
||||||
res = {}
|
res = {}
|
||||||
|
|
||||||
logging.info('Updating story ' + str(story['ref']))
|
|
||||||
|
|
||||||
if story['source'] == 'hackernews':
|
if story['source'] == 'hackernews':
|
||||||
res = hackernews.story(story['ref'])
|
res = hackernews.story(story['ref'])
|
||||||
elif story['source'] == 'reddit':
|
elif story['source'] == 'reddit':
|
||||||
|
|
|
@ -25,6 +25,16 @@ def api(route, ref=None):
|
||||||
return r.json()
|
return r.json()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
raise
|
raise
|
||||||
|
except BaseException as e:
|
||||||
|
logging.error('Problem hitting hackernews API: {}, trying again'.format(str(e)))
|
||||||
|
|
||||||
|
try:
|
||||||
|
r = requests.get(route(ref), timeout=15)
|
||||||
|
if r.status_code != 200:
|
||||||
|
raise Exception('Bad response code ' + str(r.status_code))
|
||||||
|
return r.json()
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
raise
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
logging.error('Problem hitting hackernews API: {}'.format(str(e)))
|
logging.error('Problem hitting hackernews API: {}'.format(str(e)))
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -1,26 +0,0 @@
|
||||||
import shelve
|
|
||||||
|
|
||||||
import archive
|
|
||||||
|
|
||||||
archive.init()
|
|
||||||
|
|
||||||
#with shelve.open('data/data') as db:
|
|
||||||
# to_delete = []
|
|
||||||
#
|
|
||||||
# for s in db.values():
|
|
||||||
# if 'title' in s:
|
|
||||||
# archive.update(s)
|
|
||||||
# if 'id' in s:
|
|
||||||
# to_delete.append(s['id'])
|
|
||||||
#
|
|
||||||
# for id in to_delete:
|
|
||||||
# del db[id]
|
|
||||||
#
|
|
||||||
# for s in db['news_cache'].values():
|
|
||||||
# if 'title' in s:
|
|
||||||
# archive.update(s)
|
|
||||||
|
|
||||||
#with shelve.open('data/whoosh') as db:
|
|
||||||
# for s in db['news_cache'].values():
|
|
||||||
# if 'title' in s and not archive.get_story(s['id']):
|
|
||||||
# archive.update(s)
|
|
|
@ -1,74 +0,0 @@
|
||||||
import archive
|
|
||||||
import database
|
|
||||||
import search
|
|
||||||
|
|
||||||
import json
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Initialise the three storage backends, in this order, before use.
for backend in (database, archive, search):
    backend.init()

# Running number printed next to each archived document in the loop below.
count = 0
|
|
||||||
|
|
||||||
def database_del_story_by_ref(ref):
    """Delete the database.Story rows whose ``ref`` column equals ``ref``.

    The deletion is committed. If anything fails it is rolled back and the
    exception is re-raised. The session is closed in every case.
    """
    # Open the session before the try block: if Session() itself fails there
    # is nothing to roll back or close, and the handlers below would
    # otherwise fail on an unbound ``session`` and hide the real error.
    session = database.Session()
    try:
        session.query(database.Story).filter(database.Story.ref == ref).delete()
        session.commit()
    except BaseException:
        # Roll back on any failure, including interrupts, then propagate it.
        session.rollback()
        raise
    finally:
        session.close()
|
|
||||||
|
|
||||||
def search_del_story(sid):
    """Delete document ``sid`` from the 'qotnews' MeiliSearch index.

    Returns the decoded JSON response when the server answers 202. Returns
    False if the request fails or any other status code comes back.
    KeyboardInterrupt is re-raised.
    """
    # logging is not among this script's module-level imports, so without
    # this the error path would raise NameError instead of returning False.
    import logging

    try:
        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/' + sid, timeout=2)
        if r.status_code != 202:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem deleting MeiliSearch story: {}'.format(str(e)))
        return False
|
|
||||||
|
|
||||||
# Walk every document in the Whoosh archive and copy its story into the
# database and the search index.
with archive.ix.searcher() as searcher:
    print('count all', searcher.doc_count_all())
    print('count', searcher.doc_count())

    for doc in searcher.documents():
        try:
            print('num', count, 'id', doc['id'])
            count += 1

            story = doc['story']
            # The 'img' entry, if present, is dropped before the story is stored.
            story.pop('img', None)

            if 'reddit.com/r/technology' in story['link']:
                print('skipping r/technology')
                continue

            try:
                database.put_story(story)
            except database.IntegrityError:
                # A story with this ref is already in the database; the copy
                # with the higher comment count wins.
                print('collision!')
                existing = json.loads(database.get_story_by_ref(story['ref']).full_json)

                if not (story['num_comments'] > existing['num_comments']):
                    print('fewer comments, skipping')
                    continue

                print('more comments, replacing')
                database_del_story_by_ref(story['ref'])
                database.put_story(story)
                search_del_story(existing['id'])

            search.put_story(story)
            print()
        except KeyboardInterrupt:
            # Ctrl-C stops the migration instead of skipping one document.
            break
        except BaseException as e:
            # Any other failure skips this document and moves on.
            print('skipping', doc['id'])
            print('reason:', e)
|
|
|
@ -25,6 +25,5 @@ urllib3==1.25.9
|
||||||
webencodings==0.5.1
|
webencodings==0.5.1
|
||||||
websocket-client==0.57.0
|
websocket-client==0.57.0
|
||||||
Werkzeug==1.0.1
|
Werkzeug==1.0.1
|
||||||
Whoosh==2.7.4
|
|
||||||
zope.event==4.4
|
zope.event==4.4
|
||||||
zope.interface==5.1.0
|
zope.interface==5.1.0
|
||||||
|
|
|
@ -175,6 +175,8 @@ def feed_thread():
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
|
story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
|
||||||
|
|
||||||
|
logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
|
||||||
|
|
||||||
valid = feed.update_story(story)
|
valid = feed.update_story(story)
|
||||||
if valid:
|
if valid:
|
||||||
database.put_story(story)
|
database.put_story(story)
|
||||||
|
@ -183,7 +185,7 @@ def feed_thread():
|
||||||
database.del_ref(item['ref'])
|
database.del_ref(item['ref'])
|
||||||
logging.info('Removed ref {}'.format(item['ref']))
|
logging.info('Removed ref {}'.format(item['ref']))
|
||||||
else:
|
else:
|
||||||
logging.info('Skipping index')
|
logging.info('Skipping index: ' + str(news_index))
|
||||||
|
|
||||||
gevent.sleep(6)
|
gevent.sleep(6)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user