Compare commits

...

3 Commits

SHA1        Message                     Date
e59acefda9  Remove Whoosh               2020-11-02 00:22:40 +00:00
cbc802b7e9  Try Hackernews API twice    2020-11-02 00:17:22 +00:00
4579dfce00  Improve logging             2020-11-02 00:13:43 +00:00
8 changed files with 13 additions and 157 deletions

View File

@@ -1,52 +0,0 @@
-from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter
-from whoosh.index import create_in, open_dir, exists_in
-from whoosh.fields import *
-from whoosh.qparser import QueryParser
-from whoosh.support.charset import accent_map
-
-analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=3)
-
-title_field = TEXT(analyzer=analyzer, stored=True)
-id_field = ID(unique=True, stored=True)
-
-schema = Schema(
-    id=id_field,
-    title=title_field,
-    story=STORED,
-)
-
-ARCHIVE_LOCATION = 'data/archive'
-
-ix = None
-
-def init():
-    global ix
-    if exists_in(ARCHIVE_LOCATION):
-        ix = open_dir(ARCHIVE_LOCATION)
-    else:
-        ix = create_in(ARCHIVE_LOCATION, schema)
-
-def update(story):
-    writer = ix.writer()
-    writer.update_document(
-        id=story['id'],
-        title=story['title'],
-        story=story,
-    )
-    writer.commit()
-
-def get_story(sid):
-    with ix.searcher() as searcher:
-        result = searcher.document(id=sid)
-        return result['story'] if result else None
-
-def search(search):
-    with ix.searcher() as searcher:
-        query = QueryParser('title', ix.schema).parse(search)
-        results = searcher.search(query)
-        stories = [r['story'] for r in results]
-        for s in stories:
-            s.pop('text', '')
-            s.pop('comments', '')
-        return stories

View File

@@ -10,7 +10,6 @@ from bs4 import BeautifulSoup
 from feeds import hackernews, reddit, tildes, manual
 
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-ARCHIVE_API = 'https://archive.fo/submit/'
 READ_API = 'http://127.0.0.1:33843'
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -72,8 +71,6 @@ def get_content_type(url):
 def update_story(story, is_manual=False):
     res = {}
 
-    logging.info('Updating story ' + str(story['ref']))
-
     if story['source'] == 'hackernews':
         res = hackernews.story(story['ref'])
     elif story['source'] == 'reddit':

View File

@@ -25,6 +25,16 @@ def api(route, ref=None):
         return r.json()
     except KeyboardInterrupt:
         raise
+    except BaseException as e:
+        logging.error('Problem hitting hackernews API: {}, trying again'.format(str(e)))
+
+    try:
+        r = requests.get(route(ref), timeout=15)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
     except BaseException as e:
         logging.error('Problem hitting hackernews API: {}'.format(str(e)))
         return False

View File

@@ -1,26 +0,0 @@
-import shelve
-
-import archive
-
-archive.init()
-
-#with shelve.open('data/data') as db:
-#    to_delete = []
-#
-#    for s in db.values():
-#        if 'title' in s:
-#            archive.update(s)
-#        if 'id' in s:
-#            to_delete.append(s['id'])
-#
-#    for id in to_delete:
-#        del db[id]
-#
-#    for s in db['news_cache'].values():
-#        if 'title' in s:
-#            archive.update(s)
-
-#with shelve.open('data/whoosh') as db:
-#    for s in db['news_cache'].values():
-#        if 'title' in s and not archive.get_story(s['id']):
-#            archive.update(s)

View File

@@ -1,74 +0,0 @@
-import archive
-import database
-import search
-import json
-import requests
-
-database.init()
-archive.init()
-search.init()
-
-count = 0
-
-def database_del_story_by_ref(ref):
-    try:
-        session = database.Session()
-        session.query(database.Story).filter(database.Story.ref==ref).delete()
-        session.commit()
-    except:
-        session.rollback()
-        raise
-    finally:
-        session.close()
-
-def search_del_story(sid):
-    try:
-        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/'+sid, timeout=2)
-        if r.status_code != 202:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem deleting MeiliSearch story: {}'.format(str(e)))
-        return False
-
-with archive.ix.searcher() as searcher:
-    print('count all', searcher.doc_count_all())
-    print('count', searcher.doc_count())
-
-    for doc in searcher.documents():
-        try:
-            print('num', count, 'id', doc['id'])
-            count += 1
-            story = doc['story']
-            story.pop('img', None)
-
-            if 'reddit.com/r/technology' in story['link']:
-                print('skipping r/technology')
-                continue
-
-            try:
-                database.put_story(story)
-            except database.IntegrityError:
-                print('collision!')
-                old_story = database.get_story_by_ref(story['ref'])
-                old_story = json.loads(old_story.full_json)
-
-                if story['num_comments'] > old_story['num_comments']:
-                    print('more comments, replacing')
-                    database_del_story_by_ref(story['ref'])
-                    database.put_story(story)
-                    search_del_story(old_story['id'])
-                else:
-                    print('fewer comments, skipping')
-                    continue
-
-            search.put_story(story)
-            print()
-
-        except KeyboardInterrupt:
-            break
-        except BaseException as e:
-            print('skipping', doc['id'])
-            print('reason:', e)

View File

@@ -25,6 +25,5 @@ urllib3==1.25.9
 webencodings==0.5.1
 websocket-client==0.57.0
 Werkzeug==1.0.1
-Whoosh==2.7.4
 zope.event==4.4
 zope.interface==5.1.0

View File

@@ -175,6 +175,8 @@ def feed_thread():
                 except AttributeError:
                     story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
 
+                logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
+
                 valid = feed.update_story(story)
                 if valid:
                     database.put_story(story)
@@ -183,7 +185,7 @@ def feed_thread():
                     database.del_ref(item['ref'])
                     logging.info('Removed ref {}'.format(item['ref']))
             else:
-                logging.info('Skipping index')
+                logging.info('Skipping index: ' + str(news_index))
 
             gevent.sleep(6)