parent
cbc802b7e9
commit
e59acefda9
5 changed files with 0 additions and 153 deletions
@ -1,52 +0,0 @@ |
||||
from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter |
||||
from whoosh.index import create_in, open_dir, exists_in |
||||
from whoosh.fields import * |
||||
from whoosh.qparser import QueryParser |
||||
from whoosh.support.charset import accent_map |
||||
|
||||
# Index titles with stemming, accent folding, and 3+-char n-grams so
# partial-word and accent-insensitive queries still match.
analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=3)

# Title is searchable and stored; id is the unique document key;
# the full story dict is stored verbatim (not indexed).
title_field = TEXT(analyzer=analyzer, stored=True)
id_field = ID(unique=True, stored=True)

schema = Schema(
    id=id_field,
    title=title_field,
    story=STORED,
)

# Directory holding the on-disk Whoosh index files.
ARCHIVE_LOCATION = 'data/archive'

# Module-global index handle; populated by init() before any other call.
ix = None
||||
|
||||
def init():
    """Open the Whoosh index under ARCHIVE_LOCATION, creating it on first run.

    Must be called before update(), get_story(), or search().
    """
    global ix
    if not exists_in(ARCHIVE_LOCATION):
        ix = create_in(ARCHIVE_LOCATION, schema)
    else:
        ix = open_dir(ARCHIVE_LOCATION)
||||
|
||||
def update(story):
    """Insert *story* into the index, replacing any entry with the same id.

    Expects a dict with at least 'id' and 'title' keys; the whole dict is
    stored alongside the indexed fields.
    """
    writer = ix.writer()
    writer.update_document(id=story['id'], title=story['title'], story=story)
    writer.commit()
||||
|
||||
def get_story(sid):
    """Return the stored story dict for id *sid*, or None if not indexed."""
    with ix.searcher() as searcher:
        doc = searcher.document(id=sid)
    if doc:
        return doc['story']
    return None
||||
|
||||
def search(search):
    """Query story titles for *search* and return the matching story dicts.

    Bulky fields ('text', 'comments') are stripped from each result before
    it is returned, so this is suitable for listing pages.
    """
    parser = QueryParser('title', ix.schema)
    with ix.searcher() as searcher:
        hits = searcher.search(parser.parse(search))
        stories = []
        for hit in hits:
            story = hit['story']
            # Drop the heavy fields; callers fetch them via get_story().
            story.pop('text', '')
            story.pop('comments', '')
            stories.append(story)
        return stories
@ -1,26 +0,0 @@ |
||||
import shelve |
||||
|
||||
import archive |
||||
|
||||
archive.init()

# NOTE(review): the migrations below are one-shot jobs that appear to have
# already been run, hence commented out. Kept for reference.

# Pull every story with a title out of the legacy shelve DB into the
# archive, then delete the migrated entries from the shelve.
#with shelve.open('data/data') as db:
#    to_delete = []
#
#    for s in db.values():
#        if 'title' in s:
#            archive.update(s)
#        if 'id' in s:
#            to_delete.append(s['id'])
#
#    for id in to_delete:
#        del db[id]
#
#    for s in db['news_cache'].values():
#        if 'title' in s:
#            archive.update(s)

# Backfill: index any cached story that is not yet in the archive.
#with shelve.open('data/whoosh') as db:
#    for s in db['news_cache'].values():
#        if 'title' in s and not archive.get_story(s['id']):
#            archive.update(s)
@ -1,74 +0,0 @@ |
||||
import json
import logging

import requests

import archive
import database
import search
||||
|
||||
# Open handles to all three backing stores before migrating.
database.init()
archive.init()
search.init()

# Running tally of archive documents processed, for progress output.
count = 0
||||
|
||||
def database_del_story_by_ref(ref):
    """Delete every Story row whose ref equals *ref*, committing on success.

    Rolls back and re-raises on any failure; always closes the session.
    """
    # Create the session BEFORE the try: in the original, a failure inside
    # database.Session() left `session` unbound and the finally clause then
    # raised UnboundLocalError, masking the real error.
    session = database.Session()
    try:
        session.query(database.Story).filter(database.Story.ref == ref).delete()
        session.commit()
    except BaseException:
        # Explicit equivalent of the original bare `except:` — still covers
        # KeyboardInterrupt so the transaction is rolled back, then re-raised.
        session.rollback()
        raise
    finally:
        session.close()
||||
|
||||
def search_del_story(sid):
    """Delete story *sid* from the MeiliSearch index.

    Returns the parsed JSON response on success (HTTP 202), or False on any
    failure. KeyboardInterrupt is re-raised so the migration can be aborted.

    Note: relies on `import logging` at module level — the original file
    never imported it, so the error path raised NameError instead of logging.
    """
    try:
        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/' + sid, timeout=2)
        # MeiliSearch acknowledges async deletes with 202 Accepted.
        if r.status_code != 202:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem deleting MeiliSearch story: {}'.format(str(e)))
        return False
||||
|
||||
# One-off migration: walk every document in the Whoosh archive and copy it
# into the SQL database and the MeiliSearch index, de-duplicating on ref.
with archive.ix.searcher() as searcher:
    print('count all', searcher.doc_count_all())
    print('count', searcher.doc_count())

    for doc in searcher.documents():
        try:
            print('num', count, 'id', doc['id'])
            count += 1

            story = doc['story']
            # The cached image blob is not migrated.
            story.pop('img', None)

            if 'reddit.com/r/technology' in story['link']:
                print('skipping r/technology')
                continue

            try:
                database.put_story(story)
            except database.IntegrityError:
                # Two archive entries share a ref; keep whichever version
                # has the livelier comment thread.
                print('collision!')
                old_story = database.get_story_by_ref(story['ref'])
                old_story = json.loads(old_story.full_json)
                if story['num_comments'] > old_story['num_comments']:
                    print('more comments, replacing')
                    database_del_story_by_ref(story['ref'])
                    database.put_story(story)
                    search_del_story(old_story['id'])
                else:
                    print('fewer comments, skipping')
                    continue

            search.put_story(story)
            print()
        except KeyboardInterrupt:
            break
        except BaseException as e:
            # Best-effort migration: report the bad document and move on.
            print('skipping', doc['id'])
            print('reason:', e)
Loading…
Reference in new issue