parent
cbc802b7e9
commit
e59acefda9
5 changed files with 0 additions and 153 deletions
@ -1,52 +0,0 @@ |
|||||||
from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter |
|
||||||
from whoosh.index import create_in, open_dir, exists_in |
|
||||||
from whoosh.fields import * |
|
||||||
from whoosh.qparser import QueryParser |
|
||||||
from whoosh.support.charset import accent_map |
|
||||||
|
|
||||||
# Full-text index configuration for the story archive.
# Stemming + accent folding + 3-gram filtering gives fuzzy, typo-tolerant
# matching on titles.
analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=3)

# Titles are analyzed and stored; ids are unique keys; the full story dict
# is stored verbatim alongside each document for later retrieval.
title_field = TEXT(analyzer=analyzer, stored=True)
id_field = ID(unique=True, stored=True)

schema = Schema(
    id=id_field,
    title=title_field,
    story=STORED,
)

# On-disk location of the whoosh index directory.
ARCHIVE_LOCATION = 'data/archive'

# Module-global index handle; populated by init(), used by every other
# function in this module.
ix = None
|
||||||
|
|
||||||
def init():
    """Open the on-disk whoosh index, creating it if it does not exist.

    Sets the module-global ``ix``; must be called before update(),
    get_story(), or search().
    """
    global ix

    if exists_in(ARCHIVE_LOCATION):
        ix = open_dir(ARCHIVE_LOCATION)
    else:
        # whoosh's create_in() requires the target directory to already
        # exist; create it on first run instead of crashing.
        import os
        os.makedirs(ARCHIVE_LOCATION, exist_ok=True)
        ix = create_in(ARCHIVE_LOCATION, schema)
|
||||||
|
|
||||||
def update(story):
    """Insert or replace a story document in the index.

    ``story`` must be a dict with at least 'id' and 'title'; the whole
    dict is stored under the 'story' field.
    """
    # Use the writer as a context manager: it commits on success and
    # cancels (releasing the index write lock) if update_document raises.
    # The previous bare writer()/commit() pair leaked the lock on error,
    # wedging all later writes.
    with ix.writer() as writer:
        writer.update_document(
            id=story['id'],
            title=story['title'],
            story=story,
        )
|
||||||
|
|
||||||
def get_story(sid):
    """Return the stored story dict whose id is ``sid``, or None if absent."""
    with ix.searcher() as searcher:
        doc = searcher.document(id=sid)
        if not doc:
            return None
        return doc['story']
|
||||||
|
|
||||||
def search(search):
    """Run a title query and return the matching story dicts.

    Heavy fields ('text', 'comments') are stripped from each result so
    the list stays cheap to serialize.
    """
    with ix.searcher() as searcher:
        parsed = QueryParser('title', ix.schema).parse(search)
        hits = searcher.search(parsed)

        stories = []
        for hit in hits:
            story = hit['story']
            story.pop('text', '')
            story.pop('comments', '')
            stories.append(story)

        return stories
|
@ -1,26 +0,0 @@ |
|||||||
import shelve

import archive

archive.init()

# One-off migration snippets, kept for reference; run by uncommenting.
#
# Move stories out of the old shelve database into the whoosh archive,
# deleting each entry from the shelve once archived:
#with shelve.open('data/data') as db:
#    to_delete = []
#
#    for s in db.values():
#        if 'title' in s:
#            archive.update(s)
#        if 'id' in s:
#            to_delete.append(s['id'])
#
#    for id in to_delete:
#        del db[id]
#
#    for s in db['news_cache'].values():
#        if 'title' in s:
#            archive.update(s)

# Backfill the archive from the cached news in the whoosh shelve,
# skipping stories that are already archived:
#with shelve.open('data/whoosh') as db:
#    for s in db['news_cache'].values():
#        if 'title' in s and not archive.get_story(s['id']):
#            archive.update(s)
@ -1,74 +0,0 @@ |
|||||||
import archive |
|
||||||
import database |
|
||||||
import search |
|
||||||
|
|
||||||
import json |
|
||||||
import requests |
|
||||||
|
|
||||||
# Open all three backing stores before touching any data.
database.init()
archive.init()
search.init()

# Running tally of archive documents processed, used for progress output.
count = 0
|
||||||
|
|
||||||
def database_del_story_by_ref(ref):
    """Delete the Story row matching ``ref``, committing on success.

    Rolls back and re-raises on any error so the caller sees the failure.
    """
    # Create the session outside the try: if Session() itself raised inside
    # it, the finally block would hit an unbound ``session`` (NameError)
    # masking the real error.
    session = database.Session()
    try:
        session.query(database.Story).filter(database.Story.ref == ref).delete()
        session.commit()
    except BaseException:
        # Explicit BaseException is equivalent to the bare except it
        # replaces; the exception is always re-raised after rollback.
        session.rollback()
        raise
    finally:
        session.close()
|
||||||
|
|
||||||
def search_del_story(sid):
    """Delete a story document from the MeiliSearch index.

    Returns the parsed JSON response on success, False on failure.
    KeyboardInterrupt is re-raised so Ctrl-C still aborts the run.
    """
    # This script never imports logging at top level, so the original
    # error path raised NameError on top of the real failure; import it
    # locally to keep the handler working.
    import logging

    try:
        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/' + sid, timeout=2)
        if r.status_code != 202:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem deleting MeiliSearch story: {}'.format(str(e)))
        return False
|
||||||
|
|
||||||
# Replay every archived whoosh document into the SQL database and
# MeiliSearch. Best-effort: individual failures are reported and skipped;
# Ctrl-C stops the whole run.
with archive.ix.searcher() as searcher:
    print('count all', searcher.doc_count_all())
    print('count', searcher.doc_count())

    for doc in searcher.documents():
        try:
            print('num', count, 'id', doc['id'])
            count += 1

            story = doc['story']
            story.pop('img', None)  # drop stale image references

            if 'reddit.com/r/technology' in story['link']:
                print('skipping r/technology')
                continue

            try:
                database.put_story(story)
            except database.IntegrityError:
                # A story with this ref already exists; keep whichever
                # snapshot has more comments.
                print('collision!')
                old_story = database.get_story_by_ref(story['ref'])
                old_story = json.loads(old_story.full_json)
                if story['num_comments'] > old_story['num_comments']:
                    print('more comments, replacing')
                    database_del_story_by_ref(story['ref'])
                    database.put_story(story)
                    search_del_story(old_story['id'])
                else:
                    print('fewer comments, skipping')
                    continue

            search.put_story(story)
            print()
        except KeyboardInterrupt:
            break
        except BaseException as e:
            print('skipping', doc['id'])
            print('reason:', e)
|
Loading…
Reference in new issue