From 7cb87b59fe26c10a47c1bae266770f28efb38824 Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Sat, 12 Oct 2019 05:32:17 +0000 Subject: [PATCH] Move archive to Whoosh and add search --- apiserver/.gitignore | 2 + apiserver/archive.py | 52 +++++++++++++++++ apiserver/migrate-shelve-to-whoosh.py | 26 +++++++++ apiserver/server.py | 56 ++++++++++-------- webclient/package.json | 1 + webclient/src/App.js | 11 +++- webclient/src/Article.js | 24 ++++---- webclient/src/Comments.js | 24 ++++---- webclient/src/Feed.js | 27 +++++---- webclient/src/Results.js | 83 +++++++++++++++++++++++++++ webclient/src/Search.js | 53 +++++++++++++++++ webclient/src/Style-dark.css | 5 ++ webclient/src/Style-light.css | 8 +++ webclient/yarn.lock | 19 ++++++ 14 files changed, 328 insertions(+), 63 deletions(-) create mode 100644 apiserver/archive.py create mode 100644 apiserver/migrate-shelve-to-whoosh.py create mode 100644 webclient/src/Results.js create mode 100644 webclient/src/Search.js diff --git a/apiserver/.gitignore b/apiserver/.gitignore index f4e05cc..dde3e1e 100644 --- a/apiserver/.gitignore +++ b/apiserver/.gitignore @@ -107,3 +107,5 @@ db.sqlite3 praw.ini data.db +data.db.bak +data/archive/* diff --git a/apiserver/archive.py b/apiserver/archive.py new file mode 100644 index 0000000..ca2e76b --- /dev/null +++ b/apiserver/archive.py @@ -0,0 +1,52 @@ +from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter +from whoosh.index import create_in, open_dir, exists_in +from whoosh.fields import * +from whoosh.qparser import QueryParser +from whoosh.support.charset import accent_map + +analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=3) + +title_field = TEXT(analyzer=analyzer, stored=True) +id_field = ID(unique=True, stored=True) + +schema = Schema( + id=id_field, + title=title_field, + story=STORED, + ) + +ARCHIVE_LOCATION = 'data/archive' + +ix = None + +def init(): + global ix + + if exists_in(ARCHIVE_LOCATION): + ix = open_dir(ARCHIVE_LOCATION) + else: + ix = create_in(ARCHIVE_LOCATION, schema) + +def update(story): + writer = ix.writer() + writer.update_document( + id=story['id'], + title=story['title'], + story=story, + ) + writer.commit() + +def get_story(id): + with ix.searcher() as searcher: + result = searcher.document(id=id) + return result['story'] if result else None + +def search(search): + with ix.searcher() as searcher: + query = QueryParser('title', ix.schema).parse(search) + results = searcher.search(query) + stories = [r['story'] for r in results] + for s in stories: + s.pop('text', '') + s.pop('comments', '') + return stories diff --git a/apiserver/migrate-shelve-to-whoosh.py b/apiserver/migrate-shelve-to-whoosh.py new file mode 100644 index 0000000..4bfae32 --- /dev/null +++ b/apiserver/migrate-shelve-to-whoosh.py @@ -0,0 +1,26 @@ +import shelve + +import archive + +archive.init() + +#with shelve.open('data/data') as db: +# to_delete = [] +# +# for s in db.values(): +# if 'title' in s: +# archive.update(s) +# if 'id' in s: +# to_delete.append(s['id']) +# +# for id in to_delete: +# del db[id] +# +# for s in db['news_cache'].values(): +# if 'title' in s: +# archive.update(s) + +#with shelve.open('data/whoosh') as db: +# for s in db['news_cache'].values(): +# if 'title' in s and not archive.get_story(s['id']): +# archive.update(s) diff --git a/apiserver/server.py b/apiserver/server.py index d445bbf..3c1ac09 100644 --- a/apiserver/server.py +++ b/apiserver/server.py @@ -9,6 +9,7 @@ import time import shelve from urllib.parse import urlparse +import archive import feed from utils import gen_rand_id @@ -16,6 +17,8 @@ from flask import abort, Flask, request, render_template from werkzeug.exceptions import NotFound from flask_cors import CORS +archive.init() + CACHE_LENGTH = 300 DATA_FILE = 'data/data' @@ -29,11 +32,9 @@ with shelve.open(DATA_FILE) as db: def get_story(id): if id in news_cache: - return {'story': news_cache[id]} - with shelve.open(DATA_FILE) as db: - if id in db: - return {'story': db[id]} - return None + return news_cache[id] + else: + return archive.get_story(id) build_folder = '../webclient/build' flask_app = Flask(__name__, template_folder=build_folder, static_folder=build_folder, static_url_path='') @@ -42,15 +43,26 @@ cors = CORS(flask_app) @flask_app.route('/api') def api(): front_page = [news_cache[news_ref_to_id[ref]] for ref in news_list] - front_page = [copy.copy(x) for x in front_page if 'text' in x and x['text']][:100] - for story in front_page: - if 'comments' in story: story.pop('comments') - if 'text' in story: story.pop('text') + front_page = [x for x in front_page if 'title' in x and x['title']] + front_page = front_page[:100] + to_remove = ['text', 'comments'] + front_page = [{k:v for k,v in s.items() if k not in to_remove} for s in front_page] + return {'stories': front_page} +@flask_app.route('/api/search', strict_slashes=False) +def search(): + search = request.args.get('q', '') + if len(search) >= 3: + res = archive.search(search) + else: + res = [] + return {'results': res} + @flask_app.route('/api/') def story(id): - return get_story(id) or abort(404) + story = get_story(id) + return dict(story=story) if story else abort(404) @flask_app.route('/') def index(): @@ -68,10 +80,7 @@ def static_story(id): pass story = get_story(id) - if story: - story = story['story'] - else: - return abort(404) + if not story: return abort(404) score = story['score'] num_comments = story['num_comments'] @@ -94,23 +103,20 @@ web_thread.start() def new_id(): nid = gen_rand_id() - with shelve.open(DATA_FILE) as db: - while nid in news_cache or nid in db: - nid = gen_rand_id() + while nid in news_cache or archive.get_story(nid): + nid = gen_rand_id() return nid -def remove_ref(old_ref, archive=False): +def remove_ref(old_ref): while old_ref in news_list: news_list.remove(old_ref) old_story = news_cache.pop(news_ref_to_id[old_ref]) old_id = news_ref_to_id.pop(old_ref) logging.info('Removed ref {} id {}.'.format(old_ref, old_id)) - if archive: - with shelve.open(DATA_FILE) as db: - db[old_id] = old_story try: while True: + # onboard new stories if news_index == 0: feed_list = feed.list() new_items = [(ref, source) for ref, source in feed_list if ref not in news_list] @@ -123,16 +129,20 @@ try: if len(new_items): logging.info('Added {} new refs.'.format(len(new_items))) + # drop old ones while len(news_list) > CACHE_LENGTH: old_ref = news_list[-1] - remove_ref(old_ref, archive=True) + remove_ref(old_ref) + # update current stories if news_index < len(news_list): update_ref = news_list[news_index] update_id = news_ref_to_id[update_ref] news_story = news_cache[update_id] valid = feed.update_story(news_story) - if not valid: + if valid: + archive.update(news_story) + else: remove_ref(update_ref) time.sleep(3) diff --git a/webclient/package.json b/webclient/package.json index 104b392..9a0b3a3 100644 --- a/webclient/package.json +++ b/webclient/package.json @@ -4,6 +4,7 @@ "private": true, "dependencies": { "moment": "^2.24.0", + "query-string": "^6.8.3", "react": "^16.9.0", "react-dom": "^16.9.0", "react-helmet": "^5.2.1", diff --git a/webclient/src/App.js b/webclient/src/App.js index 05d5d27..1363295 100644 --- a/webclient/src/App.js +++ b/webclient/src/App.js @@ -1,11 +1,13 @@ import React from 'react'; -import { BrowserRouter as Router, Route, Link } from 'react-router-dom'; +import { BrowserRouter as Router, Route, Link, Switch } from 'react-router-dom'; import './Style-light.css'; import './Style-dark.css'; import './fonts/Fonts.css'; import Feed from './Feed.js'; import Article from './Article.js'; import Comments from './Comments.js'; +import Search from './Search.js'; +import Results from './Results.js'; import ScrollToTop from './ScrollToTop.js'; class App extends React.Component { @@ -41,10 +43,15 @@ class App extends React.Component {
Reddit, Hacker News, and Tildes combined, then pre-rendered in reader mode.

+ + + + + + - diff --git a/webclient/src/Article.js b/webclient/src/Article.js index 4b30026..6d9e3a1 100644 --- a/webclient/src/Article.js +++ b/webclient/src/Article.js @@ -15,20 +15,20 @@ class Article extends React.Component { }; } - componentDidMount() { + componentDidMount() { const id = this.props.match.params.id; - fetch('/api/' + id) - .then(res => res.json()) - .then( - (result) => { - this.setState({ story: result.story }); - localStorage.setItem(id, JSON.stringify(result.story)); - }, - (error) => { - this.setState({ error: true }); - } - ); + fetch('/api/' + id) + .then(res => res.json()) + .then( + (result) => { + this.setState({ story: result.story }); + localStorage.setItem(id, JSON.stringify(result.story)); + }, + (error) => { + this.setState({ error: true }); + } + ); } render() { diff --git a/webclient/src/Comments.js b/webclient/src/Comments.js index ca53000..8bb98ac 100644 --- a/webclient/src/Comments.js +++ b/webclient/src/Comments.js @@ -17,25 +17,25 @@ class Article extends React.Component { }; } - componentDidMount() { + componentDidMount() { const id = this.props.match.params.id; - fetch('/api/' + id) - .then(res => res.json()) - .then( - (result) => { - localStorage.setItem(id, JSON.stringify(result.story)); - this.setState({ story: result.story }, () => { + fetch('/api/' + id) + .then(res => res.json()) + .then( + (result) => { + localStorage.setItem(id, JSON.stringify(result.story)); + this.setState({ story: result.story }, () => { const hash = window.location.hash.substring(1); if (hash) { document.getElementById(hash).scrollIntoView(); } }); - }, - (error) => { - this.setState({ error: true }); - } - ); + }, + (error) => { + this.setState({ error: true }); + } + ); } displayComment(story, c, level) { diff --git a/webclient/src/Feed.js b/webclient/src/Feed.js index 2a77fd4..7da5e9c 100644 --- a/webclient/src/Feed.js +++ b/webclient/src/Feed.js @@ -14,14 +14,14 @@ class Feed extends React.Component { }; } - componentDidMount() { - fetch('/api') - .then(res => res.json()) - .then( - (result) => { - this.setState({ stories: result.stories }); + componentDidMount() { + fetch('/api') + .then(res => res.json()) + .then( + (result) => { + this.setState({ stories: result.stories }); clearStorage(); - localStorage.setItem('stories', JSON.stringify(result.stories)); + localStorage.setItem('stories', JSON.stringify(result.stories)); result.stories.filter(x => x.score >= 20).slice(0, 25).forEach(x => { fetch('/api/' + x.id) .then(res => res.json()) @@ -31,11 +31,11 @@ class Feed extends React.Component { }, error => {} ); }); - }, - (error) => { - this.setState({ error: true }); - } - ); + }, + (error) => { + this.setState({ error: true }); + } + ); } render() { @@ -46,13 +46,12 @@ class Feed extends React.Component {
Feed - QotNews - {error &&

Connection error?

} {stories ?
{stories.map((x, i) => -
+
{i+1}.
diff --git a/webclient/src/Results.js b/webclient/src/Results.js new file mode 100644 index 0000000..f643481 --- /dev/null +++ b/webclient/src/Results.js @@ -0,0 +1,83 @@ +import React from 'react'; +import { Link } from 'react-router-dom'; +import { Helmet } from 'react-helmet'; +import { siteLogo, sourceLink, infoLine } from './utils.js'; + +class Results extends React.Component { + constructor(props) { + super(props); + + this.state = { + stories: false, + error: false, + }; + + } + + performSearch = () => { + const search = this.props.location.search; + fetch('/api/search' + search) + .then(res => res.json()) + .then( + (result) => { + this.setState({ stories: result.results }); + }, + (error) => { + this.setState({ error: true }); + } + ); + } + + componentDidMount() { + this.performSearch(); + } + + componentDidUpdate(prevProps) { + if (this.props.location.search !== prevProps.location.search) { + this.performSearch(); + } + } + + render() { + const stories = this.state.stories; + const error = this.state.error; + + return ( +
+ + Feed - QotNews + + {error &&

Connection error?

} + {stories ? +
+ {stories.length ? + stories.map((x, i) => +
+
+ {i+1}. +
+ +
+ {siteLogo[x.source]} {x.title} + + + ​({sourceLink(x)}) + +
+ + {infoLine(x)} +
+ ) + : +

no results

+ } +
+ : +

loading...

+ } +
+ ); + } +} + +export default Results; diff --git a/webclient/src/Search.js b/webclient/src/Search.js new file mode 100644 index 0000000..52cbbe4 --- /dev/null +++ b/webclient/src/Search.js @@ -0,0 +1,53 @@ +import React, { Component } from 'react'; +import { withRouter } from 'react-router-dom'; +import queryString from 'query-string'; + +const getSearch = props => queryString.parse(props.location.search).q; + +class Search extends Component { + constructor(props) { + super(props); + + this.state = {search: getSearch(this.props)}; + this.inputRef = React.createRef(); + } + + searchArticles = (event) => { + const search = event.target.value; + this.setState({search: search}); + if (search.length >= 3) { + const searchQuery = queryString.stringify({ 'q': search }); + this.props.history.replace('/search?' + searchQuery); + } else { + this.props.history.replace('/'); + } + } + + searchAgain = (event) => { + event.preventDefault(); + const searchString = queryString.stringify({ 'q': event.target[0].value }); + this.props.history.push('/search?' + searchString); + this.inputRef.current.blur(); + } + + render() { + const search = this.state.search; + + return ( +
+
+
+ +
+
+
+ ); + } +} + +export default withRouter(Search); diff --git a/webclient/src/Style-dark.css b/webclient/src/Style-dark.css index 5b8b8db..4542a97 100644 --- a/webclient/src/Style-dark.css +++ b/webclient/src/Style-dark.css @@ -6,6 +6,11 @@ color: #ddd; } +.dark input { + color: #ddd; + border: 1px solid #828282; +} + .dark .item { color: #828282; } diff --git a/webclient/src/Style-light.css b/webclient/src/Style-light.css index 48cc39c..d29804f 100644 --- a/webclient/src/Style-light.css +++ b/webclient/src/Style-light.css @@ -11,6 +11,14 @@ a { outline: none; } +input { + font-size: 1.05rem; + background-color: transparent; + border: 1px solid #828282; + padding: 6px; + border-radius: 4px; +} + .container { margin: 1rem auto; max-width: 64rem; diff --git a/webclient/yarn.lock b/webclient/yarn.lock index 0912679..25c2558 100644 --- a/webclient/yarn.lock +++ b/webclient/yarn.lock @@ -7817,6 +7817,15 @@ qs@~6.5.2: resolved "https://registry.yarnpkg.com/qs/-/qs-6.5.2.tgz#cb3ae806e8740444584ef154ce8ee98d403f3e36" integrity sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA== +query-string@^6.8.3: + version "6.8.3" + resolved "https://registry.yarnpkg.com/query-string/-/query-string-6.8.3.tgz#fd9fb7ffb068b79062b43383685611ee47777d4b" + integrity sha512-llcxWccnyaWlODe7A9hRjkvdCKamEKTh+wH8ITdTc3OhchaqUZteiSCX/2ablWHVrkVIe04dntnaZJ7BdyW0lQ== + dependencies: + decode-uri-component "^0.2.0" + split-on-first "^1.0.0" + strict-uri-encode "^2.0.0" + querystring-es3@^0.2.0: version "0.2.1" resolved "https://registry.yarnpkg.com/querystring-es3/-/querystring-es3-0.2.1.tgz#9ec61f79049875707d69414596fd907a4d711e73" @@ -8878,6 +8887,11 @@ spdy@^4.0.0: select-hose "^2.0.0" spdy-transport "^3.0.0" +split-on-first@^1.0.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f" + integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw== + split-string@^3.0.1, split-string@^3.0.2: version "3.1.0" resolved "https://registry.yarnpkg.com/split-string/-/split-string-3.1.0.tgz#7cb09dda3a86585705c64b39a6466038682e8fe2" @@ -8972,6 +8986,11 @@ stream-shift@^1.0.0: resolved "https://registry.yarnpkg.com/stream-shift/-/stream-shift-1.0.0.tgz#d5c752825e5367e786f78e18e445ea223a155952" integrity sha1-1cdSgl5TZ+eG944Y5EXqIjoVWVI= +strict-uri-encode@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/strict-uri-encode/-/strict-uri-encode-2.0.0.tgz#b9c7330c7042862f6b142dc274bbcc5866ce3546" + integrity sha1-ucczDHBChi9rFC3CdLvMWGbONUY= + string-length@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/string-length/-/string-length-2.0.0.tgz#d40dbb686a3ace960c1cffca562bf2c45f8363ed"