Fixed an issue where results weren't returned as they should be
@@ -1,58 +1,18 @@
 import MiniSearch, { type Options, type SearchResult } from 'minisearch'
 import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
-import {
-  BRACKETS_AND_SPACE,
-  chsRegex,
-  getChsSegmenter,
-  SPACE_OR_PUNCTUATION,
-} from '../globals'

 import { settings } from '../settings'
-import {
-  chunkArray,
-  logDebug,
-  removeDiacritics,
-  splitCamelCase,
-  splitHyphens,
-} from '../tools/utils'
+import { chunkArray, logDebug, removeDiacritics } from '../tools/utils'
 import { Notice } from 'obsidian'
 import type { Query } from './query'
 import { cacheManager } from '../cache-manager'
 import { sortBy } from 'lodash-es'
 import { getMatches, stringsToRegex } from 'src/tools/text-processing'

-const tokenize = (text: string): string[] => {
-  const words = text.split(BRACKETS_AND_SPACE)
-
-  let tokens = text.split(SPACE_OR_PUNCTUATION)
-
-  // Split hyphenated tokens
-  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
-
-  // Split camelCase tokens into "camel" and "case
-  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
-
-  // Add whole words (aka "not tokens")
-  tokens = [...tokens, ...words]
-
-  // When enabled, we only use the chsSegmenter,
-  // and not the other custom tokenizers
-  const chsSegmenter = getChsSegmenter()
-  if (chsSegmenter) {
-    const chs = tokens.flatMap(word =>
-      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
-    )
-    tokens = [...tokens, ...chs]
-  }
-
-  // Remove duplicates
-  tokens = [...new Set(tokens)]
-
-  return tokens
-}
+import { tokenizeForIndexing, tokenizeForSearch } from './tokenizer'

 export class Omnisearch {
   public static readonly options: Options<IndexedDocument> = {
-    tokenize,
+    tokenize: tokenizeForIndexing,
     extractField: (doc, fieldName) => {
       if (fieldName === 'directory') {
         // return path without the filename
@@ -212,14 +172,13 @@ export class Omnisearch {
         break
     }

-    let results = this.minisearch.search(query.segmentsToStr(), {
+    let results = this.minisearch.search(tokenizeForSearch(query.segmentsToStr()), {
       prefix: term => term.length >= options.prefixLength,
       // length <= 3: no fuzziness
       // length <= 5: fuzziness of 10%
       // length > 5: fuzziness of 20%
       fuzzy: term =>
         term.length <= 3 ? 0 : term.length <= 5 ? fuzziness / 2 : fuzziness,
       combineWith: 'AND',
       boost: {
         basename: settings.weightBasename,
         directory: settings.weightDirectory,
src/search/tokenizer.ts (new file, 73 lines)
@@ -0,0 +1,73 @@
+import type { QueryCombination } from 'minisearch'
+import {
+  BRACKETS_AND_SPACE,
+  SPACE_OR_PUNCTUATION,
+  chsRegex,
+  getChsSegmenter,
+} from 'src/globals'
+import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils'
+
+function tokenizeWords(text: string): string[] {
+  return text.split(BRACKETS_AND_SPACE)
+}
+
+function tokenizeTokens(text: string): string[] {
+  return text.split(SPACE_OR_PUNCTUATION)
+}
+
+/**
+ * Tokenization for indexing will possibly return more tokens than the original text.
+ * This is because we combine different methods of tokenization to get the best results.
+ * @param text
+ * @returns
+ */
+export function tokenizeForIndexing(text: string): string[] {
+  const words = tokenizeWords(text)
+
+  let tokens = tokenizeTokens(text)
+
+  // Split hyphenated tokens
+  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
+
+  // Split camelCase tokens into "camel" and "case
+  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
+
+  // Add whole words (aka "not tokens")
+  tokens = [...tokens, ...words]
+
+  // When enabled, we only use the chsSegmenter,
+  // and not the other custom tokenizers
+  const chsSegmenter = getChsSegmenter()
+  if (chsSegmenter) {
+    const chs = tokens.flatMap(word =>
+      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
+    )
+    tokens = [...tokens, ...chs]
+  }
+
+  // Remove duplicates
+  tokens = [...new Set(tokens)]
+
+  return tokens
+}
+
+/**
+ * Search tokenization will use the same tokenization methods as indexing,
+ * but will combine each group with "OR" operators
+ * @param text
+ * @returns
+ */
+export function tokenizeForSearch(text: string): QueryCombination {
+  const tokens = tokenizeTokens(text)
+  const query = {
+    combineWith: 'OR',
+    queries: [
+      { combineWith: 'AND', queries: tokens },
+      { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
+      { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
+      { combineWith: 'AND', queries: tokenizeWords(text) },
+    ],
+  }
+  logDebug(JSON.stringify(query, null, 1))
+  return query
+}
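
For reference, here is a minimal sketch (not part of the commit) of how the new tokenizer plugs into MiniSearch. The field list, the sample documents, and the query string are illustrative assumptions, not taken from the plugin; the relevant point is that MiniSearch's search() accepts a query-combination object as well as a plain string, which is what lets tokenizeForSearch return an OR over AND-groups of whole-word, hyphen-split, and camelCase-split variants of the query.

import MiniSearch from 'minisearch'
import { tokenizeForIndexing, tokenizeForSearch } from './tokenizer'

// Index a couple of documents with the indexing tokenizer, which emits the
// plain splits plus hyphen/camelCase fragments and the whole words.
// The fields and documents below are made up for illustration.
const index = new MiniSearch({
  fields: ['basename', 'content'],
  storeFields: ['basename'],
  tokenize: tokenizeForIndexing,
})
index.addAll([
  { id: 1, basename: 'quick-fix', content: 'renderMarkdown helper' },
  { id: 2, basename: 'notes', content: 'markdown render pipeline' },
])

// The search side passes a query combination instead of a raw string: an OR
// over AND-groups built from the raw tokens, their hyphen splits, their
// camelCase splits, and the whole words.
const results = index.search(tokenizeForSearch('renderMarkdown'), {
  fuzzy: 0.2,
  prefix: true,
})
console.log(results.map(r => r.basename))

Because the camelCase group is ANDed separately, a document that was only indexed under "render" and "Markdown" can still satisfy the query even if the exact token "renderMarkdown" was never indexed.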