Compare commits
10 Commits
50db35b667
...
1297a1034a
| Author | SHA1 | Date | |
|---|---|---|---|
| 1297a1034a | |||
| b195bf65ee | |||
| 2ef3a1392f | |||
| c75d5d89f7 | |||
| df73ab0f1c | |||
| 637c20905e | |||
| c4c4e782fb | |||
| 2b00a7af2d | |||
| 3c84980903 | |||
| f17f9756a3 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -22,3 +22,4 @@ dist
|
|||||||
coverage
|
coverage
|
||||||
package-lock.json
|
package-lock.json
|
||||||
Doc Omnisearch/.obsidian
|
Doc Omnisearch/.obsidian
|
||||||
|
.aider*
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ export type AIImageAnalyzerAPI = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export const SEPARATORS =
|
export const SEPARATORS =
|
||||||
/[|\t\n\r\^"= -#%-*,.`\/<>:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/
|
/[|\t\n\r\^"= -#-%&(*,.`\/<>:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2017\u201A-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/
|
||||||
.toString()
|
.toString()
|
||||||
.slice(1, -1)
|
.slice(1, -1)
|
||||||
export const SPACE_OR_PUNCTUATION = new RegExp(`${SEPARATORS}+`, 'u')
|
export const SPACE_OR_PUNCTUATION = new RegExp(`${SEPARATORS}+`, 'u')
|
||||||
|
|||||||
@@ -229,6 +229,28 @@ export class DocumentsRepository {
|
|||||||
metadata?.frontmatter?.[this.plugin.settings.displayTitle] ?? ''
|
metadata?.frontmatter?.[this.plugin.settings.displayTitle] ?? ''
|
||||||
}
|
}
|
||||||
const tags = getTagsFromMetadata(metadata)
|
const tags = getTagsFromMetadata(metadata)
|
||||||
|
const headings1 = metadata ? extractHeadingsFromCache(metadata, 1) : []
|
||||||
|
const headings2 = metadata ? extractHeadingsFromCache(metadata, 2) : []
|
||||||
|
const headings3 = metadata ? extractHeadingsFromCache(metadata, 3) : []
|
||||||
|
|
||||||
|
const lines = content.split('\n')
|
||||||
|
const colonHeadings: string[] = []
|
||||||
|
for (let i = 0; i < lines.length; i++) {
|
||||||
|
const line = lines[i].trim()
|
||||||
|
if (line.endsWith(':')) {
|
||||||
|
const prevLine = i > 0 ? lines[i - 1].trim() : null
|
||||||
|
const nextLine = i < lines.length - 1 ? lines[i + 1].trim() : null
|
||||||
|
|
||||||
|
if (
|
||||||
|
prevLine === '' &&
|
||||||
|
nextLine !== null &&
|
||||||
|
nextLine !== ''
|
||||||
|
) {
|
||||||
|
colonHeadings.push(line.slice(0, -1).trim())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
basename: file.basename,
|
basename: file.basename,
|
||||||
displayTitle,
|
displayTitle,
|
||||||
@@ -241,15 +263,9 @@ export class DocumentsRepository {
|
|||||||
tags: tags,
|
tags: tags,
|
||||||
unmarkedTags: tags.map(t => t.replace('#', '')),
|
unmarkedTags: tags.map(t => t.replace('#', '')),
|
||||||
aliases: getAliasesFromMetadata(metadata).join(''),
|
aliases: getAliasesFromMetadata(metadata).join(''),
|
||||||
headings1: metadata
|
headings1: headings1.join(' '),
|
||||||
? extractHeadingsFromCache(metadata, 1).join(' ')
|
headings2: headings2.join(' '),
|
||||||
: '',
|
headings3: [...headings3, ...colonHeadings].join(' '),
|
||||||
headings2: metadata
|
|
||||||
? extractHeadingsFromCache(metadata, 2).join(' ')
|
|
||||||
: '',
|
|
||||||
headings3: metadata
|
|
||||||
? extractHeadingsFromCache(metadata, 3).join(' ')
|
|
||||||
: '',
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import {
|
|||||||
type DocumentRef,
|
type DocumentRef,
|
||||||
type IndexedDocument,
|
type IndexedDocument,
|
||||||
type ResultNote,
|
type ResultNote,
|
||||||
|
type SearchMatch,
|
||||||
} from '../globals'
|
} from '../globals'
|
||||||
|
|
||||||
import {
|
import {
|
||||||
@@ -22,6 +23,8 @@ import { sortBy } from 'lodash-es'
|
|||||||
import type OmnisearchPlugin from '../main'
|
import type OmnisearchPlugin from '../main'
|
||||||
import { Tokenizer } from './tokenizer'
|
import { Tokenizer } from './tokenizer'
|
||||||
|
|
||||||
|
const STOP_WORDS = new Set(["a", "an", "the", "and", "or", "but", "if", "in", "on", "at", "by", "for", "with", "to", "from", "of", "is", "it", "that", "this"])
|
||||||
|
|
||||||
export class SearchEngine {
|
export class SearchEngine {
|
||||||
private tokenizer: Tokenizer
|
private tokenizer: Tokenizer
|
||||||
private minisearch: MiniSearch
|
private minisearch: MiniSearch
|
||||||
@@ -481,6 +484,16 @@ export class SearchEngine {
|
|||||||
query
|
query
|
||||||
)
|
)
|
||||||
|
|
||||||
|
let bestMatch: SearchMatch | undefined
|
||||||
|
if (
|
||||||
|
matches.length > 0 &&
|
||||||
|
(query.query.text.length > 1 || query.getExactTerms().length > 0) &&
|
||||||
|
query.getBestStringForExcerpt() &&
|
||||||
|
matches[0].match.toLowerCase() === query.getBestStringForExcerpt()
|
||||||
|
) {
|
||||||
|
bestMatch = matches.shift()
|
||||||
|
}
|
||||||
|
|
||||||
const lowerCaseBasename = note.basename.toLowerCase()
|
const lowerCaseBasename = note.basename.toLowerCase()
|
||||||
const titleMatchWord = foundWords.find(word =>
|
const titleMatchWord = foundWords.find(word =>
|
||||||
lowerCaseBasename.includes(word.toLowerCase())
|
lowerCaseBasename.includes(word.toLowerCase())
|
||||||
@@ -514,6 +527,10 @@ export class SearchEngine {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (bestMatch) {
|
||||||
|
matches.unshift(bestMatch)
|
||||||
|
}
|
||||||
|
|
||||||
logVerbose(`Matches for note "${note.path}"`, matches)
|
logVerbose(`Matches for note "${note.path}"`, matches)
|
||||||
const resultNote: ResultNote = {
|
const resultNote: ResultNote = {
|
||||||
score: result.score,
|
score: result.score,
|
||||||
@@ -559,11 +576,20 @@ export class SearchEngine {
|
|||||||
}
|
}
|
||||||
return (doc as any)[fieldName]
|
return (doc as any)[fieldName]
|
||||||
},
|
},
|
||||||
processTerm: (term: string) =>
|
processTerm: (term: string) => {
|
||||||
(this.plugin.settings.ignoreDiacritics
|
const processedTerm = (
|
||||||
? removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics)
|
this.plugin.settings.ignoreDiacritics
|
||||||
: term
|
? removeDiacritics(
|
||||||
).toLowerCase(),
|
term,
|
||||||
|
this.plugin.settings.ignoreArabicDiacritics
|
||||||
|
)
|
||||||
|
: term
|
||||||
|
).toLowerCase()
|
||||||
|
if (processedTerm.length < 3 || STOP_WORDS.has(processedTerm)) {
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
return processedTerm
|
||||||
|
},
|
||||||
idField: 'path',
|
idField: 'path',
|
||||||
fields: [
|
fields: [
|
||||||
'basename',
|
'basename',
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
import type { QueryCombination } from 'minisearch'
|
import type { Query, QueryCombination } from 'minisearch'
|
||||||
import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
|
import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
|
||||||
import { logVerbose, splitCamelCase, splitHyphens } from '../tools/utils'
|
import { logVerbose, splitCamelCase, splitHyphens } from '../tools/utils'
|
||||||
import type OmnisearchPlugin from '../main'
|
import type OmnisearchPlugin from '../main'
|
||||||
|
|
||||||
const markdownLinkExtractor = require('markdown-link-extractor')
|
const markdownLinkExtractor = require('markdown-link-extractor')
|
||||||
|
|
||||||
|
const STOP_WORDS = new Set(["a", "an", "the", "and", "or", "but", "if", "in", "on", "at", "by", "for", "with", "to", "from", "of", "is", "it", "that", "this"])
|
||||||
|
|
||||||
export class Tokenizer {
|
export class Tokenizer {
|
||||||
constructor(private plugin: OmnisearchPlugin) {}
|
constructor(private plugin: OmnisearchPlugin) {}
|
||||||
|
|
||||||
@@ -60,21 +62,47 @@ export class Tokenizer {
|
|||||||
public tokenizeForSearch(text: string): QueryCombination {
|
public tokenizeForSearch(text: string): QueryCombination {
|
||||||
// Extract urls and remove them from the query
|
// Extract urls and remove them from the query
|
||||||
const urls: string[] = markdownLinkExtractor(text)
|
const urls: string[] = markdownLinkExtractor(text)
|
||||||
|
const originalText = text
|
||||||
text = urls.reduce((acc, url) => acc.replace(url, ''), text)
|
text = urls.reduce((acc, url) => acc.replace(url, ''), text)
|
||||||
|
|
||||||
const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
|
const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
|
||||||
|
|
||||||
|
const isStopWord = (term: string): boolean => {
|
||||||
|
const lower = term.toLowerCase()
|
||||||
|
return lower.length < 3 || STOP_WORDS.has(lower)
|
||||||
|
}
|
||||||
|
|
||||||
|
const queries = [
|
||||||
|
{ combineWith: 'AND', queries: [originalText] },
|
||||||
|
{ combineWith: 'AND', queries: tokens },
|
||||||
|
{
|
||||||
|
combineWith: 'AND',
|
||||||
|
queries: this.tokenizeWords(text).filter(Boolean),
|
||||||
|
},
|
||||||
|
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
|
||||||
|
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
|
||||||
|
].map(q => ({
|
||||||
|
...q,
|
||||||
|
queries: q.queries.filter(t => !isStopWord(t)),
|
||||||
|
}))
|
||||||
|
|
||||||
|
const nonEmptyQueries = queries.filter(q => q.queries.length > 0)
|
||||||
|
|
||||||
|
// Deduplicate
|
||||||
|
const uniqueQueries = []
|
||||||
|
const seen = new Set()
|
||||||
|
for (const q of nonEmptyQueries) {
|
||||||
|
// sort to make order irrelevant for duplication check
|
||||||
|
const key = JSON.stringify(q.queries.sort())
|
||||||
|
if (!seen.has(key)) {
|
||||||
|
uniqueQueries.push(q)
|
||||||
|
seen.add(key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
combineWith: 'OR',
|
combineWith: 'OR',
|
||||||
queries: [
|
queries: uniqueQueries as Query[],
|
||||||
{ combineWith: 'AND', queries: tokens },
|
|
||||||
{
|
|
||||||
combineWith: 'AND',
|
|
||||||
queries: this.tokenizeWords(text).filter(Boolean),
|
|
||||||
},
|
|
||||||
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
|
|
||||||
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
|
|
||||||
],
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -64,7 +64,6 @@ export class TextProcessor {
|
|||||||
words: string[],
|
words: string[],
|
||||||
query?: Query
|
query?: Query
|
||||||
): SearchMatch[] {
|
): SearchMatch[] {
|
||||||
words = words.map(escapeHTML)
|
|
||||||
const reg = this.stringsToRegex(words)
|
const reg = this.stringsToRegex(words)
|
||||||
const originalText = text
|
const originalText = text
|
||||||
// text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
|
// text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
|
||||||
@@ -96,11 +95,16 @@ export class TextProcessor {
|
|||||||
query &&
|
query &&
|
||||||
(query.query.text.length > 1 || query.getExactTerms().length > 0)
|
(query.query.text.length > 1 || query.getExactTerms().length > 0)
|
||||||
) {
|
) {
|
||||||
const best = text.indexOf(query.getBestStringForExcerpt())
|
const bestMatchStr = query.getBestStringForExcerpt()
|
||||||
if (best > -1 && matches.find(m => m.offset === best)) {
|
const best = text.toLowerCase().indexOf(bestMatchStr)
|
||||||
|
if (best > -1) {
|
||||||
|
// We found the full query. We make it the first result, and remove any other match that it contains.
|
||||||
|
matches = matches.filter(
|
||||||
|
m => m.offset < best || m.offset >= best + bestMatchStr.length
|
||||||
|
)
|
||||||
matches.unshift({
|
matches.unshift({
|
||||||
offset: best,
|
offset: best,
|
||||||
match: query.getBestStringForExcerpt(),
|
match: originalText.substring(best, best + bestMatchStr.length),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user