Merge branch 'master' into develop

Simon Cambier
2024-03-11 10:34:04 +01:00
20 changed files with 1598 additions and 2138 deletions

View File

@@ -1,5 +1,72 @@
 # Omnisearch Changelog
+
+This changelog is not exhaustive.
+
+## 1.21.x
+- Added support for .docx and .xlsx
+
+## 1.20.x
+- Refactored indexing tokenization process to correctly take diacritics into account
+- Added highlighting in the note's path
+- Improved the selection of the chosen excerpt in the results list
+
+## 1.19.x
+- Various bugfixes and improvements
+
+## 1.18.x
+- Added a localhost server to use Omnisearch's API from outside Obsidian
+
+## 1.17.x
+- Added a shortcut to open files without closing Omnisearch
+- Prefill the search field with selected text
+- Improved highlighting
+
+## 1.16.x
+- Various indexing/tokenization improvements
+
+## 1.15.x
+- Added support of webp images
+- Configurable fuzziness
+- Added support for DataLoom plugin files
+- Unsupported files are now indexed by their path
+- Unmarked tags are now slightly boosted
+
+## 1.14.x
+- Added a `path:` option
+- Bugfixes
+
+## 1.13.x
+- CamelCaseWords are now indexed as 3 words
+- Reduced search freezes in some cases
+
+## 1.12.x
+- You can filter files by their extension
+- Refreshed UI
+- New API functions
+- Fixed some tokenization issues
+
+## 1.10.x - 1.11.x
+- Added support for Text Extractor; Omnisearch no longer extracts text itself
+- Added canvas indexing
+- Improved tags indexing
+
+## 1.9.x
+- PDFs are no longer indexed on mobile
+- Performance improvements
+- Various bugfixes
+
 ## 1.8.x
 - Added OCR for images

View File

@@ -39,6 +39,7 @@ You can check the [CHANGELOG](./CHANGELOG.md) for more information on the differ
   its filename, and its headings
 - Keyboard first: you never have to use your mouse
 - Workflow similar to the "Quick Switcher" core plugin
+- Opt-in local HTTP server to query Omnisearch from outside of Obsidian
 - Resistance to typos
 - Switch between Vault and In-file search to quickly skim multiple results in a single note
 - Supports `"expressions in quotes"` and `-exclusions`

View File

@@ -1,7 +1,7 @@
 {
   "id": "omnisearch",
   "name": "Omnisearch",
-  "version": "1.20.0-beta.1",
+  "version": "1.21.1",
   "minAppVersion": "1.3.0",
   "description": "A search engine that just works",
   "author": "Simon Cambier",

View File

@@ -1,7 +1,7 @@
 {
   "id": "omnisearch",
   "name": "Omnisearch",
-  "version": "1.19.0",
+  "version": "1.21.1",
   "minAppVersion": "1.3.0",
   "description": "A search engine that just works",
   "author": "Simon Cambier",

View File

@@ -1,6 +1,6 @@
 {
   "name": "scambier.obsidian-search",
-  "version": "1.20.0-beta.1",
+  "version": "1.21.1",
   "description": "A search engine for Obsidian",
   "main": "dist/main.js",
   "scripts": {
@@ -14,36 +14,36 @@
"author": "Simon Cambier", "author": "Simon Cambier",
"license": "GPL-3", "license": "GPL-3",
"devDependencies": { "devDependencies": {
"@babel/preset-env": "^7.20.2", "@babel/preset-env": "^7.23.8",
"@babel/preset-typescript": "^7.18.6", "@babel/preset-typescript": "^7.23.3",
"@testing-library/jest-dom": "^5.16.5", "@testing-library/jest-dom": "^5.17.0",
"@tsconfig/svelte": "^3.0.0", "@tsconfig/svelte": "^3.0.0",
"@types/jest": "^27.5.2", "@types/jest": "^27.5.2",
"@types/lodash-es": "^4.17.6", "@types/lodash-es": "^4.17.12",
"@types/node": "^16.18.7", "@types/node": "^16.18.74",
"@types/pako": "^2.0.0", "@types/pako": "^2.0.3",
"babel-jest": "^27.5.1", "babel-jest": "^27.5.1",
"builtin-modules": "^3.3.0", "builtin-modules": "^3.3.0",
"esbuild": "0.14.0", "esbuild": "0.14.0",
"esbuild-plugin-copy": "1.3.0", "esbuild-plugin-copy": "1.3.0",
"esbuild-svelte": "0.7.1", "esbuild-svelte": "0.7.1",
"jest": "^27.5.1", "jest": "^27.5.1",
"obsidian": "^1.4.11", "obsidian": "1.3.5",
"prettier": "^2.8.1", "prettier": "^2.8.8",
"prettier-plugin-svelte": "^2.8.1", "prettier-plugin-svelte": "^2.10.1",
"svelte": "^3.54.0", "svelte": "^3.59.2",
"svelte-check": "^2.10.2", "svelte-check": "^2.10.3",
"svelte-jester": "^2.3.2", "svelte-jester": "^2.3.2",
"svelte-preprocess": "^4.10.7", "svelte-preprocess": "^4.10.7",
"tslib": "2.3.1", "tslib": "2.3.1",
"typescript": "^4.9.4", "typescript": "^4.9.5",
"vite": "^3.2.5" "vite": "^3.2.8"
}, },
"dependencies": { "dependencies": {
"cancelable-promise": "^4.3.1", "cancelable-promise": "^4.3.1",
"dexie": "^3.2.2", "dexie": "^3.2.4",
"lodash-es": "4.17.21", "lodash-es": "4.17.21",
"minisearch": "6.0.0-beta.1", "minisearch": "^6.3.0",
"pure-md5": "^0.1.14", "pure-md5": "^0.1.14",
"search-query-parser": "^1.6.0" "search-query-parser": "^1.6.0"
}, },

pnpm-lock.yaml (generated): 3281 lines changed
File diff suppressed because it is too large

View File

@@ -13,10 +13,13 @@ import {
   isFileFromDataloomPlugin,
   isFileImage,
   isFilePDF,
+  isFileOffice,
   isFilePlaintext,
   isFilenameIndexable,
   logDebug,
   makeMD5,
+  removeDiacritics,
+  stripMarkdownCharacters,
 } from './tools/utils'
 import type { CanvasData } from 'obsidian/canvas'
 import type { AsPlainObject } from 'minisearch'
@@ -104,6 +107,15 @@ async function getAndMapIndexedDocument(
     content = await extractor.extractText(file)
   }
+  // ** Office document **
+  else if (
+    isFileOffice(path) &&
+    settings.officeIndexing &&
+    extractor?.canFileBeExtracted(path)
+  ) {
+    content = await extractor.extractText(file)
+  }
   // ** Unsupported files **
   else if (isFilenameIndexable(path)) {
     content = file.path
@@ -143,6 +155,8 @@ async function getAndMapIndexedDocument(
   return {
     basename: file.basename,
     content,
+    /** Content without diacritics and markdown chars */
+    cleanedContent: stripMarkdownCharacters(removeDiacritics(content)),
     path: file.path,
     mtime: file.stat.mtime,

View File

@@ -19,6 +19,7 @@
   import { Query } from 'src/search/query'
   import { openNote } from 'src/tools/notes'
   import { searchEngine } from 'src/search/omnisearch'
+  import { stringsToRegex } from 'src/tools/text-processing'

   export let modal: OmnisearchInFileModal
   export let parent: OmnisearchVaultModal | null = null
@@ -64,10 +65,20 @@
   $: {
     if (note) {
-      const groups = getGroups(note.matches)
-      groupedOffsets = groups.map(group =>
-        Math.round((group.first()!.offset + group.last()!.offset) / 2)
-      )
+      let groups = getGroups(note.matches)
+
+      // If there are quotes in the search,
+      // only show results that match at least one of the quotes
+      const exactTerms = query.getExactTerms()
+      if (exactTerms.length) {
+        groups = groups.filter(group =>
+          exactTerms.every(exact =>
+            group.some(match => match.match.includes(exact))
+          )
+        )
+      }
+
+      groupedOffsets = groups.map(group => Math.round(group.first()!.offset))
     }
   }
@@ -77,13 +88,12 @@
   function getGroups(matches: SearchMatch[]): SearchMatch[][] {
     const groups: SearchMatch[][] = []
     let lastOffset = -1
-    let count = 0 // TODO: FIXME: this is a hack to avoid infinite loops
-    while (true) {
+    let count = 0 // Avoid infinite loops
+    while (++count < 100) {
       const group = getGroupedMatches(matches, lastOffset, excerptAfter)
       if (!group.length) break
       lastOffset = group.last()!.offset
       groups.push(group)
-      if (++count > 100) break
     }
     return groups
   }
@@ -121,7 +131,9 @@
     if (parent) parent.close()
     // Open (or switch focus to) the note
-    await openNote(note, newTab)
+    const reg = stringsToRegex(note.foundWords)
+    reg.exec(note.content)
+    await openNote(note, reg.lastIndex, newTab)
     // Move cursor to the match
     const view = app.workspace.getActiveViewOfType(MarkdownView)
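Note: the offset computation that previously lived inside openNote now happens at the call site, as shown above. A minimal sketch of the mechanic, with a hypothetical stand-in for stringsToRegex (the real helper lives in src/tools/text-processing); the removed code in notes.ts relied on lastIndex, which only advances for a global or sticky regex, so that flag is assumed here:

```ts
// Hypothetical stand-in for stringsToRegex, for illustration only:
// a global, case-insensitive regex over the escaped found words.
const stringsToRegex = (words: string[]): RegExp =>
  new RegExp(
    words.map(w => w.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|'),
    'gi'
  )

const content = 'Notes about tokenizers and diacritics'
const reg = stringsToRegex(['diacritics'])
reg.exec(content)
// lastIndex now points just past the first matched word; this is the
// offset handed to openNote(note, offset, newTab) above.
console.log(reg.lastIndex) // 37
```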

View File

@@ -116,7 +116,7 @@
       historySearchIndex = 0
     }
     searchQuery = history[historySearchIndex]
-    refInput?.setInputValue(searchQuery)
+    refInput?.setInputValue(searchQuery ?? '')
   }

   async function nextSearchHistory() {
@@ -125,7 +125,7 @@
       historySearchIndex = history.length ? history.length - 1 : 0
     }
     searchQuery = history[historySearchIndex]
-    refInput?.setInputValue(searchQuery)
+    refInput?.setInputValue(searchQuery ?? '')
   }

   let cancelableQuery: CancelablePromise<ResultNote[]> | null = null
@@ -186,7 +186,8 @@
   function openSearchResult(note: ResultNote, newPane = false) {
     saveCurrentQuery()
-    openNote(note, newPane)
+    const offset = note.matches?.[0]?.offset ?? 0
+    openNote(note, offset, newPane)
   }

   async function onClickCreateNote(_e: MouseEvent) {

View File

@@ -53,6 +53,7 @@ export type IndexedDocument = {
   mtime: number
   content: string
+  cleanedContent: string
   aliases: string
   tags: string[]
   unmarkedTags: string[]
@@ -120,7 +121,7 @@ export function isCacheEnabled(): boolean {
 }

 export const SEPARATORS =
-  /[|\t\n\r\^= -#%-*,.`\/<>:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/
+  /[|\t\n\r\^"= -#%-*,.`\/<>:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/
     .toString()
     .slice(1, -1)

 export const SPACE_OR_PUNCTUATION = new RegExp(`${SEPARATORS}+`, 'u')

View File

@@ -14,6 +14,7 @@ import {
 import {
   eventBus,
   EventNames,
+  getTextExtractor,
   indexingStep,
   IndexingStepType,
   isCacheEnabled,
@@ -36,8 +37,8 @@ export default class OmnisearchPlugin extends Plugin {
     this.addSettingTab(new SettingsTab(this))
     if (!Platform.isMobile) {
-      import('./tools/api-server').then(m =>
-        this.apiHttpServer = m.getServer()
+      import('./tools/api-server').then(
+        m => (this.apiHttpServer = m.getServer())
       )
     }
@@ -119,7 +120,7 @@ export default class OmnisearchPlugin extends Plugin {
       })
     )

-    this.executeFirstLaunchTasks()
+    await this.executeFirstLaunchTasks()
     await this.populateIndex()

     if (this.apiHttpServer && settings.httpApiEnabled) {
@@ -128,18 +129,17 @@ export default class OmnisearchPlugin extends Plugin {
       })
     }

-  executeFirstLaunchTasks(): void {
-    const code = '1.10.1'
-    if (settings.welcomeMessage !== code) {
-      // const welcome = new DocumentFragment()
-      // welcome.createSpan({}, span => {
-      //   span.innerHTML = `🔎 Omnisearch now requires the <strong>Text Extractor</strong> plugin to index PDF and images. See Omnisearch settings for more information.`
-      // })
-      // new Notice(welcome, 20_000)
+  async executeFirstLaunchTasks(): Promise<void> {
+    const code = '1.21.0'
+    if (settings.welcomeMessage !== code && getTextExtractor()) {
+      const welcome = new DocumentFragment()
+      welcome.createSpan({}, span => {
+        span.innerHTML = `🔎 Omnisearch can now index .docx and .xlsx documents. Don't forget to update Text Extractor and enable the toggle in Omnisearch settings.`
+      })
+      new Notice(welcome, 20_000)
     }
     settings.welcomeMessage = code
-
-    this.saveData(settings)
+    await this.saveData(settings)
   }

   async onunload(): Promise<void> {

View File

@@ -1,59 +1,18 @@
 import MiniSearch, { type Options, type SearchResult } from 'minisearch'
 import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
-import {
-  BRACKETS_AND_SPACE,
-  chsRegex,
-  getChsSegmenter,
-  SPACE_OR_PUNCTUATION,
-} from '../globals'
 import { settings } from '../settings'
-import {
-  chunkArray,
-  logDebug,
-  removeDiacritics,
-  splitCamelCase,
-  splitHyphens,
-  stripMarkdownCharacters,
-} from '../tools/utils'
+import { chunkArray, logDebug, removeDiacritics } from '../tools/utils'
 import { Notice } from 'obsidian'
 import type { Query } from './query'
 import { cacheManager } from '../cache-manager'
 import { sortBy } from 'lodash-es'
 import { getMatches, stringsToRegex } from 'src/tools/text-processing'
+import { tokenizeForIndexing, tokenizeForSearch } from './tokenizer'

-const tokenize = (text: string): string[] => {
-  const words = text.split(BRACKETS_AND_SPACE)
-  let tokens = text.split(SPACE_OR_PUNCTUATION)
-  // Split hyphenated tokens
-  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
-  // Split camelCase tokens into "camel" and "case
-  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
-  // Add whole words (aka "not tokens")
-  tokens = [...tokens, ...words]
-  // When enabled, we only use the chsSegmenter,
-  // and not the other custom tokenizers
-  const chsSegmenter = getChsSegmenter()
-  if (chsSegmenter) {
-    const chs = tokens.flatMap(word =>
-      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
-    )
-    tokens = [...tokens, ...chs]
-  }
-  // Remove duplicates
-  tokens = [...new Set(tokens)]
-  return tokens
-}
-
 export class Omnisearch {
   public static readonly options: Options<IndexedDocument> = {
-    tokenize,
+    tokenize: tokenizeForIndexing,
     extractField: (doc, fieldName) => {
       if (fieldName === 'directory') {
         // return path without the filename
@@ -87,6 +46,7 @@ export class Omnisearch {
     },
   }
   private minisearch: MiniSearch
+  /** Map<path, mtime> */
   private indexedDocuments: Map<string, number> = new Map()
   // private previousResults: SearchResult[] = []
   // private previousQuery: Query | null = null
@@ -212,14 +172,15 @@ export class Omnisearch {
         break
     }

-    let results = this.minisearch.search(query.segmentsToStr(), {
+    const searchTokens = tokenizeForSearch(query.segmentsToStr())
+    logDebug(JSON.stringify(searchTokens, null, 1))
+    let results = this.minisearch.search(searchTokens, {
       prefix: term => term.length >= options.prefixLength,
       // length <= 3: no fuzziness
       // length <= 5: fuzziness of 10%
       // length > 5: fuzziness of 20%
       fuzzy: term =>
         term.length <= 3 ? 0 : term.length <= 5 ? fuzziness / 2 : fuzziness,
-      combineWith: 'AND',
       boost: {
         basename: settings.weightBasename,
         directory: settings.weightDirectory,
@@ -321,10 +282,10 @@ export class Omnisearch {
       results = results.filter(r => {
         const document = documents.find(d => d.path === r.id)
         const title = document?.path.toLowerCase() ?? ''
-        const content = stripMarkdownCharacters(
-          document?.content ?? ''
-        ).toLowerCase()
-        return exactTerms.every(q => content.includes(q) || title.includes(q))
+        const content = (document?.cleanedContent ?? '').toLowerCase()
+        return exactTerms.every(
+          q => content.includes(q) || removeDiacritics(title).includes(q)
+        )
       })
     }
@@ -333,7 +294,7 @@ export class Omnisearch {
     if (exclusions.length) {
       logDebug('Filtering with exclusions')
       results = results.filter(r => {
-        const content = stripMarkdownCharacters(
+        const content = (
           documents.find(d => d.path === r.id)?.content ?? ''
         ).toLowerCase()
         return exclusions.every(q => !content.includes(q))
@@ -402,7 +363,7 @@ export class Omnisearch {
     const foundWords = [
       // Matching terms from the result,
       // do not necessarily match the query
-      ...Object.keys(result.match),
+      ...result.terms,
       // Quoted expressions
       ...query.getExactTerms(),

View File

@@ -46,6 +46,15 @@ export class Query {
     }
     this.query = parsed

+    // Extract keywords starting with a dot...
+    const ext = this.query.text
+      .filter(o => o.startsWith('.'))
+      .map(o => o.slice(1))
+    // add them to the ext field...
+    this.query.ext = [...new Set([...ext, ...(this.query.ext ?? [])])]
+    // and remove them from the text field
+    this.query.text = this.query.text.filter(o => !o.startsWith('.'))
+
     // Get strings in quotes, and remove the quotes
     this.#inQuotes =
       text.match(/"([^"]+)"/g)?.map(o => o.replace(/"/g, '')) ?? []
@@ -76,7 +85,7 @@ export class Query {
   }

   /**
    *
    * @returns An array of strings that are in quotes
    */
   public getExactTerms(): string[] {
@@ -89,4 +98,13 @@ export class Query {
       ),
     ]
   }
+
+  public getBestStringForExcerpt(): string {
+    // If we have quoted expressions, return the longest one
+    if (this.#inQuotes.length) {
+      return this.#inQuotes.sort((a, b) => b.length - a.length)[0] ?? ''
+    }
+    // Otherwise, just return the query as is
+    return this.segmentsToStr()
+  }
 }
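A hedged, standalone sketch of what the two additions above do, mirroring the diff rather than reusing the plugin's actual Query class: leading-dot keywords are moved from the `text` field to the `ext` field, and getBestStringForExcerpt() prefers the longest quoted expression over the raw query string.

```ts
// Illustration only; names mirror the diff but this is not the real Query class.
function extractExtensions(text: string[], ext: string[] = []) {
  const fromText = text.filter(o => o.startsWith('.')).map(o => o.slice(1))
  return {
    // ...add them to the ext field (deduplicated)...
    ext: [...new Set([...fromText, ...ext])],
    // ...and remove them from the text field
    text: text.filter(o => !o.startsWith('.')),
  }
}

function getBestStringForExcerpt(inQuotes: string[], rawQuery: string): string {
  // If we have quoted expressions, return the longest one
  if (inQuotes.length) {
    return [...inQuotes].sort((a, b) => b.length - a.length)[0] ?? ''
  }
  // Otherwise, just return the query as is
  return rawQuery
}

console.log(extractExtensions(['.md', 'table', 'of', 'contents']))
// -> { ext: ['md'], text: ['table', 'of', 'contents'] }
console.log(getBestStringForExcerpt(['table of contents'], 'toc "table of contents"'))
// -> 'table of contents'
```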

src/search/tokenizer.ts (new file, 79 lines)
View File

@@ -0,0 +1,79 @@
+import type { QueryCombination } from 'minisearch'
+import {
+  BRACKETS_AND_SPACE,
+  SPACE_OR_PUNCTUATION,
+  chsRegex,
+  getChsSegmenter,
+} from 'src/globals'
+import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils'
+
+function tokenizeWords(text: string): string[] {
+  return text.split(BRACKETS_AND_SPACE)
+}
+
+function tokenizeTokens(text: string): string[] {
+  return text.split(SPACE_OR_PUNCTUATION)
+}
+
+/**
+ * Tokenization for indexing will possibly return more tokens than the original text.
+ * This is because we combine different methods of tokenization to get the best results.
+ * @param text
+ * @returns
+ */
+export function tokenizeForIndexing(text: string): string[] {
+  const words = tokenizeWords(text)
+  let tokens = tokenizeTokens(text)
+
+  // Split hyphenated tokens
+  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
+
+  // Split camelCase tokens into "camel" and "case
+  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
+
+  // Add whole words (aka "not tokens")
+  tokens = [...tokens, ...words]
+
+  const chsSegmenter = getChsSegmenter()
+  if (chsSegmenter) {
+    const chs = tokens.flatMap(word =>
+      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
+    )
+    tokens = [...tokens, ...chs]
+  }
+
+  // Remove duplicates
+  tokens = [...new Set(tokens)]
+  return tokens
+}
+
+/**
+ * Search tokenization will use the same tokenization methods as indexing,
+ * but will combine each group with "OR" operators
+ * @param text
+ * @returns
+ */
+export function tokenizeForSearch(text: string): QueryCombination {
+  const tokens = tokenizeTokens(text)
+
+  let chs: string[] = []
+  const chsSegmenter = getChsSegmenter()
+  if (chsSegmenter) {
+    chs = tokens.flatMap(word =>
+      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
+    )
+  }
+
+  return {
+    combineWith: 'OR',
+    queries: [
+      { combineWith: 'AND', queries: tokens },
+      { combineWith: 'AND', queries: tokenizeWords(text) },
+      { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
+      { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
+      { combineWith: 'AND', queries: chs },
+    ],
+  }
+}
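For reference, a rough sketch of the QueryCombination that tokenizeForSearch hands to MiniSearch for a query like `foo-bar camelCase`, assuming splitHyphens/splitCamelCase return only the split parts and no Chinese segmenter is registered (the exact arrays depend on those helpers):

```ts
// Approximate shape of tokenizeForSearch('foo-bar camelCase') under the
// assumptions stated above; each AND group is one interpretation of the
// query, and MiniSearch ORs the groups together.
const combination = {
  combineWith: 'OR',
  queries: [
    { combineWith: 'AND', queries: ['foo-bar', 'camelCase'] }, // raw tokens
    { combineWith: 'AND', queries: ['foo-bar', 'camelCase'] }, // whole words
    { combineWith: 'AND', queries: ['foo', 'bar'] },           // hyphen splits
    { combineWith: 'AND', queries: ['camel', 'case'] },        // camelCase splits
    { combineWith: 'AND', queries: [] },                       // chs segments (none here)
  ],
}
```

A document that satisfies any one of these interpretations in full can match, which is presumably why the explicit `combineWith: 'AND'` option was dropped from the search call in omnisearch.ts.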

View File

@@ -37,6 +37,9 @@ export interface OmnisearchSettings extends WeightingSettings {
   PDFIndexing: boolean
   /** Enable Images indexing */
   imagesIndexing: boolean
+  /** Enable Office documents indexing */
+  officeIndexing: boolean
   /** Enable indexing of unknown files */
   unsupportedFilesIndexing: 'yes' | 'no' | 'default'
   /** Activate the small 🔍 button on Obsidian's ribbon */
@@ -99,7 +102,7 @@ export class SettingsTab extends PluginSettingTab {
     // Sponsor link - Thank you!
     const divSponsor = containerEl.createDiv()
     divSponsor.innerHTML = `
-      <iframe src="https://github.com/sponsors/scambier/button" title="Sponsor scambier" height="35" width="116" style="border: 0;"></iframe>
+      <iframe sandbox="allow-top-navigation-by-user-activation" src="https://github.com/sponsors/scambier/button" title="Sponsor scambier" height="35" width="116" style="border: 0;"></iframe>
       <a href='https://ko-fi.com/B0B6LQ2C' target='_blank'><img height='36' style='border:0px;height:36px;' src='https://cdn.ko-fi.com/cdn/kofi2.png?v=3' border='0' alt='Buy Me a Coffee at ko-fi.com' /></a>
     `
@@ -158,11 +161,30 @@ export class SettingsTab extends PluginSettingTab {
       )
       .setDisabled(!getTextExtractor())

+    // Office Documents Indexing
+    const indexOfficesDesc = new DocumentFragment()
+    indexOfficesDesc.createSpan({}, span => {
+      span.innerHTML = `Omnisearch will use Text Extractor to index the content of your office documents (currently <pre style="display:inline">.docx</pre> and <pre style="display:inline">.xlsx</pre>)`
+    })
+    new Setting(containerEl)
+      .setName(
+        `Documents content indexing ${getTextExtractor() ? '' : '⚠️ Disabled'}`
+      )
+      .setDesc(indexOfficesDesc)
+      .addToggle(toggle =>
+        toggle.setValue(settings.officeIndexing).onChange(async v => {
+          await database.clearCache()
+          settings.officeIndexing = v
+          await saveSettings(this.plugin)
+        })
+      )
+      .setDisabled(!getTextExtractor())
+
     // Index filenames of unsupported files
     const indexUnsupportedDesc = new DocumentFragment()
     indexUnsupportedDesc.createSpan({}, span => {
       span.innerHTML = `
-        Omnisearch can index file<strong>names</strong> of "unsupported" files, such as e.g. <pre style="display:inline">.mp4</pre>, <pre style="display:inline">.xlsx</pre>,
+        Omnisearch can index file<strong>names</strong> of "unsupported" files, such as e.g. <pre style="display:inline">.mp4</pre>
         or non-extracted PDFs & images.<br/>
         "Obsidian setting" will respect the value of "Files & Links > Detect all file extensions"`
     })
@@ -175,7 +197,7 @@ export class SettingsTab extends PluginSettingTab {
           .setValue(settings.unsupportedFilesIndexing)
           .onChange(async v => {
             await database.clearCache()
-            ;(settings.unsupportedFilesIndexing as any) = v
+            ; (settings.unsupportedFilesIndexing as any) = v
             await saveSettings(this.plugin)
           })
       })
@@ -185,7 +207,7 @@ export class SettingsTab extends PluginSettingTab {
     indexedFileTypesDesc.createSpan({}, span => {
       span.innerHTML = `In addition to standard <code>md</code> files, Omnisearch can also index other <strong style="color: var(--text-accent)">PLAINTEXT</strong> files.<br/>
       Add extensions separated by a space, without the dot. Example: "<code>txt org csv</code>".<br />
-      ⚠️ <span style="color: var(--text-accent)">Using extensions of non-plaintext files (like .docx or .pptx) WILL cause crashes,
+      ⚠️ <span style="color: var(--text-accent)">Using extensions of non-plaintext files (like .pptx) WILL cause crashes,
       because Omnisearch will try to index their content.</span>`
     })
     new Setting(containerEl)
@@ -444,24 +466,6 @@ export class SettingsTab extends PluginSettingTab {
     //#endregion Results Weighting

-    //#region Debugging
-
-    new Setting(containerEl).setName('Debugging').setHeading()
-
-    new Setting(containerEl)
-      .setName('Enable verbose logging')
-      .setDesc(
-        "Adds a LOT of logs for debugging purposes. Don't forget to disable it."
-      )
-      .addToggle(toggle =>
-        toggle.setValue(settings.verboseLogging).onChange(async v => {
-          settings.verboseLogging = v
-          await saveSettings(this.plugin)
-        })
-      )
-
-    //#endregion Debugging
-
     //#region HTTP Server

     if (!Platform.isMobile) {
@@ -521,6 +525,24 @@ export class SettingsTab extends PluginSettingTab {
     //#endregion HTTP Server

+    //#region Debugging
+
+    new Setting(containerEl).setName('Debugging').setHeading()
+
+    new Setting(containerEl)
+      .setName('Enable verbose logging')
+      .setDesc(
+        "Adds a LOT of logs for debugging purposes. Don't forget to disable it."
+      )
+      .addToggle(toggle =>
+        toggle.setValue(settings.verboseLogging).onChange(async v => {
+          settings.verboseLogging = v
+          await saveSettings(this.plugin)
+        })
+      )
+
+    //#endregion Debugging
+
     //#region Danger Zone
     new Setting(containerEl).setName('Danger Zone').setHeading()
@@ -602,6 +624,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
   ignoreDiacritics: true,
   indexedFileTypes: [] as string[],
   PDFIndexing: false,
+  officeIndexing: false,
   imagesIndexing: false,
   unsupportedFilesIndexing: 'no',
   splitCamelCase: false,

View File

@@ -63,7 +63,7 @@ export function getServer() {
     close() {
       server.close()
       console.log(`Omnisearch - Terminated HTTP server`)
-      if (settings.httpApiNotice) {
+      if (settings.httpApiEnabled && settings.httpApiNotice) {
         new Notice(`Omnisearch - Terminated HTTP server`)
       }
     },

View File

@@ -4,12 +4,9 @@ import { stringsToRegex } from './text-processing'
 export async function openNote(
   item: ResultNote,
+  offset = 0,
   newPane = false
 ): Promise<void> {
-  const reg = stringsToRegex(item.foundWords)
-  reg.exec(item.content)
-  const offset = reg.lastIndex
-
   // Check if the note is already open,
   // to avoid opening it twice if the first one is pinned
   let alreadyOpenAndPinned = false

View File

@@ -14,13 +14,6 @@ import type { Query } from 'src/search/query'
 import { Notice } from 'obsidian'
 import { escapeRegExp } from 'lodash-es'

-export function highlighterGroups(_substring: string, ...args: any[]) {
-  // args[0] is the single char preceding args[1], which is the word we want to highlight
-  if (!!args[1].trim())
-    return `<span>${args[0]}</span><span class="${highlightClass}">${args[1]}</span>`
-  return '&lt;no content&gt;'
-}
-
 /**
  * Wraps the matches in the text with a <span> element and a highlight class
  * @param text
@@ -134,18 +127,18 @@ export function getMatches(
       .substring(matchStartIndex, matchEndIndex)
       .trim()
     if (originalMatch && match.index >= 0) {
-      matches.push({ match: originalMatch, offset: match.index + 1 })
+      matches.push({ match: originalMatch, offset: match.index })
     }
   }

   // If the query is more than 1 token and can be found "as is" in the text, put this match first
-  if (query && query.query.text.length > 1) {
-    const best = text.indexOf(query.segmentsToStr())
+  if (query && (query.query.text.length > 1 || query.getExactTerms().length > 0)) {
+    const best = text.indexOf(query.getBestStringForExcerpt())
     if (best > -1 && matches.find(m => m.offset === best)) {
       matches = matches.filter(m => m.offset !== best)
       matches.unshift({
         offset: best,
-        match: query.segmentsToStr(),
+        match: query.getBestStringForExcerpt(),
       })
     }
   }

View File

@@ -9,10 +9,6 @@ import { canIndexUnsupportedFiles, settings } from '../settings'
 import { type BinaryLike, createHash } from 'crypto'
 import { md5 } from 'pure-md5'

-// export function highlighter(str: string): string {
-//   return `<span class="${highlightClass}">${str}</span>`
-// }
-
 export function pathWithoutFilename(path: string): string {
   const split = path.split('/')
   split.pop()
@@ -174,6 +170,11 @@ export function isFilePDF(path: string): boolean {
   return getExtension(path) === 'pdf'
 }

+export function isFileOffice(path: string): boolean {
+  const ext = getExtension(path)
+  return ext === 'docx' || ext === 'xlsx'
+}
+
 export function isFilePlaintext(path: string): boolean {
   return [...settings.indexedFileTypes, 'md'].some(t => path.endsWith(`.${t}`))
 }

View File

@@ -122,5 +122,12 @@
"1.18.1": "1.3.0", "1.18.1": "1.3.0",
"1.19.0-beta.1": "1.3.0", "1.19.0-beta.1": "1.3.0",
"1.19.0": "1.3.0", "1.19.0": "1.3.0",
"1.20.0-beta.1": "1.3.0" "1.20.0-beta.1": "1.3.0",
"1.20.0": "1.3.0",
"1.20.1": "1.3.0",
"1.20.2": "1.3.0",
"1.20.3": "1.3.0",
"1.20.4": "1.3.0",
"1.21.0": "1.3.0",
"1.21.1": "1.3.0"
} }