From 484e961a7e082a9caa6f5ad606ec36402f37e6f9 Mon Sep 17 00:00:00 2001 From: Simon Cambier Date: Tue, 26 Mar 2024 21:32:03 +0100 Subject: [PATCH] Highlighting rework, should help with #304 --- src/components/ResultItemInFile.svelte | 12 +++--------- src/components/ResultItemVault.svelte | 6 +----- src/search/omnisearch.ts | 4 +++- src/search/tokenizer.ts | 16 ++++++++++++++-- src/tools/api.ts | 2 +- src/tools/text-processing.ts | 23 ++++++++++------------- 6 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/components/ResultItemInFile.svelte b/src/components/ResultItemInFile.svelte index db9bfa5..096d9db 100644 --- a/src/components/ResultItemInFile.svelte +++ b/src/components/ResultItemInFile.svelte @@ -1,8 +1,5 @@
- {@html highlightText(cleanedContent.content, matchesExcerpt)} + {@html highlightText(cleanedContent, matchesExcerpt)}
diff --git a/src/components/ResultItemVault.svelte b/src/components/ResultItemVault.svelte index 9357dae..b1f9970 100644 --- a/src/components/ResultItemVault.svelte +++ b/src/components/ResultItemVault.svelte @@ -40,10 +40,6 @@ $: reg = stringsToRegex(note.foundWords) $: matchesTitle = getMatches(title, reg) $: matchesNotePath = getMatches(notePath, reg) - $: matchesExcerpt = cloneDeep(note.matches).map(m => { - m.offset = m.offset - cleanedContent.offset - return m - }) $: cleanedContent = makeExcerpt(note.content, note.matches[0]?.offset ?? -1) $: glyph = false //cacheManager.getLiveDocument(note.path)?.doesNotExist $: { @@ -102,7 +98,7 @@
{#if $showExcerpt}
- {@html highlightText(cleanedContent.content, matchesExcerpt)} + {@html highlightText(cleanedContent, note.matches)}
{/if} diff --git a/src/search/omnisearch.ts b/src/search/omnisearch.ts index 4411dc3..88e9cb3 100644 --- a/src/search/omnisearch.ts +++ b/src/search/omnisearch.ts @@ -190,6 +190,8 @@ export class Omnisearch { headings3: settings.weightH3, unmarkedTags: settings.weightUnmarkedTags, }, + // The query is already tokenized, don't tokenize again + tokenize: text => [text], }) logDebug('Found', results.length, 'results') @@ -404,7 +406,7 @@ export class Omnisearch { // Tags, starting with # ...query.getTags(), - ].filter(w => w.length > 1 || /\p{Emoji}/u.test(w)) + ] logDebug('Matching tokens:', foundWords) logDebug('Getting matches locations...') diff --git a/src/search/tokenizer.ts b/src/search/tokenizer.ts index 2fca57e..29816e2 100644 --- a/src/search/tokenizer.ts +++ b/src/search/tokenizer.ts @@ -6,6 +6,7 @@ import { getChsSegmenter, } from 'src/globals' import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils' +const markdownLinkExtractor = require('markdown-link-extractor') function tokenizeWords(text: string): string[] { return text.split(BRACKETS_AND_SPACE) @@ -23,6 +24,7 @@ function tokenizeTokens(text: string): string[] { */ export function tokenizeForIndexing(text: string): string[] { const words = tokenizeWords(text) + const urls: string[] = markdownLinkExtractor(text) let tokens = tokenizeTokens(text) @@ -35,6 +37,11 @@ export function tokenizeForIndexing(text: string): string[] { // Add whole words (aka "not tokens") tokens = [...tokens, ...words] + // Add urls + if (urls.length) { + tokens = [...tokens, ...urls] + } + const chsSegmenter = getChsSegmenter() if (chsSegmenter) { const chs = tokens.flatMap(word => @@ -56,7 +63,12 @@ export function tokenizeForIndexing(text: string): string[] { * @returns */ export function tokenizeForSearch(text: string): QueryCombination { - const tokens = tokenizeTokens(text) + + // Extract urls and remove them from the query + const urls: string[] = markdownLinkExtractor(text) + text = urls.reduce((acc, url) => acc.replace(url, ''), text) + + const tokens = [...tokenizeTokens(text), ...urls].filter(Boolean) let chs: string[] = [] const chsSegmenter = getChsSegmenter() @@ -70,7 +82,7 @@ export function tokenizeForSearch(text: string): QueryCombination { combineWith: 'OR', queries: [ { combineWith: 'AND', queries: tokens }, - { combineWith: 'AND', queries: tokenizeWords(text) }, + { combineWith: 'AND', queries: tokenizeWords(text).filter(Boolean) }, { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) }, { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) }, { combineWith: 'AND', queries: chs }, diff --git a/src/tools/api.ts b/src/tools/api.ts index 711253c..93abe7a 100644 --- a/src/tools/api.ts +++ b/src/tools/api.ts @@ -44,7 +44,7 @@ function mapResults(results: ResultNote[]): ResultNoteApi[] { offset: match.offset, } }), - excerpt: excerpt.content, + excerpt: excerpt, } return res diff --git a/src/tools/text-processing.ts b/src/tools/text-processing.ts index f9d2047..481ea3d 100644 --- a/src/tools/text-processing.ts +++ b/src/tools/text-processing.ts @@ -6,15 +6,12 @@ import { regexStripQuotes, excerptAfter, excerptBefore, - SEPARATORS, } from 'src/globals' import { settings } from 'src/settings' import { removeDiacritics, warnDebug } from './utils' import type { Query } from 'src/search/query' import { Notice } from 'obsidian' import { escapeRegExp } from 'lodash-es' -import { tokenizeForSearch } from 'src/search/tokenizer' -import type { QueryCombination } from 'minisearch' /** * Wraps the matches in the text with a element and a highlight class @@ -115,14 +112,19 @@ export function stringsToRegex(strings: string[]): RegExp { return new RegExp(`${joined}`, 'gui') } +/** + * Returns an array of matches in the text, using the provided regex + * @param text + * @param reg + * @param query + */ export function getMatches( text: string, reg: RegExp, query?: Query ): SearchMatch[] { - const separatorRegExp = new RegExp(SEPARATORS, 'gu') const originalText = text - text = text.toLowerCase().replace(separatorRegExp, ' ') + // text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ') if (settings.ignoreDiacritics) { text = removeDiacritics(text) } @@ -153,21 +155,16 @@ export function getMatches( ) { const best = text.indexOf(query.getBestStringForExcerpt()) if (best > -1 && matches.find(m => m.offset === best)) { - matches = matches.filter(m => m.offset !== best) matches.unshift({ offset: best, match: query.getBestStringForExcerpt(), }) } } - return matches } -export function makeExcerpt( - content: string, - offset: number -): { content: string; offset: number } { +export function makeExcerpt(content: string, offset: number): string { try { const pos = offset ?? -1 const from = Math.max(0, pos - excerptBefore) @@ -201,14 +198,14 @@ export function makeExcerpt( content = content.trim().replaceAll('\n', '
') } - return { content: content, offset: pos } + return content } catch (e) { new Notice( 'Omnisearch - Error while creating excerpt, see developer console' ) console.error(`Omnisearch - Error while creating excerpt`) console.error(e) - return { content: '', offset: -1 } + return '' } }