From 603116158ee8a39dc965edd258da96dcfbb73768 Mon Sep 17 00:00:00 2001 From: Simon Cambier Date: Thu, 31 Aug 2023 19:23:44 +0200 Subject: [PATCH] Refactored excerpts and highlighting --- src/components/ResultItemInFile.svelte | 15 +- src/components/ResultItemVault.svelte | 14 +- src/globals.ts | 8 +- src/search/omnisearch.ts | 46 +----- src/tools/api.ts | 8 +- src/tools/notes.ts | 2 +- src/tools/text-processing.ts | 204 +++++++++++++++++++++++++ src/tools/utils.ts | 128 +--------------- 8 files changed, 247 insertions(+), 178 deletions(-) create mode 100644 src/tools/text-processing.ts diff --git a/src/components/ResultItemInFile.svelte b/src/components/ResultItemInFile.svelte index 27a5805..db9bfa5 100644 --- a/src/components/ResultItemInFile.svelte +++ b/src/components/ResultItemInFile.svelte @@ -1,19 +1,22 @@
- {@html cleanedContent.replace(reg, highlighterGroups)} + {@html highlightText(cleanedContent.content, matchesExcerpt)}
diff --git a/src/components/ResultItemVault.svelte b/src/components/ResultItemVault.svelte index 3360581..45f8f0d 100644 --- a/src/components/ResultItemVault.svelte +++ b/src/components/ResultItemVault.svelte @@ -3,17 +3,16 @@ import type { ResultNote } from '../globals' import { getExtension, - highlighterGroups, isFileCanvas, isFileImage, isFilePDF, - makeExcerpt, pathWithoutFilename, removeDiacritics, - stringsToRegex, } from '../tools/utils' import ResultItemContainer from './ResultItemContainer.svelte' import { setIcon } from 'obsidian' + import { cloneDeep } from 'lodash-es' + import { stringsToRegex, getMatches, makeExcerpt, highlightText } from 'src/tools/text-processing' export let selected = false export let note: ResultNote @@ -36,6 +35,11 @@ } } $: reg = stringsToRegex(note.foundWords) + $: matchesTitle = getMatches(title, reg) + $: matchesExcerpt = cloneDeep(note.matches).map(m => { + m.offset = m.offset - cleanedContent.offset + return m + }) $: cleanedContent = makeExcerpt(note.content, note.matches[0]?.offset ?? -1) $: glyph = false //cacheManager.getLiveDocument(note.path)?.doesNotExist $: { @@ -70,7 +74,7 @@
- {@html title.replace(reg, highlighterGroups)} + {@html highlightText(title, matchesTitle)} .{getExtension(note.path)} @@ -97,7 +101,7 @@
{#if $showExcerpt}
- {@html cleanedContent.replace(reg, highlighterGroups)} + {@html highlightText(cleanedContent.content, matchesExcerpt)}
{/if} diff --git a/src/globals.ts b/src/globals.ts index bdb93cc..7f977a5 100644 --- a/src/globals.ts +++ b/src/globals.ts @@ -106,5 +106,9 @@ export function isCacheEnabled(): boolean { return !Platform.isIosApp && settings.useCache } -export const SPACE_OR_PUNCTUATION = - /[|\t\n\r= -`#%-*,.\/:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]+/u +const separators = + /[|\t\n\r= -#%-*,.`\/:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/ + .toString() + .slice(1, -1) +export const SPACE_OR_PUNCTUATION_UNIQUE = new RegExp(`${separators}`, 'u') +export const SPACE_OR_PUNCTUATION = new RegExp(`${separators}+`, 'u') diff --git a/src/search/omnisearch.ts b/src/search/omnisearch.ts index 1c94d66..6adf19b 100644 --- a/src/search/omnisearch.ts +++ b/src/search/omnisearch.ts @@ -1,10 +1,5 @@ import MiniSearch, { type Options, type SearchResult } from 'minisearch' -import type { - DocumentRef, - IndexedDocument, - ResultNote, - SearchMatch, -} from '../globals' +import type { DocumentRef, IndexedDocument, ResultNote } from '../globals' import { chsRegex, getChsSegmenter, SPACE_OR_PUNCTUATION } from '../globals' import { settings } from '../settings' import { @@ -13,14 +8,13 @@ import { removeDiacritics, splitCamelCase, splitHyphens, - stringsToRegex, stripMarkdownCharacters, - warnDebug, } from '../tools/utils' import { Notice } from 'obsidian' import type { Query } from './query' import { cacheManager } from '../cache-manager' import { sortBy } from 'lodash-es' +import { getMatches, stringsToRegex } from 'src/tools/text-processing' const tokenize = (text: string): string[] => { let tokens = text.split(SPACE_OR_PUNCTUATION) @@ -300,8 +294,7 @@ export class Omnisearch { // Sort results and keep the 50 best results = results.sort((a, b) => b.score - a.score).slice(0, 50) - if (results.length) - logDebug('First result:', results[0]) + if (results.length) logDebug('First result:', results[0]) const documents = await Promise.all( results.map(async result => await cacheManager.getDocument(result.id)) @@ -346,35 +339,6 @@ export class Omnisearch { return results } - public getMatches(text: string, reg: RegExp, query: Query): SearchMatch[] { - const startTime = new Date().getTime() - let match: RegExpExecArray | null = null - let matches: SearchMatch[] = [] - let count = 0 - while ((match = reg.exec(text)) !== null) { - // Avoid infinite loops, stop looking after 100 matches or if we're taking too much time - if (++count >= 100 || new Date().getTime() - startTime > 50) { - warnDebug('Stopped getMatches at', count, 'results') - break - } - logDebug('match :', match) - const m = match[1] - if (m) matches.push({ match: m, offset: match.index }) - } - - // If the query can be found "as is" in the text, put this match first - const best = text.toLowerCase().indexOf(query.segmentsToStr()) - if (best > -1) { - matches = matches.filter(m => m.offset !== best) - matches.unshift({ - offset: best, - match: query.segmentsToStr(), - }) - } - - return matches - } - /** * Searches the index, and returns an array of ResultNote objects. * If we have the singleFile option set, @@ -435,12 +399,12 @@ export class Omnisearch { logDebug('Matching tokens:', foundWords) logDebug('Getting matches locations...') - const matches = this.getMatches( + const matches = getMatches( note.content, stringsToRegex(foundWords), query ) - logDebug('Matches:', matches) + logDebug(`Matches for ${note.basename}`, matches) const resultNote: ResultNote = { score: result.score, foundWords, diff --git a/src/tools/api.ts b/src/tools/api.ts index 440f6e8..eae2d7b 100644 --- a/src/tools/api.ts +++ b/src/tools/api.ts @@ -1,7 +1,7 @@ import type { ResultNote } from '../globals' import { Query } from '../search/query' import { searchEngine } from '../search/omnisearch' -import { makeExcerpt } from './utils' +import { makeExcerpt } from './text-processing' import { refreshIndex } from '../notes-index' type ResultNoteApi = { @@ -31,7 +31,7 @@ function mapResults(results: ResultNote[]): ResultNoteApi[] { const excerpt = makeExcerpt(content, matches[0]?.offset ?? -1) - return { + const res: ResultNoteApi = { score, path, basename, @@ -42,8 +42,10 @@ function mapResults(results: ResultNote[]): ResultNoteApi[] { offset: match.offset, } }), - excerpt, + excerpt: excerpt.content, } + + return res }) } diff --git a/src/tools/notes.ts b/src/tools/notes.ts index 746cd16..2d61574 100644 --- a/src/tools/notes.ts +++ b/src/tools/notes.ts @@ -1,6 +1,6 @@ import { type CachedMetadata, MarkdownView, TFile } from 'obsidian' -import { stringsToRegex } from './utils' import type { ResultNote } from '../globals' +import { stringsToRegex } from './text-processing' export async function openNote( item: ResultNote, diff --git a/src/tools/text-processing.ts b/src/tools/text-processing.ts new file mode 100644 index 0000000..197cce6 --- /dev/null +++ b/src/tools/text-processing.ts @@ -0,0 +1,204 @@ +import { + highlightClass, + type SearchMatch, + regexLineSplit, + regexYaml, + getChsSegmenter, + SPACE_OR_PUNCTUATION_UNIQUE, + regexStripQuotes, + excerptAfter, + excerptBefore, +} from 'src/globals' +import { settings } from 'src/settings' +import { escapeRegex, warnDebug } from './utils' +import type { Query } from 'src/search/query' +import { Notice } from 'obsidian' + +export function highlighterGroups(_substring: string, ...args: any[]) { + // args[0] is the single char preceding args[1], which is the word we want to highlight + if (!!args[1].trim()) + return `${args[0]}${args[1]}` + return '<no content>' +} + +export function highlightText(text: string, matches: SearchMatch[]): string { + matches.forEach(matchInfo => { + const matchRegex = new RegExp(`\\b${matchInfo.match}\\b`, 'giu') + const matchOffsets = [] + + let match + while ((match = matchRegex.exec(text)) !== null) { + matchOffsets.push({ index: match.index, text: match[0] }) + } + + if (!matchOffsets.length) { + return text + } + + const closestMatch = matchOffsets.reduce((prev, curr) => { + return Math.abs(curr.index - matchInfo.offset) < + Math.abs(prev.index - matchInfo.offset) + ? curr + : prev + }) + + if (matchOffsets.includes(closestMatch)) { + const originalMatch = closestMatch.text + text = + text.substring(0, closestMatch.index) + + `` + + originalMatch + + '' + + text.substring(closestMatch.index + originalMatch.length) + } + }) + + return text +} + +export function escapeHTML(html: string): string { + return html + .replaceAll('&', '&') + .replaceAll('<', '<') + .replaceAll('>', '>') + .replaceAll('"', '"') + .replaceAll("'", ''') +} + +export function splitLines(text: string): string[] { + return text.split(regexLineSplit).filter(l => !!l && l.length > 2) +} + +export function removeFrontMatter(text: string): string { + // Regex to recognize YAML Front Matter (at beginning of file, 3 hyphens, than any charecter, including newlines, then 3 hyphens). + return text.replace(regexYaml, '') +} + +/** + * Used to find excerpts in a note body, or select which words to highlight + */ +export function stringsToRegex(strings: string[]): RegExp { + if (!strings.length) return /^$/g + + // sort strings by decreasing length, so that longer strings are matched first + strings.sort((a, b) => b.length - a.length) + + const joined = + '(' + + // Default word split is not applied if the user uses the cm-chs-patch plugin + (getChsSegmenter() + ? '' + : // Split on start of line, spaces, punctuation, or capital letters (for camelCase) + // We also add the hyphen to the list of characters that can split words + settings.splitCamelCase + ? `^|${SPACE_OR_PUNCTUATION_UNIQUE.source}|\-|[A-Z]` + : `^|${SPACE_OR_PUNCTUATION_UNIQUE.source}|\-`) + + ')' + + `(${strings.map(s => escapeRegex(s)).join('|')})` + + const reg = new RegExp(`${joined}`, 'gu') + return reg +} + +export function getMatches( + text: string, + reg: RegExp, + query?: Query +): SearchMatch[] { + text = text.toLowerCase() + const startTime = new Date().getTime() + let match: RegExpExecArray | null = null + let matches: SearchMatch[] = [] + let count = 0 + while ((match = reg.exec(text)) !== null) { + // Avoid infinite loops, stop looking after 100 matches or if we're taking too much time + if (++count >= 100 || new Date().getTime() - startTime > 50) { + warnDebug('Stopped getMatches at', count, 'results') + break + } + const m = match[2] + if (m && match.index >= 0) { + matches.push({ match: m, offset: match.index + 1 }) + } + } + + // If the query can be found "as is" in the text, put this match first + if (query) { + const best = text.indexOf(query.segmentsToStr()) + if (best > -1 && matches.find(m => m.offset === best)) { + matches = matches.filter(m => m.offset !== best) + matches.unshift({ + offset: best, + match: query.segmentsToStr(), + }) + } + } + + return matches +} + +export function makeExcerpt( + content: string, + offset: number +): { content: string; offset: number } { + try { + const pos = offset ?? -1 + const from = Math.max(0, pos - excerptBefore) + const to = Math.min(content.length, pos + excerptAfter) + if (pos > -1) { + content = + (from > 0 ? '…' : '') + + content.slice(from, to).trim() + + (to < content.length - 1 ? '…' : '') + } else { + content = content.slice(0, excerptAfter) + } + if (settings.renderLineReturnInExcerpts) { + const lineReturn = new RegExp(/(?:\r\n|\r|\n)/g) + // Remove multiple line returns + content = content + .split(lineReturn) + .filter(l => l) + .join('\n') + + const last = content.lastIndexOf('\n', pos - from) + + if (last > 0) { + content = content.slice(last) + } + } + + content = escapeHTML(content) + + if (settings.renderLineReturnInExcerpts) { + content = content.trim().replaceAll('\n', '
') + } + + return { content: content, offset: pos } + } catch (e) { + new Notice( + 'Omnisearch - Error while creating excerpt, see developer console' + ) + console.error(`Omnisearch - Error while creating excerpt`) + console.error(e) + return { content: '', offset: -1 } + } +} + +/** + * splits a string in words or "expressions in quotes" + * @param str + * @returns + */ +export function splitQuotes(str: string): string[] { + return ( + str + .match(/"(.*?)"/g) + ?.map(s => s.replace(/"/g, '')) + .filter(q => !!q) ?? [] + ) +} + +export function stripSurroundingQuotes(str: string): string { + return str.replace(regexStripQuotes, '') +} diff --git a/src/tools/utils.ts b/src/tools/utils.ts index a8eacfd..50fabba 100644 --- a/src/tools/utils.ts +++ b/src/tools/utils.ts @@ -1,52 +1,17 @@ import { type CachedMetadata, getAllTags, - Notice, parseFrontMatterAliases, Platform, } from 'obsidian' -import { - excerptAfter, - excerptBefore, - getChsSegmenter, - getTextExtractor, - highlightClass, - isSearchMatch, - regexLineSplit, - regexStripQuotes, - regexYaml, - SPACE_OR_PUNCTUATION, - type SearchMatch, -} from '../globals' +import { getTextExtractor, isSearchMatch, type SearchMatch } from '../globals' import { canIndexUnsupportedFiles, settings } from '../settings' import { type BinaryLike, createHash } from 'crypto' import { md5 } from 'pure-md5' -export function highlighter(str: string): string { - return `${str}` -} - -export function highlighterGroups(substring: string, ...args: any[]): string { - return `${substring}` -} - -export function escapeHTML(html: string): string { - return html - .replaceAll('&', '&') - .replaceAll('<', '<') - .replaceAll('>', '>') - .replaceAll('"', '"') - .replaceAll("'", ''') -} - -export function splitLines(text: string): string[] { - return text.split(regexLineSplit).filter(l => !!l && l.length > 2) -} - -export function removeFrontMatter(text: string): string { - // Regex to recognize YAML Front Matter (at beginning of file, 3 hyphens, than any charecter, including newlines, then 3 hyphens). - return text.replace(regexYaml, '') -} +// export function highlighter(str: string): string { +// return `${str}` +// } export function pathWithoutFilename(path: string): string { const split = path.split('/') @@ -79,20 +44,6 @@ export function getAllIndices(text: string, regex: RegExp): SearchMatch[] { .filter(isSearchMatch) } -/** - * Used to find excerpts in a note body, or select which words to highlight - */ -export function stringsToRegex(strings: string[]): RegExp { - if (!strings.length) return /^$/g - - // sort strings by decreasing length, so that longer strings are matched first - strings.sort((a, b) => b.length - a.length) - - const joined = `(${strings.map(s => escapeRegex(s)).join('|')})` - - return new RegExp(`${joined}`, 'giu') -} - export function extractHeadingsFromCache( cache: CachedMetadata, level: number @@ -106,69 +57,6 @@ export function loopIndex(index: number, nbItems: number): number { return (index + nbItems) % nbItems } -export function makeExcerpt(content: string, offset: number): string { - try { - const pos = offset ?? -1 - const from = Math.max(0, pos - excerptBefore) - const to = Math.min(content.length, pos + excerptAfter) - if (pos > -1) { - content = - (from > 0 ? '…' : '') + - content.slice(from, to).trim() + - (to < content.length - 1 ? '…' : '') - } else { - content = content.slice(0, excerptAfter) - } - if (settings.renderLineReturnInExcerpts) { - const lineReturn = new RegExp(/(?:\r\n|\r|\n)/g) - // Remove multiple line returns - content = content - .split(lineReturn) - .filter(l => l) - .join('\n') - - const last = content.lastIndexOf('\n', pos - from) - - if (last > 0) { - content = content.slice(last) - } - } - - content = escapeHTML(content) - - if (settings.renderLineReturnInExcerpts) { - content = content.trim().replaceAll('\n', '
') - } - - return content - } catch (e) { - new Notice( - 'Omnisearch - Error while creating excerpt, see developer console' - ) - console.error(`Omnisearch - Error while creating excerpt`) - console.error(e) - return '' - } -} - -/** - * splits a string in words or "expressions in quotes" - * @param str - * @returns - */ -export function splitQuotes(str: string): string[] { - return ( - str - .match(/"(.*?)"/g) - ?.map(s => s.replace(/"/g, '')) - .filter(q => !!q) ?? [] - ) -} - -export function stripSurroundingQuotes(str: string): string { - return str.replace(regexStripQuotes, '') -} - function mapAsync( array: T[], callbackfn: (value: T, index: number, array: T[]) => Promise @@ -263,7 +151,7 @@ export function isContentIndexable(path: string): boolean { export function isFilenameIndexable(path: string): boolean { return ( - (canIndexUnsupportedFiles()) || + canIndexUnsupportedFiles() || isFilePlaintext(path) || isFileCanvas(path) || isFileFromDataloomPlugin(path) @@ -329,13 +217,13 @@ export function chunkArray(arr: T[], len: number): T[][] { export function splitCamelCase(text: string): string[] { // if no camel case found, do nothing if (!/[a-z][A-Z]/.test(text)) { - return []; + return [] } const splittedText = text .replace(/([a-z](?=[A-Z]))/g, '$1 ') .split(' ') - .filter(t => t); - return splittedText; + .filter(t => t) + return splittedText } /**