#176 - WIP tokenization of CamelCase words

Technically works, but highlighting needs a rework
This commit is contained in:
Simon Cambier
2023-02-24 12:16:40 +01:00
parent 4df09aa70c
commit 96b4ac631d
2 changed files with 10 additions and 1 deletion

View File

@@ -10,6 +10,7 @@ import { settings } from '../settings'
import {
chunkArray,
removeDiacritics,
splitCamelCase,
stringsToRegex,
stripMarkdownCharacters,
} from '../tools/utils'
@@ -25,7 +26,7 @@ const tokenize = (text: string): string[] => {
return tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
} else return tokens
} else return tokens.flatMap(splitCamelCase)
}
export class Omnisearch {

View File

@@ -307,3 +307,11 @@ export function chunkArray<T>(arr: T[], len: number): T[][] {
return chunks
}
/**
 * Splits a camelCase / PascalCase word into its component words,
 * e.g. 'fooBarBAZLorem' becomes ['foo', 'Bar', 'BAZ', 'Lorem'].
 *
 * A boundary is recognised in two places (ASCII letters only):
 *  - between a lowercase letter and the uppercase letter that follows it
 *    ('fooBar' -> 'foo', 'Bar');
 *  - between the last letter of an uppercase run and the capitalised word
 *    that follows it ('BAZLorem' -> 'BAZ', 'Lorem').
 *
 * The split is done by inserting a space at each boundary and then splitting
 * on spaces, so any space already present in `text` also acts as a separator.
 *
 * @param text The word to split.
 * @returns The component words in their original order and casing. A text
 * without any boundary (including the empty string) is returned as a
 * single-element array.
 */
export function splitCamelCase(text: string): string[] {
  // Only lookaheads are used (no lookbehind): each alternative captures the
  // letter that ends a word, and the replacement re-emits it followed by a
  // space. The group that did not participate expands to an empty string.
  return text
    .replace(/([a-z])(?=[A-Z])|([A-Z])(?=[A-Z][a-z])/g, '$1$2 ')
    .split(' ')
}