Skip to content

Commit

Permalink
feat: Enable support to dictionary alphabet and accents. (#2355)
Browse files Browse the repository at this point in the history
This feature improves the suggestions shown by the spell checker.

It allows dictionary authors to specify character sequences that should be considered similar.

When defining a dictionary, it is now possible to rank certain suggestions higher or lower by adding edit costs to the new section `dictionaryInformation`.


* refactor and adjust suggestion weights
* pull pipe library into trie lib for now
   - At some point this will get split out into its own library, but for now it is useful to have in trie-lib.
* refactor `mapDictionaryInfo`
* Make pipe coverage 100%
* Add initial pass at alphabet and accents
* refactor: rename pipe operators
* refactor mapDictionaryInfo
* Support `v is T` on filters.
* Reduce duplicate code
* Make progress with accent replacement.
* Add default support for accents.
* Create WeightMap when a dictionary is created.
* Increase the default cost
   - Make sure letters not in the alphabet are more expensive.
* Update api.d.ts
  • Loading branch information
Jason3S committed Jan 29, 2022
1 parent c2f1101 commit b33453b
Show file tree
Hide file tree
Showing 65 changed files with 2,538 additions and 675 deletions.
108 changes: 104 additions & 4 deletions cspell.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,32 @@
],
"type": "string"
},
"CharacterSet": {
"description": "This is a set of characters that can include `-` or `|`\n- `-` - indicates a range of characters: `a-c` => `abc`\n- `|` - is a group separator, indicating that the characters on either side are not related.",
"type": "string"
},
"CharacterSetCosts": {
"additionalProperties": false,
"properties": {
"characters": {
"$ref": "#/definitions/CharacterSet",
"description": "This is a set of characters that can include `-` or `|`\n- `-` - indicates a range of characters: `a-c` => `abc`\n- `|` - is a group separator, indicating that the characters on either side are not related."
},
"cost": {
"description": "the cost to insert / delete / replace / swap the characters in a group",
"type": "number"
},
"penalty": {
"description": "The penalty cost to apply if the accent is used. This is used to discourage",
"type": "number"
}
},
"required": [
"characters",
"cost"
],
"type": "object"
},
"CostMapDefInsDel": {
"additionalProperties": false,
"properties": {
Expand Down Expand Up @@ -365,6 +391,40 @@
"additionalProperties": false,
"description": "Use by dictionary authors to help improve the quality of suggestions given from the dictionary.\n\nAdded with `v5.16.0`.",
"properties": {
"accents": {
"anyOf": [
{
"$ref": "#/definitions/CharacterSet"
},
{
"items": {
"$ref": "#/definitions/CharacterSetCosts"
},
"type": "array"
}
],
"default": "̀-́",
"description": "The accent characters"
},
"alphabet": {
"anyOf": [
{
"$ref": "#/definitions/CharacterSet"
},
{
"items": {
"$ref": "#/definitions/CharacterSetCosts"
},
"type": "array"
}
],
"default": "a-zA-Z",
"description": "The alphabet to use."
},
"costs": {
"$ref": "#/definitions/EditCosts",
"description": "Define edit costs."
},
"hunspellInformation": {
"$ref": "#/definitions/HunspellInformation",
"description": "Used by dictionary authors"
Expand Down Expand Up @@ -405,6 +465,37 @@
],
"description": "Reference to a dictionary by name. One of:\n- {@link DictionaryRef } \n- {@link DictionaryNegRef }"
},
"EditCosts": {
"additionalProperties": false,
"properties": {
"accentCosts": {
"default": 1,
"description": "The cost to add / remove an accent This should be very cheap, it helps with fixing accent issues.",
"type": "number"
},
"baseCost": {
"default": 100,
"description": "This is the base cost for making an edit.",
"type": "number"
},
"capsCosts": {
"default": 1,
"description": "The cost to change capitalization. This should be very cheap, it helps with fixing capitalization issues.",
"type": "number"
},
"firstLetterPenalty": {
"default": 4,
"description": "The extra cost incurred for changing the first letter of a word. This value should be less than `100 - baseCost`.",
"type": "number"
},
"nonAlphabetCosts": {
"default": 110,
"description": "This is the cost for characters not in the alphabet.",
"type": "number"
}
},
"type": "object"
},
"FSPathResolvable": {
"$ref": "#/definitions/FsPath",
"description": "A File System Path.\n\nSpecial Properties:\n- `${cwd}` prefix - will be replaced with the current working directory.\n- Relative paths are relative to the configuration file."
Expand Down Expand Up @@ -447,14 +538,19 @@
"description": "The cost to add / remove an accent This should be very cheap, it helps with fixing accent issues.",
"type": "number"
},
"baseCost": {
"default": 100,
"description": "This is the base cost for making an edit.",
"type": "number"
},
"capsCosts": {
"default": 1,
"description": "The cost to change capitalization. This should be very cheap, it helps with fixing capitalization issues.",
"type": "number"
},
"firstLetterPenalty": {
"default": 4,
"description": "The extra cost incurred for changing the first letter of a word. This value should be less than `100 - tryCharCost`.",
"description": "The extra cost incurred for changing the first letter of a word. This value should be less than `100 - baseCost`.",
"type": "number"
},
"ioConvertCost": {
Expand All @@ -463,7 +559,7 @@
"type": "number"
},
"keyboardCost": {
"default": 94,
"default": 99,
"description": "The cost of replacing or swapping any adjacent keyboard characters.\n\nThis should be slightly cheaper than `tryCharCost`.",
"type": "number"
},
Expand All @@ -472,14 +568,18 @@
"description": "mapSet replacement cost is the cost to substitute one character with another from the same set.\n\nMap characters are considered very similar to each other and are often the cause of simple mistakes.",
"type": "number"
},
"nonAlphabetCosts": {
"default": 110,
"description": "This is the cost for characters not in the alphabet.",
"type": "number"
},
"replaceCosts": {
"default": 75,
"description": "The cost to substitute pairs found in the replace settings.",
"type": "number"
},
"tryCharCost": {
"default": 95,
"description": "The cost of inserting / deleting / or swapping any `tryChars`",
"description": "The cost of inserting / deleting / or swapping any `tryChars` Defaults to `baseCosts`",
"type": "number"
}
},
Expand Down
3 changes: 2 additions & 1 deletion packages/cspell-lib/api/api.d.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/// <reference types="node" />
import { Glob, CSpellSettingsWithSourceTrace, ReplaceMap, DictionaryDefinitionPreferred, DictionaryDefinitionAugmented, DictionaryDefinitionCustom, TextOffset, TextDocumentOffset, PnPSettings, ImportFileRef, CSpellUserSettings, LocaleId, CSpellSettings } from '@cspell/cspell-types';
import { Glob, CSpellSettingsWithSourceTrace, ReplaceMap, DictionaryInformation, DictionaryDefinitionPreferred, DictionaryDefinitionAugmented, DictionaryDefinitionCustom, TextOffset, TextDocumentOffset, PnPSettings, ImportFileRef, CSpellUserSettings, LocaleId, CSpellSettings } from '@cspell/cspell-types';
export * from '@cspell/cspell-types';
import { CompoundWordsMethod, SuggestionResult, SuggestionCollector, WeightMap } from 'cspell-trie-lib';
export { CompoundWordsMethod, SuggestionCollector, SuggestionResult } from 'cspell-trie-lib';
Expand Down Expand Up @@ -162,6 +162,7 @@ interface SpellingDictionaryOptions {
caseSensitive?: boolean;
noSuggest?: boolean;
weightMap?: WeightMap | undefined;
dictionaryInformation?: DictionaryInformation;
}
interface SpellingDictionary {
readonly name: string;
Expand Down
4 changes: 2 additions & 2 deletions packages/cspell-lib/src/Settings/DictionarySettings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import {
} from '../Models/CSpellSettingsInternalDef';
import { createDictionaryReferenceCollection } from './DictionaryReferenceCollection';
import { mapDictionaryInformationToWeightMap, WeightMap } from 'cspell-trie-lib';
import { DictionaryInformation } from '@cspell/cspell-types/dist/DictionaryInformation';
import { DictionaryInformation } from '@cspell/cspell-types';
import { RequireOptional, UnionFields } from '../util/types';

export type DefMapArrayItem = [string, DictionaryDefinitionInternal];
Expand Down Expand Up @@ -93,7 +93,7 @@ export function mapDictDefToInternal(
/**
 * Type guard that narrows `d` to `DictionaryDefinitionInternalWithSource`.
 *
 * @param d - the dictionary definition to test.
 * @returns `true` when `d` is treated as a definition carrying source information.
 */
export function isDictionaryDefinitionWithSource(
d: DictionaryDefinition | DictionaryDefinitionInternalWithSource
): d is DictionaryDefinitionInternalWithSource {
// NOTE(review): this rendered diff shows two consecutive return statements without +/- markers.
// Presumably the `__source` check is the removed line and the `instanceof` check is its
// replacement -- confirm against the actual file before relying on either.
return (d as DictionaryDefinitionInternalWithSource).__source !== undefined;
return d instanceof _DictionaryDefinitionInternalWithSource;
}

function determineName(filename: string, options: DictionaryDefinition): string {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { ReplaceMap } from '@cspell/cspell-types';
import type { ReplaceMap, DictionaryInformation } from '@cspell/cspell-types';
import { CompoundWordsMethod, SuggestionCollector, SuggestionResult, WeightMap } from 'cspell-trie-lib';

export { CompoundWordsMethod, SuggestionCollector, SuggestionResult } from 'cspell-trie-lib';
Expand Down Expand Up @@ -58,6 +58,7 @@ export interface SpellingDictionaryOptions {
caseSensitive?: boolean;
noSuggest?: boolean;
weightMap?: WeightMap | undefined;
dictionaryInformation?: DictionaryInformation;
}

export interface SpellingDictionary {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
import { CompoundWordsMethod, SuggestionCollector, SuggestionResult } from 'cspell-trie-lib';
import { DictionaryInformation } from '@cspell/cspell-types';
import {
CompoundWordsMethod,
mapDictionaryInformationToWeightMap,
SuggestionCollector,
SuggestionResult,
WeightMap,
} from 'cspell-trie-lib';
import { genSequence } from 'gensequence';
import { isUpperCase, removeAccents, ucFirst } from '../util/text';
import { HasOptions, SearchOptions, SpellingDictionary, SuggestOptions } from './SpellingDictionary';
Expand Down Expand Up @@ -124,6 +131,10 @@ export function suggestArgsToSuggestOptions(args: SuggestArgs): SuggestOptions {
return suggestOptions;
}

/**
 * Create the suggestion `WeightMap` described by a dictionary's `DictionaryInformation`.
 *
 * Delegates the conversion to `mapDictionaryInformationToWeightMap`.
 *
 * @param di - the dictionary information to convert.
 * @returns the resulting weight map.
 */
export function createWFromDictionaryInformation(di: DictionaryInformation): WeightMap {
    const weightMap = mapDictionaryInformationToWeightMap(di);
    return weightMap;
}

export const __testMethods = {
wordSearchForms,
wordSearchFormsArray,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { SpellingDictionaryOptions } from '.';
import { DictionaryInformation } from '..';
import { createFailedToLoadDictionary, createSpellingDictionary } from './createSpellingDictionary';
import { SpellingDictionaryLoadError } from './SpellingDictionaryError';

Expand Down Expand Up @@ -67,6 +68,20 @@ describe('Validate createSpellingDictionary', () => {
]);
expect(d.suggest('geschaft', { ignoreCase: false }).map((r) => r.word)).toEqual(['Geschäft']);
});

// cspell:ignore fone failor
test.each`
word | ignoreCase | expected
${'Geschäft'} | ${false} | ${[c('Geschäft', 0)]}
${'Geschaft'} | ${false} | ${[c('Geschäft', 1)]}
${'fone'} | ${false} | ${[c('phone', 70), c('gone', 100)]}
${'failor'} | ${false} | ${[c('failure', 70), c('sailor', 100), c('failed', 175), c('fail', 200)]}
`('createSpellingDictionary with dictionaryInformation "$word" "$ignoreCase"', ({ word, ignoreCase, expected }) => {
const words = sampleWords();
const options = { ...opts(), dictionaryInformation: sampleDictionaryInformation({}) };
const d = createSpellingDictionary(words, 'test create', __filename, options);
expect(d.suggest(word, { ignoreCase, numSuggestions: 4 })).toEqual(expected);
});
});

function opts(opts: Partial<SpellingDictionaryOptions> = {}): SpellingDictionaryOptions {
Expand All @@ -75,3 +90,44 @@ function opts(opts: Partial<SpellingDictionaryOptions> = {}): SpellingDictionary
...opts,
};
}

/**
 * Build an expected suggestion entry pairing a word with its cost.
 *
 * @param word - the suggested word.
 * @param cost - the cost associated with the suggestion.
 * @returns an object of the form `{ word, cost }`.
 */
function c(word: string, cost: number) {
    const entry = { word: word, cost: cost };
    return entry;
}

/**
 * Produce the `DictionaryInformation` used by the tests.
 *
 * Starts from a default set of `suggestionEditCosts` and lets any field
 * supplied in `di` override the defaults.
 *
 * @param di - optional overrides merged on top of the defaults.
 * @returns a new `DictionaryInformation` object.
 */
function sampleDictionaryInformation(di: DictionaryInformation = {}): DictionaryInformation {
    const defaults: DictionaryInformation = {
        suggestionEditCosts: [
            {
                map: 'f(ph)(gh)|(ail)(ale)|(ur)(er)(ure)(or)',
                replace: 70,
            },
            {
                map: 'aeiou', // cspell:ignore aeiou
                replace: 75,
                swap: 75,
            },
            {
                description: 'common vowel sounds.',
                map: 'o(oh)(oo)|(oo)(ou)|(oa)(ou)',
                replace: 65,
            },
        ],
    };
    // Fields in `di` take precedence over the defaults.
    return { ...defaults, ...di };
}

/**
 * Word list used to build the test dictionary.
 *
 * The groups are concatenated in the order listed; repeated words
 * (for example `fall`) are kept exactly as they appear.
 *
 * @returns a flat array of sample words.
 */
function sampleWords() {
    const accented = ['Geschäft'.normalize('NFD'), 'café', 'book', "Aujourd'hui", 'cafe'];
    const goForms = ['go', 'going', 'goes', 'gone'];
    const phoneAndCall = ['phone', 'fall', 'phones', 'phoning', 'call', 'caller', 'called'];
    const failForms = ['fail', 'fall', 'failed', 'failing', 'failure'];
    const throughWords = ['enough', 'though', 'through'];
    const vowelPairs = ['soup', 'soap', 'sooth', 'boot', 'boat'];
    const sailForms = ['sail', 'sailor', 'sailing', 'sails', 'sailed'];
    const saleForms = ['sale', 'sold', 'sales', 'selling'];
    const tails = ['tale', 'tail'];
    return accented.concat(
        goForms,
        phoneAndCall,
        failForms,
        throughWords,
        vowelPairs,
        sailForms,
        saleForms,
        tails
    );
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { SpellingDictionaryFromTrie } from './SpellingDictionaryFromTrie';
import { SpellingDictionary, SpellingDictionaryOptions } from './SpellingDictionary';
import { SpellingDictionaryLoadError } from './SpellingDictionaryError';
import { operators } from 'gensequence';
import { createWFromDictionaryInformation } from './SpellingDictionaryMethods';

const defaultOptions: SpellingDictionaryOptions = Object.freeze({
weightMap: undefined,
Expand All @@ -18,7 +19,11 @@ export function createSpellingDictionary(
// console.log(`createSpellingDictionary ${name} ${source}`);
const words = parseDictionaryLines(wordList);
const trie = buildTrieFast(words);
return new SpellingDictionaryFromTrie(trie, name, options || defaultOptions, source);
const opts = { ...(options || defaultOptions) };
if (opts.weightMap === undefined && opts.dictionaryInformation) {
opts.weightMap = createWFromDictionaryInformation(opts.dictionaryInformation);
}
return new SpellingDictionaryFromTrie(trie, name, opts, source);
}

export function createForbiddenWordsDictionary(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Array [
"Trie",
"TrieBuilder",
"WORD_SEPARATOR",
"_pipe",
"buildTrie",
"buildTrieFast",
"consolidate",
Expand Down
3 changes: 3 additions & 0 deletions packages/cspell-trie-lib/src/index.ts
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
import * as pipe from './pipe';
export * from './lib';

/** Exposes everything imported from the local `./pipe` module under the single `_pipe` export. */
export const _pipe = pipe;
Loading

0 comments on commit b33453b

Please sign in to comment.