Skip to content

Commit

Permalink
fix: *minor breaking* fix issues with accents and the word splitter (#…
Browse files Browse the repository at this point in the history
…1330)

* fix: fix issues with accents and the word splitter
  • Loading branch information
Jason3S committed Jun 10, 2021
1 parent 9769944 commit 845c314
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 62 deletions.
2 changes: 1 addition & 1 deletion integration-tests/config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@
{
"path": "django/django",
"url": "https://github.com/django/django.git",
"commit": "ecf8af79355c8daa67722bd0de946b351f7f613d",
"commit": "8b4983cfd429e17c8092f4ff775915327effa6fa",
"args": [
"**/*.{md,py}"
]
Expand Down
97 changes: 55 additions & 42 deletions integration-tests/snapshots/django/django/snapshot.txt

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions packages/cspell-lib/src/util/text.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,21 @@ describe('Util Text', () => {
).toEqual(['Γ', 'γ', 'gamma', 'γάμμα']);
});

// Each row feeds `text` to Text.extractWordsFromCode and compares the extracted
// word texts with `expected`. Accented samples are supplied in both composed
// (NFC) and decomposed (NFD) form.
test.each`
text | expected
${'hello'} | ${['hello']}
${nfc('café')} | ${[nfc('café')]}
${nfd('café')} | ${[nfd('café')]}
${nfd('caféStyle')} | ${[nfd('café'), 'Style']}
${nfc('caféÁ')} | ${[nfc('café'), nfc('Á')]}
${nfd('caféÁ')} | ${[nfd('café'), nfd('Á')]}
`('extractWordsFromCode "$text"', ({ text, expected }) => {
    const extracted = Text.extractWordsFromCode(text);
    const wordTexts = extracted.map((word) => word.text).toArray();
    expect(wordTexts).toEqual(expected);
});

test('case of Chinese characters', () => {
expect(Text.isUpperCase('携程旅行网')).toBe(false);
expect(Text.isLowerCase('携程旅行网')).toBe(false);
Expand Down Expand Up @@ -349,6 +364,14 @@ describe('Validates offset conversions', () => {
});
});

/**
 * Convert a string to Unicode Normalization Form C (canonical composition).
 *
 * @param s - the text to normalize.
 * @returns `s` normalized to NFC.
 */
function nfc(s: string): string {
    const composed = s.normalize('NFC');
    return composed;
}

/**
 * Convert a string to Unicode Normalization Form D (canonical decomposition).
 *
 * @param s - the text to normalize.
 * @returns `s` normalized to NFD.
 */
function nfd(s: string): string {
    const decomposed = s.normalize('NFD');
    return decomposed;
}

function match(regexp: RegExp, text: string): (string | number)[] {
const x = Text.matchStringToTextOffset(regexp, text)
.concatMap((t) => [t.text, t.offset])
Expand Down
2 changes: 1 addition & 1 deletion packages/cspell-lib/src/util/text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ export function cleanText(text: string): string {

export function cleanTextOffset(text: TextOffset): TextOffset {
return {
text: text.text.replace(regExIgnoreCharacters, (match: string) => ' '.repeat(match.length)),
text: cleanText(text.text),
offset: text.offset,
};
}
Expand Down
139 changes: 139 additions & 0 deletions packages/cspell-lib/src/util/textRegex.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import {
regExAccents,
regExAllLower,
regExAllUpper,
regExDanglingQuote,
regExFirstUpper,
regExSplitWords,
regExSplitWords2,
regExTrailingEndings,
} from './textRegex';

/**
 * Table-driven checks for the regular expressions exported by `./textRegex`.
 *
 * Accented samples are passed through `nfc(...)` and `nfd(...)` so that each
 * expression is exercised against both the composed and the decomposed
 * spelling of the same text.
 *
 * Tests built on `String.prototype.matchAll` compare against a list of matches,
 * each match being `[fullMatch, ...captureGroups]`, hence `expected: string[][]`.
 * Tests built on `String.prototype.match` compare against the flat array that
 * `match` returns, hence `expected: string[]`.
 */
describe('Validate textRegex', () => {
    // cspell:ignore CODE'ing
    test.each`
text | expected
${'hello'} | ${[]}
${'CODEing'} | ${[['ing']]}
${"CODE'ing"} | ${[["'ing"]]}
${"ERROR'd"} | ${[["'d"]]}
${"ERROR's"} | ${[["'s"]]}
${'ERRORs'} | ${[['s']]}
${'ERRORes'} | ${[['es']]}
${'ERRORth'} | ${[['th']]}
${'ERRORnth'} | ${[['nth']]}
${'ERRORies'} | ${[['ies']]}
${nfc('CAFÉed')} | ${[['ed']]}
${nfd('CAFÉed')} | ${[['ed']]}
${nfd('CAFÉ’ed')} | ${[['’ed']]}
${nfd('CAFÉ’s')} | ${[['’s']]}
`('regExTrailingEndings on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => {
        const matches = [...text.matchAll(regExTrailingEndings)].map((match) => Array.from(match));
        expect(matches).toEqual(expected);
    });

    test.each`
text | expected
${'hello'} | ${[]}
${"ERROR's"} | ${[]}
${"'thing"} | ${["'"]}
${"n'cpp"} | ${["'"]}
${"s'thing"} | ${["'"]}
${"A'thing"} | ${["'"]}
${"s 'thing"} | ${["'"]}
${nfc(`é'thing`)} | ${["'"]}
${nfd(`é'thing`)} | ${["'"]}
`('regExDanglingQuote on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const found = text.match(regExDanglingQuote) ?? [];
        expect([...found]).toEqual(expected);
    });

    // `nfd('á').replace('a', '')` strips the base letter from the decomposed
    // character, leaving only its combining accent mark as the expected match.
    test.each`
text | expected
${'hello'} | ${[]}
${"ERROR's"} | ${[]}
${nfc(`é'thing`)} | ${[]}
${nfd(`é'thing`)} | ${[nfd('á').replace('a', '')]}
`('regExAccents on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const found = text.match(regExAccents) ?? [];
        expect([...found]).toEqual(expected);
    });

    // cspell:word érror
    test.each`
text | expected
${'hello'} | ${[]}
${'ERROR'} | ${['ERROR']}
${'ERRORs'} | ${[]}
${nfc(`érror`).toUpperCase()} | ${[nfc('ÉRROR')]}
${nfd(`érror`).toUpperCase()} | ${[nfd('ÉRROR')]}
`('regExAllUpper on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const found = text.match(regExAllUpper) ?? [];
        expect([...found]).toEqual(expected);
    });

    test.each`
text | expected
${'hello'} | ${['hello']}
${'ERROR'} | ${[]}
${'Errors'} | ${[]}
${nfc(`érror`)} | ${[nfc('érror')]}
${nfd(`érror`)} | ${[nfd('érror')]}
${nfc(`érror`)} | ${[nfc('érror')]}
${nfc(`café`)} | ${[nfc('café')]}
${nfd(`café`)} | ${[nfd('café')]}
`('regExAllLower on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const found = text.match(regExAllLower) ?? [];
        expect([...found]).toEqual(expected);
    });

    test.each`
text | expected
${'hello'} | ${[]}
${'ERROR'} | ${[]}
${'Errors'} | ${['Errors']}
${nfc(`Érror`)} | ${[nfc('Érror')]}
${nfd(`Érror`)} | ${[nfd('Érror')]}
`('regExFirstUpper on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const found = text.match(regExFirstUpper) ?? [];
        expect([...found]).toEqual(expected);
    });

    // Each expected match is [fullMatch, lowerSide, upperSide].
    test.each`
text | expected
${'hello'} | ${[]}
${'errorCode'} | ${[['rC', 'r', 'C']]}
${nfc('caféStyle')} | ${[[nfc('éS'), nfc('é'), 'S']]}
${nfd('caféStyle')} | ${[[nfd('éS'), nfd('é'), 'S']]}
${'Errors'} | ${[]}
`('regExSplitWords on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => {
        const matches = [...text.matchAll(regExSplitWords)].map((match) => Array.from(match));
        expect(matches).toEqual(expected);
    });

    // Each expected match is [fullMatch, lastUpperOfFirstWord, startOfNextWord].
    test.each`
text | expected
${'hello'} | ${[]}
${'ERRORCode'} | ${[['RCo', 'R', 'Co']]}
${nfc('CAFÉStyle')} | ${[[nfc('ÉSt'), nfc('É'), 'St']]}
${nfd('CAFÉStyle')} | ${[[nfd('ÉSt'), nfd('É'), 'St']]}
${nfc('CODEÉrror')} | ${[[nfc('EÉr'), 'E', nfc('Ér')]]}
${nfd('CODEÉrror')} | ${[[nfd('EÉr'), 'E', nfd('Ér')]]}
${'ERRORS'} | ${[]}
`('regExSplitWords2 on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => {
        const matches = [...text.matchAll(regExSplitWords2)].map((match) => Array.from(match));
        expect(matches).toEqual(expected);
    });
});

// function s(t: string, on: string | RegExp = '|'): string[] {
// return t.split(on);
// }

/**
 * Convert a string to Unicode Normalization Form C (canonical composition).
 *
 * @param s - the text to normalize.
 * @returns `s` normalized to NFC.
 */
function nfc(s: string): string {
    const composed = s.normalize('NFC');
    return composed;
}

/**
 * Convert a string to Unicode Normalization Form D (canonical decomposition).
 *
 * @param s - the text to normalize.
 * @returns `s` normalized to NFD.
 */
function nfd(s: string): string {
    const decomposed = s.normalize('NFD');
    return decomposed;
}
11 changes: 6 additions & 5 deletions packages/cspell-lib/src/util/textRegex.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
// cspell:ignore ings ning gimuy anrvtbf

export const regExLines = /.*(\r?\n|$)/g;
export const regExUpperSOrIng = /(\p{Lu}+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu;
export const regExUpperSOrIng = /([\p{Lu}\p{M}]+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu;
export const regExSplitWords = /(\p{Ll}\p{M}?)(\p{Lu})/gu;
export const regExSplitWords2 = /(\p{Lu}\p{M}?)(\p{Lu}\p{M}?\p{Ll})/gu;
export const regExWords = /\p{L}(?:(?:\\?['’])?\p{L})*/gu;
export const regExWords = /\p{L}\p{M}?(?:(?:\\?['’])?\p{L}\p{M}?)*/gu;
export const regExWordsAndDigits = /(?:\d+)?[\p{L}\p{M}_'’-](?:(?:\\?['’])?[\p{L}\p{M}\w'’-])*/gu;
export const regExIgnoreCharacters = /\p{sc=Hiragana}|\p{sc=Han}|\p{sc=Katakana}|[\u30A0-\u30FF]|[\p{sc=Hangul}]/gu;
export const regExIgnoreCharacters = /[\p{sc=Hiragana}\p{sc=Han}\p{sc=Katakana}\u30A0-\u30FF\p{sc=Hangul}]/gu;
export const regExFirstUpper = /^\p{Lu}\p{M}?\p{Ll}+$/u;
export const regExAllUpper = /^(?:\p{Lu}\p{M}?)+$/u;
export const regExAllLower = /^(?:\p{Ll}\p{M}?)+$/u;
export const regExPossibleWordBreaks = /[_-]/g;
export const regExMatchRegExParts = /^\/(.*)\/([gimuy]*)$/;
export const regExAccents = /\p{M}/gu;
export const regExEscapeCharacters = /(?<=\\)[anrvtbf]/gi;
export const regExDanglingQuote = /(?<=\P{L}\p{L}?)[']/gu;
/** Matches against leading `'` or `{single letter}'` */
export const regExDanglingQuote = /(?<=(?:^|(?!\p{M})\P{L})(?:\p{L}\p{M}?)?)[']/gu;
/** Match trailing endings after CAPS words */
export const regExTrailingEndings = /(?<=\p{Lu}{2})['’]?(?:s|d|ing[s]|ies|e[ds]|ning|th|nth)(?!\p{Ll})/gu;
export const regExTrailingEndings = /(?<=(?:\p{Lu}\p{M}?){2})['’]?(?:s|d|ings?|ies|e[ds]?|ning|th|nth)(?!\p{Ll})/gu;
45 changes: 32 additions & 13 deletions packages/cspell-lib/src/util/wordSplitter.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ describe('Validate wordSplitter', () => {
}

// cspell:ignore CVTPD CVTSI CVTTSD words'separated'by errorcode
// cspell:word Geschäft gescha
test.each`
text | expectedWords
${'hello'} | ${[tov({ text: 'hello', offset: 155 })]}
Expand All @@ -124,6 +125,7 @@ describe('Validate wordSplitter', () => {
${'_errorcode42_one_two'} | ${splitTov('_errorcode42|one|two')}
${"words'separated'by_singleQuote"} | ${splitTov(`words'separated'by|singleQuote`)}
${"Tom's_hardware"} | ${splitTov("Tom's|hardware")}
${'Geschäft'} | ${splitTov('Geschäft')}
`('split $text', ({ text, expectedWords }: TestSplit) => {
const prefix = 'this is some';
const line = {
Expand Down Expand Up @@ -186,18 +188,22 @@ describe('Validate wordSplitter', () => {

// cspell:ignore nstatic techo n'cpp n'log refactor'd
test.each`
text | expectedWords | calls
${'static'} | ${'static'} | ${1}
${'nstatic'} | ${'static'} | ${1}
${'techo'} | ${'echo'} | ${1}
${`n'cpp`} | ${'cpp'} | ${1}
${`n'log`} | ${'log'} | ${7}
${'64-bit'} | ${'bit'} | ${1}
${'128-bit'} | ${'bit'} | ${1}
${'256-sha'} | ${'256-sha'} | ${6}
${`REFACTOR'd`} | ${'REFACTOR'} | ${2}
${`dogs'`} | ${`dogs'`} | ${2}
${`planets’`} | ${`planets’`} | ${2}
text | expectedWords | calls
${'static'} | ${'static'} | ${1}
${'nstatic'} | ${'static'} | ${1}
${'techo'} | ${'echo'} | ${1}
${`n'cpp`} | ${'cpp'} | ${1}
${`î'cpp`} | ${'î|cpp'} | ${2}
${`îphoneStatic`} | ${'îphone|Static'} | ${2}
${`êphoneStatic`} | ${'êphone|Static'} | ${2}
${`geschäft`} | ${'geschäft'} | ${1}
${`n'log`} | ${'log'} | ${7}
${'64-bit'} | ${'bit'} | ${1}
${'128-bit'} | ${'bit'} | ${1}
${'256-sha'} | ${'256-sha'} | ${6}
${`REFACTOR'd`} | ${'REFACTOR'} | ${2}
${`dogs'`} | ${`dogs'`} | ${2}
${`planets’`} | ${`planets’`} | ${2}
`('split `$text` in doc', ({ text, expectedWords, calls }: TestSplit2) => {
const expectedWordSegments = splitTov(expectedWords);
const doc = sampleText();
Expand All @@ -220,7 +226,8 @@ describe('Validate wordSplitter', () => {
});

function has({ text }: TextOffset): boolean {
return text.length < 3 || !regHasLetters.test(text) || words.has(text) || words.has(text.toLowerCase());
const nfcText = text.normalize('NFC');
return text.length < 3 || !regHasLetters.test(text) || words.has(nfcText) || words.has(nfcText.toLowerCase());
}

function applyWordBreaks(text: TextOffset, breaks: number[]): TextOffset[] {
Expand Down Expand Up @@ -328,6 +335,9 @@ function sampleWordSet() {
CVTPD2PS
CVTTSD
echo
îphone
êphone
Geschäft
error codes
hello
MOVSX_r_rm16
Expand Down Expand Up @@ -373,5 +383,14 @@ function sampleText() {
128-bit values
î'cpp
îphoneStatic
geschäft
êphoneStatic
`;
}

// cspell:ignore êphone îphone geschäft

0 comments on commit 845c314

Please sign in to comment.