fix: *minor breaking* fix issues with accents and the word splitter (#1330)

* fix: fix issues with accents and the word splitter
streetsidesoftware · Jun 10, 2021 · 845c314 · 845c314
1 parent 9769944
commit 845c314
Show file tree

Hide file tree

Showing 7 changed files with 257 additions and 62 deletions.
diff --git a/integration-tests/config/config.json b/integration-tests/config/config.json
@@ -191,7 +191,7 @@
     {
       "path": "django/django",
       "url": "https://github.com/django/django.git",
-      "commit": "ecf8af79355c8daa67722bd0de946b351f7f613d",
+      "commit": "8b4983cfd429e17c8092f4ff775915327effa6fa",
       "args": [
         "**/*.{md,py}"
       ]

diff --git a/integration-tests/snapshots/django/django/snapshot.txt b/integration-tests/snapshots/django/django/snapshot.txt
diff --git a/packages/cspell-lib/src/util/text.test.ts b/packages/cspell-lib/src/util/text.test.ts
@@ -196,6 +196,21 @@ describe('Util Text', () => {
         ).toEqual(['Γ', 'γ', 'gamma', 'γάμμα']);
     });
 
+    test.each`
+        text                | expected
+        ${'hello'}          | ${['hello']}
+        ${nfc('café')}      | ${[nfc('café')]}
+        ${nfd('café')}      | ${[nfd('café')]}
+        ${nfd('caféStyle')} | ${[nfd('café'), 'Style']}
+        ${nfc('caféÁ')}     | ${[nfc('café'), nfc('Á')]}
+        ${nfd('caféÁ')}     | ${[nfd('café'), nfd('Á')]}
+    `('extractWordsFromCode "$text"', ({ text, expected }) => {
+        const r = Text.extractWordsFromCode(text)
+            .map((wo) => wo.text)
+            .toArray();
+        expect(r).toEqual(expected);
+    });
+
     test('case of Chinese characters', () => {
         expect(Text.isUpperCase('携程旅行网')).toBe(false);
         expect(Text.isLowerCase('携程旅行网')).toBe(false);
@@ -349,6 +364,14 @@ describe('Validates offset conversions', () => {
     });
 });
 
+function nfc(s: string): string {
+    return s.normalize('NFC');
+}
+
+function nfd(s: string): string {
+    return s.normalize('NFD');
+}
+
 function match(regexp: RegExp, text: string): (string | number)[] {
     const x = Text.matchStringToTextOffset(regexp, text)
         .concatMap((t) => [t.text, t.offset])

diff --git a/packages/cspell-lib/src/util/text.ts b/packages/cspell-lib/src/util/text.ts
@@ -94,7 +94,7 @@ export function cleanText(text: string): string {
 
 export function cleanTextOffset(text: TextOffset): TextOffset {
     return {
-        text: text.text.replace(regExIgnoreCharacters, (match: string) => ' '.repeat(match.length)),
+        text: cleanText(text.text),
         offset: text.offset,
     };
 }

diff --git a/packages/cspell-lib/src/util/textRegex.test.ts b/packages/cspell-lib/src/util/textRegex.test.ts
@@ -0,0 +1,139 @@
+import {
+    regExAccents,
+    regExAllLower,
+    regExAllUpper,
+    regExDanglingQuote,
+    regExFirstUpper,
+    regExSplitWords,
+    regExSplitWords2,
+    regExTrailingEndings,
+} from './textRegex';
+
+describe('Validate textRegex', () => {
+    // cspell:ignore CODE'ing
+    test.each`
+        text              | expected
+        ${'hello'}        | ${[]}
+        ${'CODEing'}      | ${[['ing']]}
+        ${"CODE'ing"}     | ${[["'ing"]]}
+        ${"ERROR'd"}      | ${[["'d"]]}
+        ${"ERROR's"}      | ${[["'s"]]}
+        ${'ERRORs'}       | ${[['s']]}
+        ${'ERRORes'}      | ${[['es']]}
+        ${'ERRORth'}      | ${[['th']]}
+        ${'ERRORnth'}     | ${[['nth']]}
+        ${'ERRORies'}     | ${[['ies']]}
+        ${nfc('CAFÉed')}  | ${[['ed']]}
+        ${nfd('CAFÉed')}  | ${[['ed']]}
+        ${nfd('CAFÉ’ed')} | ${[['’ed']]}
+        ${nfd('CAFÉ’s')}  | ${[['’s']]}
+    `('regExTrailingEndings on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
+        const m = [...text.matchAll(regExTrailingEndings)].map((m) => Array.from(m));
+        expect(m).toEqual(expected);
+    });
+
+    test.each`
+        text              | expected
+        ${'hello'}        | ${[]}
+        ${"ERROR's"}      | ${[]}
+        ${"'thing"}       | ${["'"]}
+        ${"n'cpp"}        | ${["'"]}
+        ${"s'thing"}      | ${["'"]}
+        ${"A'thing"}      | ${["'"]}
+        ${"s 'thing"}     | ${["'"]}
+        ${nfc(`é'thing`)} | ${["'"]}
+        ${nfd(`é'thing`)} | ${["'"]}
+    `('regExDanglingQuote on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
+        const m = text.match(regExDanglingQuote) ?? [];
+        expect([...m]).toEqual(expected);
+    });
+
+    test.each`
+        text              | expected
+        ${'hello'}        | ${[]}
+        ${"ERROR's"}      | ${[]}
+        ${nfc(`é'thing`)} | ${[]}
+        ${nfd(`é'thing`)} | ${[nfd('á').replace('a', '')]}
+    `('regExAccents on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
+        const m = text.match(regExAccents) ?? [];
+        expect([...m]).toEqual(expected);
+    });
+
+    // cspell:word érror
+    test.each`
+        text                          | expected
+        ${'hello'}                    | ${[]}
+        ${'ERROR'}                    | ${['ERROR']}
+        ${'ERRORs'}                   | ${[]}
+        ${nfc(`érror`).toUpperCase()} | ${[nfc('ÉRROR')]}
+        ${nfd(`érror`).toUpperCase()} | ${[nfd('ÉRROR')]}
+    `('regExAllUpper on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
+        const m = text.match(regExAllUpper) ?? [];
+        expect([...m]).toEqual(expected);
+    });
+
+    test.each`
+        text            | expected
+        ${'hello'}      | ${['hello']}
+        ${'ERROR'}      | ${[]}
+        ${'Errors'}     | ${[]}
+        ${nfc(`érror`)} | ${[nfc('érror')]}
+        ${nfd(`érror`)} | ${[nfd('érror')]}
+        ${nfc(`érror`)} | ${[nfc('érror')]}
+        ${nfc(`café`)}  | ${[nfc('café')]}
+        ${nfd(`café`)}  | ${[nfd('café')]}
+    `('regExAllLower on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
+        const m = text.match(regExAllLower) ?? [];
+        expect([...m]).toEqual(expected);
+    });
+
+    test.each`
+        text            | expected
+        ${'hello'}      | ${[]}
+        ${'ERROR'}      | ${[]}
+        ${'Errors'}     | ${['Errors']}
+        ${nfc(`Érror`)} | ${[nfc('Érror')]}
+        ${nfd(`Érror`)} | ${[nfd('Érror')]}
+    `('regExFirstUpper on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
+        const m = text.match(regExFirstUpper) ?? [];
+        expect([...m]).toEqual(expected);
+    });
+
+    test.each`
+        text                | expected
+        ${'hello'}          | ${[]}
+        ${'errorCode'}      | ${[['rC', 'r', 'C']]}
+        ${nfc('caféStyle')} | ${[[nfc('éS'), nfc('é'), 'S']]}
+        ${nfd('caféStyle')} | ${[[nfd('éS'), nfd('é'), 'S']]}
+        ${'Errors'}         | ${[]}
+    `('regExSplitWords on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
+        const m = [...text.matchAll(regExSplitWords)].map((m) => Array.from(m));
+        expect(m).toEqual(expected);
+    });
+
+    test.each`
+        text                | expected
+        ${'hello'}          | ${[]}
+        ${'ERRORCode'}      | ${[['RCo', 'R', 'Co']]}
+        ${nfc('CAFÉStyle')} | ${[[nfc('ÉSt'), nfc('É'), 'St']]}
+        ${nfd('CAFÉStyle')} | ${[[nfd('ÉSt'), nfd('É'), 'St']]}
+        ${nfc('CODEÉrror')} | ${[[nfc('EÉr'), 'E', nfc('Ér')]]}
+        ${nfd('CODEÉrror')} | ${[[nfd('EÉr'), 'E', nfd('Ér')]]}
+        ${'ERRORS'}         | ${[]}
+    `('regExSplitWords2 on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
+        const m = [...text.matchAll(regExSplitWords2)].map((m) => Array.from(m));
+        expect(m).toEqual(expected);
+    });
+});
+
+// function s(t: string, on: string | RegExp = '|'): string[] {
+//     return t.split(on);
+// }
+
+function nfc(s: string): string {
+    return s.normalize('NFC');
+}
+
+function nfd(s: string): string {
+    return s.normalize('NFD');
+}
diff --git a/packages/cspell-lib/src/util/textRegex.ts b/packages/cspell-lib/src/util/textRegex.ts
@@ -1,19 +1,20 @@
 // cspell:ignore ings ning gimuy anrvtbf
 
 export const regExLines = /.*(\r?\n|$)/g;
-export const regExUpperSOrIng = /(\p{Lu}+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu;
+export const regExUpperSOrIng = /([\p{Lu}\p{M}]+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu;
 export const regExSplitWords = /(\p{Ll}\p{M}?)(\p{Lu})/gu;
 export const regExSplitWords2 = /(\p{Lu}\p{M}?)(\p{Lu}\p{M}?\p{Ll})/gu;
-export const regExWords = /\p{L}(?:(?:\\?['’])?\p{L})*/gu;
+export const regExWords = /\p{L}\p{M}?(?:(?:\\?['’])?\p{L}\p{M}?)*/gu;
 export const regExWordsAndDigits = /(?:\d+)?[\p{L}\p{M}_'’-](?:(?:\\?['’])?[\p{L}\p{M}\w'’-])*/gu;
-export const regExIgnoreCharacters = /\p{sc=Hiragana}|\p{sc=Han}|\p{sc=Katakana}|[\u30A0-\u30FF]|[\p{sc=Hangul}]/gu;
+export const regExIgnoreCharacters = /[\p{sc=Hiragana}\p{sc=Han}\p{sc=Katakana}\u30A0-\u30FF\p{sc=Hangul}]/gu;
 export const regExFirstUpper = /^\p{Lu}\p{M}?\p{Ll}+$/u;
 export const regExAllUpper = /^(?:\p{Lu}\p{M}?)+$/u;
 export const regExAllLower = /^(?:\p{Ll}\p{M}?)+$/u;
 export const regExPossibleWordBreaks = /[_-]/g;
 export const regExMatchRegExParts = /^\/(.*)\/([gimuy]*)$/;
 export const regExAccents = /\p{M}/gu;
 export const regExEscapeCharacters = /(?<=\\)[anrvtbf]/gi;
-export const regExDanglingQuote = /(?<=\P{L}\p{L}?)[']/gu;
+/** Matches against leading `'` or `{single letter}'` */
+export const regExDanglingQuote = /(?<=(?:^|(?!\p{M})\P{L})(?:\p{L}\p{M}?)?)[']/gu;
 /** Match trailing endings after CAPS words */
-export const regExTrailingEndings = /(?<=\p{Lu}{2})['’]?(?:s|d|ing[s]|ies|e[ds]|ning|th|nth)(?!\p{Ll})/gu;
+export const regExTrailingEndings = /(?<=(?:\p{Lu}\p{M}?){2})['’]?(?:s|d|ings?|ies|e[ds]?|ning|th|nth)(?!\p{Ll})/gu;
diff --git a/packages/cspell-lib/src/util/wordSplitter.test.ts b/packages/cspell-lib/src/util/wordSplitter.test.ts
@@ -109,6 +109,7 @@ describe('Validate wordSplitter', () => {
     }
 
     // cspell:ignore CVTPD CVTSI CVTTSD words'separated'by errorcode
+    // cspell:word Geschäft gescha
     test.each`
         text                                | expectedWords
         ${'hello'}                          | ${[tov({ text: 'hello', offset: 155 })]}
@@ -124,6 +125,7 @@ describe('Validate wordSplitter', () => {
         ${'_errorcode42_one_two'}           | ${splitTov('_errorcode42|one|two')}
         ${"words'separated'by_singleQuote"} | ${splitTov(`words'separated'by|singleQuote`)}
         ${"Tom's_hardware"}                 | ${splitTov("Tom's|hardware")}
+        ${'Geschäft'}                       | ${splitTov('Geschäft')}
     `('split $text', ({ text, expectedWords }: TestSplit) => {
         const prefix = 'this is some';
         const line = {
@@ -186,18 +188,22 @@ describe('Validate wordSplitter', () => {
 
     // cspell:ignore nstatic techo n'cpp n'log refactor'd
     test.each`
-        text            | expectedWords | calls
-        ${'static'}     | ${'static'}   | ${1}
-        ${'nstatic'}    | ${'static'}   | ${1}
-        ${'techo'}      | ${'echo'}     | ${1}
-        ${`n'cpp`}      | ${'cpp'}      | ${1}
-        ${`n'log`}      | ${'log'}      | ${7}
-        ${'64-bit'}     | ${'bit'}      | ${1}
-        ${'128-bit'}    | ${'bit'}      | ${1}
-        ${'256-sha'}    | ${'256-sha'}  | ${6}
-        ${`REFACTOR'd`} | ${'REFACTOR'} | ${2}
-        ${`dogs'`}      | ${`dogs'`}    | ${2}
-        ${`planets’`}   | ${`planets’`} | ${2}
+        text              | expectedWords      | calls
+        ${'static'}       | ${'static'}        | ${1}
+        ${'nstatic'}      | ${'static'}        | ${1}
+        ${'techo'}        | ${'echo'}          | ${1}
+        ${`n'cpp`}        | ${'cpp'}           | ${1}
+        ${`î'cpp`}        | ${'î|cpp'}         | ${2}
+        ${`îphoneStatic`} | ${'îphone|Static'} | ${2}
+        ${`êphoneStatic`} | ${'êphone|Static'} | ${2}
+        ${`geschäft`}     | ${'geschäft'}      | ${1}
+        ${`n'log`}        | ${'log'}           | ${7}
+        ${'64-bit'}       | ${'bit'}           | ${1}
+        ${'128-bit'}      | ${'bit'}           | ${1}
+        ${'256-sha'}      | ${'256-sha'}       | ${6}
+        ${`REFACTOR'd`}   | ${'REFACTOR'}      | ${2}
+        ${`dogs'`}        | ${`dogs'`}         | ${2}
+        ${`planets’`}     | ${`planets’`}      | ${2}
     `('split `$text` in doc', ({ text, expectedWords, calls }: TestSplit2) => {
         const expectedWordSegments = splitTov(expectedWords);
         const doc = sampleText();
@@ -220,7 +226,8 @@ describe('Validate wordSplitter', () => {
 });
 
 function has({ text }: TextOffset): boolean {
-    return text.length < 3 || !regHasLetters.test(text) || words.has(text) || words.has(text.toLowerCase());
+    const nfcText = text.normalize('NFC');
+    return text.length < 3 || !regHasLetters.test(text) || words.has(nfcText) || words.has(nfcText.toLowerCase());
 }
 
 function applyWordBreaks(text: TextOffset, breaks: number[]): TextOffset[] {
@@ -328,6 +335,9 @@ function sampleWordSet() {
     CVTPD2PS
     CVTTSD
     echo
+    îphone
+    êphone
+    Geschäft
     error codes
     hello
     MOVSX_r_rm16
@@ -373,5 +383,14 @@ function sampleText() {
 
     128-bit values
 
+    î'cpp
+    îphoneStatic
+
+    geschäft
+
+    êphoneStatic
+
 `;
 }
+
+// cspell:ignore êphone îphone geschäft