Skip to content

Commit

Permalink
fix: Reduce the cost of regexp exclusions (#1800)
Browse files Browse the repository at this point in the history
* fix: Reduce the cost of regexp exclusions
    Fix #1775
* Create TextRange.regexp.test.ts
* Update spellCheckFile.test.ts
  • Loading branch information
Jason3S committed Sep 28, 2021
1 parent 3f9e8c6 commit 4544c25
Show file tree
Hide file tree
Showing 8 changed files with 10,358 additions and 125 deletions.
15 changes: 15 additions & 0 deletions packages/cspell-lib/src/spellCheckFile.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
} from './spellCheckFile';

const samples = Path.resolve(__dirname, '../samples');
const testFixtures = Path.resolve(__dirname, '../../../test-fixtures');
const isWindows = process.platform === 'win32';
const hasDriveLetter = /^[A-Z]:\\/;

Expand Down Expand Up @@ -175,6 +176,16 @@ describe('Validate Spell Checking Documents', () => {
expect(r).toEqual(oc(expected));
}
);
test.each`
uri | text | settings | options | expected
${f(tf('issues/issue-1775/hunspell/utf_info.hxx'))} | ${''} | ${{}} | ${{}} | ${{ checked: true, errors: undefined }}
`(
'spellCheckFile fixtures $uri $settings $options',
async ({ uri, text, settings, options, expected }: TestSpellCheckFile) => {
const r = await spellCheckDocument(d(uri, text || undefined), options, settings);
expect(r).toEqual(oc(expected));
}
);
});

describe('Validate Uri assumptions', () => {
Expand Down Expand Up @@ -244,3 +255,7 @@ function s(file: string) {
// Force lowercase drive letter if windows
return isWindows ? fixDriveLetter(p) : p;
}

function tf(file: string) {
return Path.resolve(testFixtures, file);
}
16 changes: 13 additions & 3 deletions packages/cspell-lib/src/textValidator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -257,21 +257,31 @@ function mapTextOffsetsAgainstRanges(includeRanges: TextRange.MatchRange[]): (wo
rangePos -= 1;
}

const cur = includeRanges[rangePos];
if (wordEndPos <= cur.endPos && wordStartPos >= cur.startPos) {
return [textOffset];
}

while (wordStartPos < wordEndPos) {
while (includeRanges[rangePos] && includeRanges[rangePos].endPos <= wordStartPos) {
rangePos += 1;
}
if (!includeRanges[rangePos] || wordEndPos < includeRanges[rangePos].startPos) {
if (!includeRanges[rangePos]) {
break;
}
const { startPos, endPos } = includeRanges[rangePos];
if (wordEndPos < startPos) {
break;
}
const a = Math.max(wordStartPos, startPos);
const b = Math.min(wordEndPos, endPos);
parts.push({ offset: a, text: text.slice(a - offset, b - offset) });
if (a !== b) {
parts.push({ offset: a, text: text.slice(a - offset, b - offset) });
}
wordStartPos = b;
}

return parts.filter((wo) => !!wo.text);
return parts;
};

return mapper;
Expand Down
315 changes: 315 additions & 0 deletions packages/cspell-lib/src/util/TextRange.regexp.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,315 @@
import type { TextOffset } from '@cspell/cspell-types';
import * as RegPat from '../Settings/RegExpPatterns';
import { regExMatchCommonHexFormats, regExMatchUrls } from '../Settings/RegExpPatterns';
import { calculateTextDocumentOffsets } from './text';
import * as TextRange from './TextRange';

const matchUrl = regExMatchUrls.source;
const matchHexValues = regExMatchCommonHexFormats.source;

describe('Validate Text Ranges using Regexp', () => {
test('tests finding a set of matching positions', () => {
const text = sampleCode2LF;
const ranges = TextRange.findMatchingRangesForPatterns(
[
RegPat.regExMatchUrls,
RegPat.regExSpellingGuardBlock,
RegPat.regExSpellingGuardLine,
RegPat.regExSpellingGuardNext,
RegPat.regExMatchCommonHexFormats,
],
text
);
expect(rangesToText(text, ranges)).toEqual([
" 7: 14 https://www.google.com?q=typescript';",
" 8: 15 http://www.weirddomain.com?key=jdhehdjsiijdkejshaijncjfhe';",
' 9: 22 #cccd',
' 10: 19 0x5612abcd',
' 11: 28 0xbadc0ffee',
" 13: 1 const badspelling = 'disable'; // spell-checker:disable-line, yes all of it.",
" 15: 4 cspell:disable-next\nconst verybadspelling = 'disable';",
" 19: 4 spell-checker:disable\nconst unicodeHexValue = '\\uBABC';\nconst unicodeHexValue2 = '\\x{abcd}';\n\n// spell-checker:enable",
' 29: 4 spell-checker:disable */\n\n// nested disabled checker is not supported.\n\n// spell-checker:disable\n\n// nested spell-checker:enable',
' 67: 4 cSpell:disable\n\nNot checked.\n\n',
]);
});

test('tests finding a set of matching positions CRLF', () => {
const text = sampleCode2CRLF;
const ranges = TextRange.findMatchingRangesForPatterns(
[
RegPat.regExMatchUrls,
RegPat.regExSpellingGuardBlock,
RegPat.regExSpellingGuardLine,
RegPat.regExSpellingGuardNext,
RegPat.regExMatchCommonHexFormats,
],
text
);
expect(rangesToText(text, ranges)).toEqual([
" 7: 14 https://www.google.com?q=typescript';",
" 8: 15 http://www.weirddomain.com?key=jdhehdjsiijdkejshaijncjfhe';",
' 9: 22 #cccd',
' 10: 19 0x5612abcd',
' 11: 28 0xbadc0ffee',
" 13: 1 const badspelling = 'disable'; // spell-checker:disable-line, yes all of it.",
" 15: 4 cspell:disable-next\r\nconst verybadspelling = 'disable';",
" 19: 4 spell-checker:disable\r\nconst unicodeHexValue = '\\uBABC';\r\nconst unicodeHexValue2 = '\\x{abcd}';\r\n\r\n// spell-checker:enable",
' 29: 4 spell-checker:disable */\r\n\r\n// nested disabled checker is not supported.\r\n\r\n' +
'// spell-checker:disable\r\n\r\n// nested spell-checker:enable',
' 67: 4 cSpell:disable\r\n\r\nNot checked.\r\n\r\n',
]);
});

test('tests merging inclusion and exclusion patterns into an inclusion list', () => {
const text = sampleCode2LF;
const includeRanges = TextRange.findMatchingRangesForPatterns(
[RegPat.regExString, RegPat.regExPhpHereDoc, RegPat.regExCStyleComments],
text
);
const excludeRanges = TextRange.findMatchingRangesForPatterns(
[
RegPat.regExSpellingGuardBlock,
RegPat.regExSpellingGuardLine,
RegPat.regExSpellingGuardNext,
RegPat.regExMatchUrls,
RegPat.regExMatchCommonHexFormats,
],
text
);
const mergedRanges = TextRange.excludeRanges(includeRanges, excludeRanges);
expect(rangesToText(text, mergedRanges)).toEqual([
' 2: 1 /*\n * this is a comment.\n */',
" 6: 14 'some nice text goes here'",
" 7: 13 '",
" 8: 14 '",
" 9: 21 '",
" 9: 27 '",
' 14: 1 // But not this line',
' 15: 1 // ',
' 17: 1 // And not this line',
' 19: 1 // ',
' 25: 1 /* More code and comments */',
' 27: 1 // Make sure /* this works.',
' 29: 1 /* ',
' 35: 31 <--> checking is now turned on.',
' 37: 1 // This will be checked',
' 39: 1 /*\n * spell-checker:enable <-- this makes no difference because it was already turned back on.\n */',
" 43: 12 ''",
" 45: 13 ' '",
" 48: 17 'This is a single quote string. it\\'s a lot of fun.'",
' 49: 17 "How about a double quote string?"',
' 50: 24 `\ncan contain " and \'\n\n `',
' 55: 21 <<<SQL\n SELECT * FROM users WHERE id in :ids;\nSQL;',
' 59: 21 <<<"SQL"\n SELECT * FROM users WHERE id in :ids;\nSQL;',
" 63: 20 <<<'SQL'\n SELECT * FROM users WHERE id in :ids;\nSQL;",
' 67: 1 // ',
]);
});

test('tests merging inclusion and exclusion patterns into an inclusion list CRLF', () => {
const text = sampleCode2CRLF;
const includeRanges = TextRange.findMatchingRangesForPatterns(
[RegPat.regExString, RegPat.regExPhpHereDoc, RegPat.regExCStyleComments],
text
);
const excludeRanges = TextRange.findMatchingRangesForPatterns(
[
RegPat.regExSpellingGuardBlock,
RegPat.regExSpellingGuardLine,
RegPat.regExSpellingGuardNext,
RegPat.regExMatchUrls,
RegPat.regExMatchCommonHexFormats,
],
text
);
const mergedRanges = TextRange.excludeRanges(includeRanges, excludeRanges);
expect(rangesToText(text, mergedRanges)).toEqual([
' 2: 1 /*\r\n * this is a comment.\r\n */',
" 6: 14 'some nice text goes here'",
" 7: 13 '",
" 8: 14 '",
" 9: 21 '",
" 9: 27 '",
' 14: 1 // But not this line',
' 15: 1 // ',
' 17: 1 // And not this line',
' 19: 1 // ',
' 25: 1 /* More code and comments */',
' 27: 1 // Make sure /* this works.',
' 29: 1 /* ',
' 35: 31 <--> checking is now turned on.',
' 37: 1 // This will be checked',
' 39: 1 /*\r\n * spell-checker:enable <-- this makes no difference because it was already turned back on.\r\n */',
" 43: 12 ''",
" 45: 13 ' '",
" 48: 17 'This is a single quote string. it\\'s a lot of fun.'",
' 49: 17 "How about a double quote string?"',
' 50: 24 `\r\ncan contain " and \'\r\n\r\n `',
' 55: 21 <<<SQL\r\n SELECT * FROM users WHERE id in :ids;\r\nSQL;',
' 59: 21 <<<"SQL"\r\n SELECT * FROM users WHERE id in :ids;\r\nSQL;',
" 63: 20 <<<'SQL'\r\n SELECT * FROM users WHERE id in :ids;\r\nSQL;",
' 67: 1 // ',
]);
});

test('tests finding matching positions', () => {
const text = sampleCode2LF;
const urls = TextRange.findMatchingRanges(matchUrl, text);
expect(urls.length).toBe(2);

const hexRanges = TextRange.findMatchingRanges(matchHexValues, text);
expect(hexRanges.length).toBe(5);
expect(hexRanges[2].startPos).toBe(text.indexOf('0xbadc0ffee'));

const disableChecker = TextRange.findMatchingRangesForPatterns(
[RegPat.regExSpellingGuardBlock, RegPat.regExSpellingGuardLine, RegPat.regExSpellingGuardNext],
text
);
expect(disableChecker.length).toBe(5);

const hereDocs = TextRange.findMatchingRanges(RegPat.regExPhpHereDoc, text);
expect(hereDocs.length).toBe(3);

const ranges = TextRange.findMatchingRanges(RegPat.regExString, text);
expect(rangesToText(text, ranges)).toEqual([
" 6: 14 'some nice text goes here'",
" 7: 13 'https://www.google.com?q=typescript'",
" 8: 14 'http://www.weirddomain.com?key=jdhehdjsiijdkejshaijncjfhe'",
" 9: 21 '#cccd'",
" 13: 21 'disable'",
" 16: 25 'disable'",
" 20: 25 '\\uBABC'",
" 21: 26 '\\x{abcd}'",
" 43: 12 ''",
" 45: 13 ' '",
" 48: 17 'This is a single quote string. it\\'s a lot of fun.'",
' 49: 17 "How about a double quote string?"',
' 50: 24 `\ncan contain " and \'\n\n `',
' 59: 24 "SQL"',
" 63: 23 'SQL'",
]);
});

test('tests finding matching positions CRLF', () => {
const text = sampleCode2CRLF;
const urls = TextRange.findMatchingRanges(matchUrl, text);
expect(urls.length).toBe(2);

const hexRanges = TextRange.findMatchingRanges(matchHexValues, text);
expect(hexRanges.length).toBe(5);
expect(hexRanges[2].startPos).toBe(text.indexOf('0xbadc0ffee'));

const disableChecker = TextRange.findMatchingRangesForPatterns(
[RegPat.regExSpellingGuardBlock, RegPat.regExSpellingGuardLine, RegPat.regExSpellingGuardNext],
text
);
expect(disableChecker.length).toBe(5);

const hereDocs = TextRange.findMatchingRanges(RegPat.regExPhpHereDoc, text);
expect(hereDocs.length).toBe(3);

const ranges = TextRange.findMatchingRanges(RegPat.regExString, text);
expect(rangesToText(text, ranges)).toEqual([
" 6: 14 'some nice text goes here'",
" 7: 13 'https://www.google.com?q=typescript'",
" 8: 14 'http://www.weirddomain.com?key=jdhehdjsiijdkejshaijncjfhe'",
" 9: 21 '#cccd'",
" 13: 21 'disable'",
" 16: 25 'disable'",
" 20: 25 '\\uBABC'",
" 21: 26 '\\x{abcd}'",
" 43: 12 ''",
" 45: 13 ' '",
" 48: 17 'This is a single quote string. it\\'s a lot of fun.'",
' 49: 17 "How about a double quote string?"',
' 50: 24 `\r\ncan contain " and \'\r\n\r\n `',
' 59: 24 "SQL"',
" 63: 23 'SQL'",
]);
});
});

function rangesToText(text: string, ranges: TextRange.MatchRange[]): string[] {
const textOffsets: TextOffset[] = TextRange.extractRangeText(text, ranges).map((r) => ({
offset: r.startPos,
text: r.text,
}));
const offsets = calculateTextDocumentOffsets('', text, textOffsets);
return offsets.map((t) => `${t.row.toString().padStart(3)}:${t.col.toString().padStart(3)} ${t.text}`);
}

const sampleCodeSrc = `
/*
* this is a comment.\r
*/
const text = 'some nice text goes here';
const url = 'https://www.google.com?q=typescript';
const url2 = 'http://www.weirddomain.com?key=jdhehdjsiijdkejshaijncjfhe';
const cssHexValue = '#cccd';
const cHexValue = 0x5612abcd;
const cHexValueBadCoffee = 0xbadc0ffee;
const badspelling = 'disable'; // spell-checker:disable-line, yes all of it.
// But not this line
// cspell:disable-next
const verybadspelling = 'disable';
// And not this line
// spell-checker:disable
const unicodeHexValue = '\\uBABC';
const unicodeHexValue2 = '\\x{abcd}';
// spell-checker:enable
/* More code and comments */
// Make sure /* this works.
/* spell-checker:disable */
// nested disabled checker is not supported.
// spell-checker:disable
// nested spell-checker:enable <--> checking is now turned on.
// This will be checked
/*
* spell-checker:enable <-- this makes no difference because it was already turned back on.
*/
let text = '';
for (let i = 0; i < 99; ++i) {
text += ' ' + i;
}
const string1 = 'This is a single quote string. it\\'s a lot of fun.'
const string2 = "How about a double quote string?";
const templateString = \`
can contain " and '
\`;
$phpHereDocString = <<<SQL
SELECT * FROM users WHERE id in :ids;
SQL;
$phpHereDocString = <<<"SQL"
SELECT * FROM users WHERE id in :ids;
SQL;
$phpNowDocString = <<<'SQL'
SELECT * FROM users WHERE id in :ids;
SQL;
// cSpell:disable
Not checked.
`;

const sampleCode2LF = sampleCodeSrc.replace(/\r?\n/g, '\n');
const sampleCode2CRLF = sampleCode2LF.replace(/\n/g, '\r\n');
Loading

0 comments on commit 4544c25

Please sign in to comment.