Skip to content

Commit

Permalink
Report RegExp errors in grammar check, use Annex B grammar (#58295)
Browse files Browse the repository at this point in the history
  • Loading branch information
rbuckton committed Apr 24, 2024
1 parent e6ba82b commit e28ad99
Show file tree
Hide file tree
Showing 20 changed files with 184 additions and 262 deletions.
49 changes: 48 additions & 1 deletion src/compiler/checker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ import {
countWhere,
createBinaryExpressionTrampoline,
createCompilerDiagnostic,
createDetachedDiagnostic,
createDiagnosticCollection,
createDiagnosticForFileFromMessageChain,
createDiagnosticForNode,
Expand All @@ -123,6 +124,7 @@ import {
createPrinterWithRemoveCommentsNeverAsciiEscape,
createPrinterWithRemoveCommentsOmitTrailingSemicolon,
createPropertyNameNodeForIdentifierOrLiteral,
createScanner,
createSymbolTable,
createSyntacticTypeNodeBuilder,
createTextWriter,
Expand Down Expand Up @@ -937,6 +939,7 @@ import {
rangeOfTypeParameters,
ReadonlyKeyword,
reduceLeft,
RegularExpressionLiteral,
RelationComparisonResult,
relativeComplement,
removeExtension,
Expand All @@ -953,6 +956,7 @@ import {
ReverseMappedType,
sameMap,
SatisfiesExpression,
Scanner,
scanTokenAtPosition,
ScriptKind,
ScriptTarget,
Expand Down Expand Up @@ -1446,6 +1450,7 @@ export function createTypeChecker(host: TypeCheckerHost): TypeChecker {
var requestedExternalEmitHelperNames = new Set<string>();
var requestedExternalEmitHelpers: ExternalEmitHelpers;
var externalHelpersModule: Symbol;
var scanner: Scanner | undefined;

var Symbol = objectAllocator.getSymbolConstructor();
var Type = objectAllocator.getTypeConstructor();
Expand Down Expand Up @@ -31353,6 +31358,48 @@ export function createTypeChecker(host: TypeCheckerHost): TypeChecker {
}
}

function checkGrammarRegularExpressionLiteral(node: RegularExpressionLiteral) {
const sourceFile = getSourceFileOfNode(node);
if (!hasParseDiagnostics(sourceFile)) {
let lastError: DiagnosticWithLocation | undefined;
scanner ??= createScanner(ScriptTarget.ESNext, /*skipTrivia*/ true);
scanner.setScriptTarget(sourceFile.languageVersion);
scanner.setLanguageVariant(sourceFile.languageVariant);
scanner.setOnError((message, length, arg0) => {
// emulate `parseErrorAtPosition` from parser.ts
const start = scanner!.getTokenEnd();
if (message.category === DiagnosticCategory.Message && lastError && start === lastError.start && length === lastError.length) {
const error = createDetachedDiagnostic(sourceFile.fileName, sourceFile.text, start, length, message, arg0);
addRelatedInfo(lastError, error);
}
else if (!lastError || start !== lastError.start) {
lastError = createFileDiagnostic(sourceFile, start, length, message, arg0);
diagnostics.add(lastError);
}
});
scanner.setText(sourceFile.text, node.pos, node.end - node.pos);
try {
scanner.scan();
Debug.assert(scanner.reScanSlashToken(/*reportErrors*/ true) === SyntaxKind.RegularExpressionLiteral, "Expected scanner to rescan RegularExpressionLiteral");
return !!lastError;
}
finally {
scanner.setText("");
scanner.setOnError(/*onError*/ undefined);
}
}
return false;
}

function checkRegularExpressionLiteral(node: RegularExpressionLiteral) {
const nodeLinks = getNodeLinks(node);
if (!(nodeLinks.flags & NodeCheckFlags.TypeChecked)) {
nodeLinks.flags |= NodeCheckFlags.TypeChecked;
addLazyDiagnostic(() => checkGrammarRegularExpressionLiteral(node));
}
return globalRegExpType;
}

function checkSpreadExpression(node: SpreadElement, checkMode?: CheckMode): Type {
if (languageVersion < LanguageFeatureMinimumTarget.SpreadElements) {
checkExternalEmitHelpers(node, compilerOptions.downlevelIteration ? ExternalEmitHelpers.SpreadIncludes : ExternalEmitHelpers.SpreadArray);
Expand Down Expand Up @@ -39662,7 +39709,7 @@ export function createTypeChecker(host: TypeCheckerHost): TypeChecker {
case SyntaxKind.TemplateExpression:
return checkTemplateExpression(node as TemplateExpression);
case SyntaxKind.RegularExpressionLiteral:
return globalRegExpType;
return checkRegularExpressionLiteral(node as RegularExpressionLiteral);
case SyntaxKind.ArrayLiteralExpression:
return checkArrayLiteral(node as ArrayLiteralExpression, checkMode, forceTuple);
case SyntaxKind.ObjectLiteralExpression:
Expand Down
107 changes: 68 additions & 39 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ export interface Scanner {
getTokenFlags(): TokenFlags;
reScanGreaterToken(): SyntaxKind;
reScanSlashToken(): SyntaxKind;
/** @internal */
reScanSlashToken(reportErrors?: boolean): SyntaxKind; // eslint-disable-line @typescript-eslint/unified-signatures
reScanAsteriskEqualsToken(): SyntaxKind;
reScanTemplateToken(isTaggedTemplate: boolean): SyntaxKind;
/** @deprecated use {@link reScanTemplateToken}(false) */
Expand Down Expand Up @@ -1484,7 +1486,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
// | [0-3] [0-7] [0-7]?
// | [4-7] [0-7]
// NonOctalDecimalEscapeSequence ::= [89]
function scanEscapeSequence(shouldEmitInvalidEscapeError: boolean, isRegularExpression: boolean): string {
function scanEscapeSequence(shouldEmitInvalidEscapeError: boolean, isRegularExpression: boolean | "annex-b"): string {
const start = pos;
pos++;
if (pos >= end) {
Expand Down Expand Up @@ -1523,7 +1525,9 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
tokenFlags |= TokenFlags.ContainsInvalidEscape;
if (isRegularExpression || shouldEmitInvalidEscapeError) {
const code = parseInt(text.substring(start + 1, pos), 8);
error(Diagnostics.Octal_escape_sequences_are_not_allowed_Use_the_syntax_0, start, pos - start, "\\x" + code.toString(16).padStart(2, "0"));
if (isRegularExpression !== "annex-b") {
error(Diagnostics.Octal_escape_sequences_are_not_allowed_Use_the_syntax_0, start, pos - start, "\\x" + code.toString(16).padStart(2, "0"));
}
return String.fromCharCode(code);
}
return text.substring(start, pos);
Expand Down Expand Up @@ -1559,7 +1563,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
) {
// '\u{DDDDDD}'
pos -= 2;
return scanExtendedUnicodeEscape(isRegularExpression || shouldEmitInvalidEscapeError);
return scanExtendedUnicodeEscape(!!isRegularExpression || shouldEmitInvalidEscapeError);
}
// '\uDDDD'
for (; pos < start + 6; pos++) {
Expand Down Expand Up @@ -1623,7 +1627,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
case CharacterCodes.paragraphSeparator:
return "";
default:
if (isRegularExpression && (shouldEmitInvalidEscapeError || isIdentifierPart(ch, languageVersion))) {
if (isRegularExpression === true && (shouldEmitInvalidEscapeError || isIdentifierPart(ch, languageVersion))) {
error(Diagnostics.This_character_cannot_be_escaped_in_a_regular_expression, pos - 2, 2);
}
return String.fromCharCode(ch);
Expand Down Expand Up @@ -2386,7 +2390,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
return token = SyntaxKind.EqualsToken;
}

function reScanSlashToken(): SyntaxKind {
function reScanSlashToken(reportErrors?: boolean): SyntaxKind {
if (token === SyntaxKind.SlashToken || token === SyntaxKind.SlashEqualsToken) {
// Quickly get to the end of regex such that we know the flags
let p = tokenStart + 1;
Expand Down Expand Up @@ -2444,44 +2448,57 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
if (!isIdentifierPart(ch, languageVersion)) {
break;
}
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, p, 1);
}
else if (regExpFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, p, 1);
}
else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) {
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, p, 1);
}
else {
regExpFlags |= flag;
const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag)!;
if (languageVersion < availableFrom) {
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, p, 1, getNameOfScriptTarget(availableFrom));
if (reportErrors) {
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, p, 1);
}
else if (regExpFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, p, 1);
}
else if (((regExpFlags | flag) & RegularExpressionFlags.UnicodeMode) === RegularExpressionFlags.UnicodeMode) {
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, p, 1);
}
else {
regExpFlags |= flag;
const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag)!;
if (languageVersion < availableFrom) {
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, p, 1, getNameOfScriptTarget(availableFrom));
}
}
}
p++;
}
pos = tokenStart + 1;
const saveTokenPos = tokenStart;
const saveTokenFlags = tokenFlags;
scanRegularExpressionWorker(text, endOfBody, regExpFlags, isUnterminated);
if (!isUnterminated) {
if (reportErrors) {
pos = tokenStart + 1;
const saveTokenPos = tokenStart;
const saveTokenFlags = tokenFlags;
scanRegularExpressionWorker(text, endOfBody, regExpFlags, isUnterminated, /*annexB*/ true);
if (!isUnterminated) {
pos = p;
}
tokenStart = saveTokenPos;
tokenFlags = saveTokenFlags;
}
else {
pos = p;
}
tokenStart = saveTokenPos;
tokenFlags = saveTokenFlags;
tokenValue = text.substring(tokenStart, pos);
token = SyntaxKind.RegularExpressionLiteral;
}
return token;

function scanRegularExpressionWorker(text: string, end: number, regExpFlags: RegularExpressionFlags, isUnterminated: boolean) {
/** Grammar parameter */
const unicodeMode = !!(regExpFlags & RegularExpressionFlags.UnicodeMode);
function scanRegularExpressionWorker(text: string, end: number, regExpFlags: RegularExpressionFlags, isUnterminated: boolean, annexB: boolean) {
/** Grammar parameter */
const unicodeSetsMode = !!(regExpFlags & RegularExpressionFlags.UnicodeSets);
/** Grammar parameter */
const unicodeMode = !!(regExpFlags & RegularExpressionFlags.UnicodeMode);

if (unicodeMode) {
// Annex B treats any unicode mode as the strict syntax.
annexB = false;
}

/** @see {scanClassSetExpression} */
let mayContainStrings = false;

Expand Down Expand Up @@ -2571,7 +2588,8 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
case CharacterCodes.equals:
case CharacterCodes.exclamation:
pos++;
isPreviousTermQuantifiable = false;
// In Annex B, `(?=Disjunction)` and `(?!Disjunction)` are quantifiable
isPreviousTermQuantifiable = annexB;
break;
case CharacterCodes.lessThan:
const groupNameStart = pos;
Expand Down Expand Up @@ -2763,7 +2781,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
default:
// The scanEscapeSequence call in scanCharacterEscape must return non-empty strings
// since there must not be line breaks in a regex literal
Debug.assert(scanCharacterClassEscape() || scanDecimalEscape() || scanCharacterEscape());
Debug.assert(scanCharacterClassEscape() || scanDecimalEscape() || scanCharacterEscape(/*atomEscape*/ true));
break;
}
}
Expand All @@ -2788,7 +2806,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
// IdentityEscape ::=
// | '^' | '$' | '/' | '\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
// | [~UnicodeMode] (any other non-identifier characters)
function scanCharacterEscape(): string {
function scanCharacterEscape(atomEscape: boolean): string {
Debug.assertEqual(text.charCodeAt(pos - 1), CharacterCodes.backslash);
let ch = text.charCodeAt(pos);
switch (ch) {
Expand All @@ -2802,6 +2820,15 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
if (unicodeMode) {
error(Diagnostics.c_must_be_followed_by_an_ASCII_letter, pos - 2, 2);
}
else if (atomEscape && annexB) {
// Annex B treats
//
// ExtendedAtom : `\` [lookahead = `c`]
//
// as the single character `\` when `c` isn't followed by a valid control character
pos--;
return "\\";
}
return String.fromCharCode(ch);
case CharacterCodes.caret:
case CharacterCodes.$:
Expand All @@ -2826,7 +2853,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
return "\\";
}
pos--;
return scanEscapeSequence(/*shouldEmitInvalidEscapeError*/ unicodeMode, /*isRegularExpression*/ true);
return scanEscapeSequence(/*shouldEmitInvalidEscapeError*/ unicodeMode, /*isRegularExpression*/ annexB ? "annex-b" : true);
}
}

Expand Down Expand Up @@ -2873,12 +2900,12 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
if (isClassContentExit(ch)) {
return;
}
if (!minCharacter) {
if (!minCharacter && !annexB) {
error(Diagnostics.A_character_class_range_must_not_be_bounded_by_another_character_class, minStart, pos - 1 - minStart);
}
const maxStart = pos;
const maxCharacter = scanClassAtom();
if (!maxCharacter) {
if (!maxCharacter && !annexB) {
error(Diagnostics.A_character_class_range_must_not_be_bounded_by_another_character_class, maxStart, pos - maxStart);
continue;
}
Expand Down Expand Up @@ -3208,7 +3235,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
pos++;
return String.fromCharCode(ch);
default:
return scanCharacterEscape();
return scanCharacterEscape(/*atomEscape*/ false);
}
}
else if (ch === text.charCodeAt(pos + 1)) {
Expand Down Expand Up @@ -3275,7 +3302,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
if (scanCharacterClassEscape()) {
return "";
}
return scanCharacterEscape();
return scanCharacterEscape(/*atomEscape*/ false);
}
}
else {
Expand Down Expand Up @@ -3407,7 +3434,9 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
}
});
forEach(decimalEscapes, escape => {
if (escape.value > numberOfCapturingGroups) {
// in AnnexB, if a DecimalEscape is greater than the number of capturing groups then it is treated as
// either a LegacyOctalEscapeSequence or IdentityEscape
if (!annexB && escape.value > numberOfCapturingGroups) {
if (numberOfCapturingGroups) {
error(Diagnostics.A_decimal_escape_must_refer_to_an_existent_capturing_group_There_are_only_0_capturing_groups_in_this_regular_expression, escape.pos, escape.end - escape.pos, numberOfCapturingGroups);
}
Expand Down
2 changes: 1 addition & 1 deletion src/testRunner/unittests/incrementalParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ describe("unittests:: Incremental Parser", () => {
const oldText = ts.ScriptSnapshot.fromString(source);
const newTextAndChange = withInsert(oldText, semicolonIndex, "/");

compareTrees(oldText, newTextAndChange.text, newTextAndChange.textChangeRange, 4);
compareTrees(oldText, newTextAndChange.text, newTextAndChange.textChangeRange, 0);
});

it("Regular expression 2", () => {
Expand Down
7 changes: 0 additions & 7 deletions tests/baselines/reference/parserRegularExpression1.errors.txt

This file was deleted.

4 changes: 2 additions & 2 deletions tests/baselines/reference/parserRegularExpression1.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//// [tests/cases/conformance/parser/ecmascript5/RegularExpressions/parserRegularExpression1.ts] ////

//// [parserRegularExpression1.ts]
return /(#?-?\d*\.\d\w*%?)|(@?#?[\w-?]+%?)/g;
/(#?-?\d*\.\d\w*%?)|(@?#?[\w-?]+%?)/g;

//// [parserRegularExpression1.js]
return /(#?-?\d*\.\d\w*%?)|(@?#?[\w-?]+%?)/g;
/(#?-?\d*\.\d\w*%?)|(@?#?[\w-?]+%?)/g;
2 changes: 1 addition & 1 deletion tests/baselines/reference/parserRegularExpression1.symbols
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

=== parserRegularExpression1.ts ===

return /(#?-?\d*\.\d\w*%?)|(@?#?[\w-?]+%?)/g;
/(#?-?\d*\.\d\w*%?)|(@?#?[\w-?]+%?)/g;
Loading

0 comments on commit e28ad99

Please sign in to comment.