Correct regular expression flags scanning for non-BMP characters (#58612)

Co-authored-by: Ron Buckton <ron.buckton@microsoft.com>
This commit is contained in:
graphemecluster 2024-06-05 04:45:00 +08:00 committed by GitHub
parent 8d62e2f175
commit dc1ffb1648
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 120 additions and 33 deletions

View File

@ -282,16 +282,16 @@ const textToToken = new Map(Object.entries({
"`": SyntaxKind.BacktickToken,
}));
const charToRegExpFlag = new Map(Object.entries({
d: RegularExpressionFlags.HasIndices,
g: RegularExpressionFlags.Global,
i: RegularExpressionFlags.IgnoreCase,
m: RegularExpressionFlags.Multiline,
s: RegularExpressionFlags.DotAll,
u: RegularExpressionFlags.Unicode,
v: RegularExpressionFlags.UnicodeSets,
y: RegularExpressionFlags.Sticky,
}));
const charCodeToRegExpFlag = new Map<CharacterCodes, RegularExpressionFlags>([
[CharacterCodes.d, RegularExpressionFlags.HasIndices],
[CharacterCodes.g, RegularExpressionFlags.Global],
[CharacterCodes.i, RegularExpressionFlags.IgnoreCase],
[CharacterCodes.m, RegularExpressionFlags.Multiline],
[CharacterCodes.s, RegularExpressionFlags.DotAll],
[CharacterCodes.u, RegularExpressionFlags.Unicode],
[CharacterCodes.v, RegularExpressionFlags.UnicodeSets],
[CharacterCodes.y, RegularExpressionFlags.Sticky],
]);
const regExpFlagToFirstAvailableLanguageVersion = new Map<RegularExpressionFlags, LanguageFeatureMinimumTarget>([
[RegularExpressionFlags.HasIndices, LanguageFeatureMinimumTarget.RegularExpressionFlagsHasIndices],
@ -394,8 +394,8 @@ function isUnicodeIdentifierPart(code: number, languageVersion: ScriptTarget | u
lookupInUnicodeMap(code, unicodeES5IdentifierPart);
}
function makeReverseMap(source: Map<string, number>): string[] {
const result: string[] = [];
function makeReverseMap<T>(source: Map<T, number>): T[] {
const result: T[] = [];
source.forEach((value, name) => {
result[value] = name;
});
@ -416,16 +416,16 @@ export function stringToToken(s: string): SyntaxKind | undefined {
return textToToken.get(s);
}
const regExpFlagChars = makeReverseMap(charToRegExpFlag);
const regExpFlagCharCodes = makeReverseMap(charCodeToRegExpFlag);
/** @internal */
export function regularExpressionFlagToCharacter(f: RegularExpressionFlags): string | undefined {
return regExpFlagChars[f];
export function regularExpressionFlagToCharacterCode(f: RegularExpressionFlags): CharacterCodes | undefined {
return regExpFlagCharCodes[f];
}
/** @internal */
export function characterToRegularExpressionFlag(c: string): RegularExpressionFlags | undefined {
return charToRegExpFlag.get(c);
export function characterCodeToRegularExpressionFlag(ch: CharacterCodes): RegularExpressionFlags | undefined {
return charCodeToRegExpFlag.get(ch);
}
/** @internal */
@ -2558,27 +2558,28 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
pos++;
let regExpFlags = RegularExpressionFlags.None;
while (true) {
const ch = charCodeChecked(pos);
const ch = codePointChecked(pos);
if (ch === CharacterCodes.EOF || !isIdentifierPart(ch, languageVersion)) {
break;
}
const size = charSize(ch);
if (reportErrors) {
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
const flag = characterCodeToRegularExpressionFlag(ch);
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
}
else if (regExpFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
}
else if (((regExpFlags | flag) & RegularExpressionFlags.AnyUnicodeMode) === RegularExpressionFlags.AnyUnicodeMode) {
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, 1);
error(Diagnostics.The_Unicode_u_flag_and_the_Unicode_Sets_v_flag_cannot_be_set_simultaneously, pos, size);
}
else {
regExpFlags |= flag;
checkRegularExpressionFlagAvailable(flag, pos);
checkRegularExpressionFlagAvailability(flag, size);
}
}
pos++;
pos += size;
}
if (reportErrors) {
scanRange(startOfRegExpBody, endOfRegExpBody - startOfRegExpBody, () => {
@ -2843,25 +2844,26 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
function scanPatternModifiers(currFlags: RegularExpressionFlags): RegularExpressionFlags {
while (true) {
const ch = charCodeChecked(pos);
const ch = codePointChecked(pos);
if (ch === CharacterCodes.EOF || !isIdentifierPart(ch, languageVersion)) {
break;
}
const flag = characterToRegularExpressionFlag(String.fromCharCode(ch));
const size = charSize(ch);
const flag = characterCodeToRegularExpressionFlag(ch);
if (flag === undefined) {
error(Diagnostics.Unknown_regular_expression_flag, pos, 1);
error(Diagnostics.Unknown_regular_expression_flag, pos, size);
}
else if (currFlags & flag) {
error(Diagnostics.Duplicate_regular_expression_flag, pos, 1);
error(Diagnostics.Duplicate_regular_expression_flag, pos, size);
}
else if (!(flag & RegularExpressionFlags.Modifiers)) {
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, 1);
error(Diagnostics.This_regular_expression_flag_cannot_be_toggled_within_a_subpattern, pos, size);
}
else {
currFlags |= flag;
checkRegularExpressionFlagAvailable(flag, pos);
checkRegularExpressionFlagAvailability(flag, size);
}
pos++;
pos += size;
}
return currFlags;
}
@ -3583,10 +3585,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
});
}
function checkRegularExpressionFlagAvailable(flag: RegularExpressionFlags, pos: number) {
function checkRegularExpressionFlagAvailability(flag: RegularExpressionFlags, size: number) {
const availableFrom = regExpFlagToFirstAvailableLanguageVersion.get(flag) as ScriptTarget | undefined;
if (availableFrom && languageVersion < availableFrom) {
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, 1, getNameOfScriptTarget(availableFrom));
error(Diagnostics.This_regular_expression_flag_is_only_available_when_targeting_0_or_later, pos, size, getNameOfScriptTarget(availableFrom));
}
}

View File

@ -0,0 +1,29 @@
regularExpressionWithNonBMPFlags.ts(7,23): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,25): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,28): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,41): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,43): error TS1499: Unknown regular expression flag.
regularExpressionWithNonBMPFlags.ts(7,45): error TS1499: Unknown regular expression flag.
==== regularExpressionWithNonBMPFlags.ts (6 errors) ====
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.
~~
!!! error TS1499: Unknown regular expression flag.

View File

@ -0,0 +1,20 @@
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////
//// [regularExpressionWithNonBMPFlags.ts]
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
//// [regularExpressionWithNonBMPFlags.js]
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;

View File

@ -0,0 +1,12 @@
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////
=== regularExpressionWithNonBMPFlags.ts ===
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
>𝘳𝘦𝘨𝘦𝘹 : Symbol(𝘳𝘦𝘨𝘦𝘹, Decl(regularExpressionWithNonBMPFlags.ts, 6, 5))

View File

@ -0,0 +1,15 @@
//// [tests/cases/compiler/regularExpressionWithNonBMPFlags.ts] ////
=== regularExpressionWithNonBMPFlags.ts ===
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;
>𝘳𝘦𝘨𝘦𝘹 : RegExp
> : ^^^^^^
>/(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶 : RegExp
> : ^^^^^^

View File

@ -0,0 +1,9 @@
// @target: esnext
// The characters in the following regular expression are ASCII-lookalike characters found in Unicode, including:
// - 𝘴 (U+1D634 Mathematical Sans-Serif Italic Small S)
// - 𝘪 (U+1D62A Mathematical Sans-Serif Italic Small I)
// - 𝘮 (U+1D62E Mathematical Sans-Serif Italic Small M)
//
// See https://en.wikipedia.org/wiki/Mathematical_Alphanumeric_Symbols
const 𝘳𝘦𝘨𝘦𝘹 = /(?𝘴𝘪-𝘮:^𝘧𝘰𝘰.)/𝘨𝘮𝘶;