first commit
This commit is contained in:
398
node_modules/regex/src/utils.js
generated
vendored
Normal file
398
node_modules/regex/src/utils.js
generated
vendored
Normal file
@@ -0,0 +1,398 @@
|
||||
import {Pattern, pattern} from './pattern.js';
|
||||
import {Context, forEachUnescaped, replaceUnescaped} from 'regex-utilities';
|
||||
|
||||
/**
Context names reported as `regexContext` by `getEndContextForIncompleteExpression`. Every key maps
to a string with the same spelling.
*/
const RegexContext = {
  DEFAULT: 'DEFAULT',
  // Inside `[…]`; the position within the class is tracked via `CharClassContext`
  CHAR_CLASS: 'CHAR_CLASS',
  // After an opening `\p{` or `\P{`
  ENCLOSED_P: 'ENCLOSED_P',
  // After an opening `\u{`
  ENCLOSED_U: 'ENCLOSED_U',
  // After an opening `(?<`, `\g<`, or `\k<`
  GROUP_NAME: 'GROUP_NAME',
  // After a `{` that doesn't belong to one of the enclosed tokens above
  INTERVAL_QUANTIFIER: 'INTERVAL_QUANTIFIER',
  // Directly after an incomplete `\`, `\c`, `\u`, or `\x` token
  INVALID_INCOMPLETE_TOKEN: 'INVALID_INCOMPLETE_TOKEN',
};

/**
Context names reported as `charClassContext` by `getEndContextForIncompleteExpression`. Every key
maps to a string with the same spelling.
*/
const CharClassContext = {
  DEFAULT: 'DEFAULT',
  // After an opening `\p{` or `\P{`
  ENCLOSED_P: 'ENCLOSED_P',
  // After an opening `\q{`
  ENCLOSED_Q: 'ENCLOSED_Q',
  // After an opening `\u{`
  ENCLOSED_U: 'ENCLOSED_U',
  // Directly after an incomplete `\`, `\c`, `\u`, or `\x` token
  INVALID_INCOMPLETE_TOKEN: 'INVALID_INCOMPLETE_TOKEN',
  // Directly after a single `-`
  RANGE: 'RANGE',
};

// Regex contexts that sit inside a `{…}`-enclosed token
const enclosedTokenRegexContexts = new Set([RegexContext.ENCLOSED_P, RegexContext.ENCLOSED_U]);

// Character class contexts that sit inside a `{…}`-enclosed token
const enclosedTokenCharClassContexts = new Set([
  CharClassContext.ENCLOSED_P,
  CharClassContext.ENCLOSED_Q,
  CharClassContext.ENCLOSED_U,
]);
|
||||
|
||||
/**
Feature probe: reports whether the running engine can compile the given pattern and flags.
@param {string} source
@param {string} [flags]
@returns {boolean}
*/
function canCompileRegExp(source, flags) {
  try {
    new RegExp(source, flags);
    return true;
  } catch {
    return false;
  }
}

// Whether the environment accepts flag groups such as `(?i:)`
const envSupportsFlagGroups = canCompileRegExp('(?i:)');

// Whether the environment accepts flag v
const envSupportsFlagV = canCompileRegExp('', 'v');
|
||||
|
||||
// Characters that are escaped or sandboxed in character classes because doubling them is special
// under flag v
const doublePunctuatorChars = '&!#$%*+,.:;<=>?@^`~';

// Regex source for the opening of a named capturing group; `(?<=` and `(?<!` are excluded. The
// group's name is exposed as `captureName`
const namedCapturingDelim = String.raw`\(\?<(?![=!])(?<captureName>[^>]+)>`;

// Regex source for a `(` that isn't followed by `?`, excluding the `(` that directly follows `(?`
// in `(?(DEFINE)`
const unnamedCapturingDelim = String.raw`\((?!\?)(?!(?<=\(\?\()DEFINE\))`;

// Regex source for the opening of any capturing group, named or not
const capturingDelim = `${unnamedCapturingDelim}|${namedCapturingDelim}`;
|
||||
|
||||
/**
Adds `precedingCaptures` to the number of every numbered backreference (`\1`, `\2`, …) that
`replaceUnescaped` finds in the default context of the expression.
@param {string} expression
@param {number} precedingCaptures
@returns {string}
*/
function adjustNumberedBackrefs(expression, precedingCaptures) {
  const numberedBackref = String.raw`\\(?<num>[1-9]\d*)`;
  const shiftBackref = (match) => {
    const shifted = Number(match.groups.num) + precedingCaptures;
    return `\\${shifted}`;
  };
  return replaceUnescaped(expression, numberedBackref, shiftBackref, Context.DEFAULT);
}
|
||||
|
||||
// Names of the properties of strings as of ES2024, joined into a regex alternation
const stringPropertyNames = [
  'Basic_Emoji',
  'Emoji_Keycap_Sequence',
  'RGI_Emoji_Modifier_Sequence',
  'RGI_Emoji_Flag_Sequence',
  'RGI_Emoji_Tag_Sequence',
  'RGI_Emoji_ZWJ_Sequence',
  'RGI_Emoji',
].join('|');

// Tokenizer for the contents of a character class, used by `containsCharClassUnion`. Whitespace in
// the template is for readability only and is stripped before compiling. Named groups flag the
// two tokens that always imply a union: `pStrProp` (a `\p{…}` property of strings) and `qStrProp`
// (`\q`)
const charClassUnionToken = new RegExp(String.raw`
  \\(?: c[A-Za-z]
    | p\{(?<pStrProp>${stringPropertyNames})\}
    | [pP]\{[^\}]+\}
    | (?<qStrProp>q)
    | u(?:[A-Fa-f\d]{4}|\{[A-Fa-f\d]+\})
    | x[A-Fa-f\d]{2}
    | .
  )
  | --
  | &&
  | .
`.replace(/\s+/g, ''), 'gsu');
|
||||
|
||||
/**
Reports whether the contents of a character class contain a union. Assumes flag v and doesn't
worry about syntax errors that are caught by it.

Returns `true` if the pattern contains any of:
- `\p` (lowercase only) with the name of a property of strings (case sensitive).
- `\q`.
- Two single-char-matching tokens in sequence.
- One single-char-matching token followed immediately by unescaped `[`.
- One single-char-matching token preceded immediately by unescaped `]`.
Otherwise returns `false`.

A range with `-` counts as a single token, as does subtraction or intersection with `--` or `&&`.
Any number of nested classes is supported.
@param {string} charClassPattern
@returns {boolean}
*/
function containsCharClassUnion(charClassPattern) {
  // Whether a single-char-matching token has been seen since the last operator
  let sawOperand = false;
  let prevToken;
  for (const match of charClassPattern.matchAll(charClassUnionToken)) {
    const token = match[0];
    const {pStrProp, qStrProp} = match.groups;
    if (pStrProp || qStrProp) {
      return true;
    }
    if (sawOperand && token === '[') {
      return true;
    }
    switch (token) {
      case '-':
      case '--':
      case '&&':
        // The operand on the other side of the operator joins the same token
        sawOperand = false;
        break;
      case '[':
      case ']':
        break;
      default:
        if (sawOperand || prevToken === ']') {
          return true;
        }
        sawOperand = true;
    }
    prevToken = token;
  }
  return false;
}
|
||||
|
||||
/**
Counts the capturing group openings (named or not) that `forEachUnescaped` finds in the default
context of the expression.
@param {string} expression
@returns {number}
*/
function countCaptures(expression) {
  let total = 0;
  const tally = () => total++;
  forEachUnescaped(expression, capturingDelim, tally, Context.DEFAULT);
  return total;
}
|
||||
|
||||
/**
Escape special characters for the given context, assuming flag v.
@param {string} str String to escape
@param {'DEFAULT' | 'CHAR_CLASS'} context `Context` option from lib `regex-utilities`
@returns {string} Escaped string
*/
function escapeV(str, context) {
  // In a character class, every double punctuator is escaped (including ^, which is special on
  // its own in the first position) in case it's bordered by the same character in or outside of
  // the escaped string
  const specialChars = context === Context.CHAR_CLASS ?
    new RegExp(String.raw`[()\[\]{}|\\/\-${doublePunctuatorChars}]`, 'g') :
    /[()\[\]{}|\\^$*+?.]/g;
  return str.replace(specialChars, '\\$&');
}
|
||||
|
||||
/**
Look for characters that would change the meaning of subsequent tokens outside an interpolated
value.
@param {string} expression
@param {string} regexContext One of the `RegexContext` values
@param {string} charClassContext One of the `CharClassContext` values
@returns {string} The offending character, or an empty string if none was found
*/
function getBreakoutChar(expression, regexContext, charClassContext) {
  const withoutEscapes = expression.replace(/\\./gsu, '');
  // Trailing unescaped `\`; checking `.includes('\\')` would also work
  if (withoutEscapes.endsWith('\\')) {
    return '\\';
  }

  if (regexContext === RegexContext.DEFAULT) {
    // Unbalanced `[` or `]` are also errors but don't breakout; they're caught by the wrapper
    return getUnbalancedChar(withoutEscapes, '(', ')');
  }

  const inEnclosedCharClassToken = enclosedTokenCharClassContexts.has(charClassContext);
  if (regexContext === RegexContext.CHAR_CLASS && !inEnclosedCharClassToken) {
    return getUnbalancedChar(withoutEscapes, '[', ']');
  }

  const isClosedByBrace =
    regexContext === RegexContext.INTERVAL_QUANTIFIER ||
    enclosedTokenRegexContexts.has(regexContext) ||
    inEnclosedCharClassToken;
  if (isClosedByBrace) {
    return withoutEscapes.includes('}') ? '}' : '';
  }

  if (regexContext === RegexContext.GROUP_NAME && withoutEscapes.includes('>')) {
    return '>';
  }
  return '';
}
|
||||
|
||||
// Tokenizer used by `getEndContextForIncompleteExpression`. Whitespace in the template is for
// readability only and is stripped before compiling. Named groups mark the tokens that change
// context:
// - `groupN`: `(?<` (but not `(?<=` or `(?<!`), `\g<`, or `\k<`.
// - `enclosedPU`: `\p{`, `\P{`, or `\u{`.
// - `enclosedQ`: `\q{`.
// - `intervalQ`: any other `{`.
// - `incompleteT`: `\` at the end of the string, or an incomplete `\c`, `\u`, or `\x` token.
const contextToken = new RegExp(String.raw`
  (?<groupN>\(\?<(?![=!])|\\[gk]<)
  | (?<enclosedPU>\\[pPu]\{)
  | (?<enclosedQ>\\q\{)
  | (?<intervalQ>\{)
  | (?<incompleteT>\\(?: $
      | c(?![A-Za-z])
      | u(?![A-Fa-f\d]{4})[A-Fa-f\d]{0,3}
      | x(?![A-Fa-f\d]{2})[A-Fa-f\d]?
    )
  )
  | --
  | \\?.
`.replace(/\s+/g, ''), 'gsu');
|
||||
|
||||
/**
@typedef {{
  regexContext: string;
  charClassContext: string;
  charClassDepth: number;
  lastPos: number;
}} RunningContext
*/
/**
Works out which context the end of an incomplete expression is in. Accepts and returns its full
state so it doesn't have to reprocess parts that have already been seen: scanning starts at
`runningContext.lastPos`, and the returned `lastPos` is the length of the given expression. Assumes
flag v and doesn't worry about syntax errors that are caught by it.
@param {string} incompleteExpression
@param {Partial<RunningContext>} [runningContext] State to resume from; missing fields fall back
  to the default context, depth `0`, and position `0`
@returns {RunningContext}
*/
function getEndContextForIncompleteExpression(incompleteExpression, runningContext) {
  const start = {
    regexContext: RegexContext.DEFAULT,
    charClassContext: CharClassContext.DEFAULT,
    charClassDepth: 0,
    lastPos: 0,
    ...runningContext,
  };
  let {regexContext, charClassContext, charClassDepth} = start;

  // The tokenizer is shared and global, so point it at the resume position before scanning
  contextToken.lastIndex = start.lastPos;
  for (
    let match = contextToken.exec(incompleteExpression);
    match;
    match = contextToken.exec(incompleteExpression)
  ) {
    const token = match[0];
    const {groupN, enclosedPU, enclosedQ, intervalQ, incompleteT} = match.groups;
    const inCharClass = regexContext === RegexContext.CHAR_CLASS;

    if (token === '[') {
      charClassDepth++;
      regexContext = RegexContext.CHAR_CLASS;
      charClassContext = CharClassContext.DEFAULT;
      continue;
    }

    if (inCharClass && token === ']') {
      if (charClassDepth) {
        charClassDepth--;
      }
      // Only the outermost `]` leaves the character class
      if (!charClassDepth) {
        regexContext = RegexContext.DEFAULT;
      }
      charClassContext = CharClassContext.DEFAULT;
      continue;
    }

    if (inCharClass) {
      const closesEnclosedToken = token === '}' && enclosedTokenCharClassContexts.has(charClassContext);
      // These contexts only last for one token, and we've advanced another token
      const wasOneTokenContext =
        charClassContext === CharClassContext.INVALID_INCOMPLETE_TOKEN ||
        charClassContext === CharClassContext.RANGE;
      if (incompleteT) {
        charClassContext = CharClassContext.INVALID_INCOMPLETE_TOKEN;
      } else if (token === '-') {
        charClassContext = CharClassContext.RANGE;
      } else if (enclosedPU) {
        charClassContext = token[1] === 'u' ? CharClassContext.ENCLOSED_U : CharClassContext.ENCLOSED_P;
      } else if (enclosedQ) {
        charClassContext = CharClassContext.ENCLOSED_Q;
      } else if (closesEnclosedToken || wasOneTokenContext) {
        charClassContext = CharClassContext.DEFAULT;
      }
      continue;
    }

    const closesGroupName = token === '>' && regexContext === RegexContext.GROUP_NAME;
    const closesBracedToken = token === '}' && (
      regexContext === RegexContext.INTERVAL_QUANTIFIER ||
      enclosedTokenRegexContexts.has(regexContext)
    );
    // This context only lasts for one token, and we've advanced another token
    const wasIncompleteToken = regexContext === RegexContext.INVALID_INCOMPLETE_TOKEN;
    if (incompleteT) {
      regexContext = RegexContext.INVALID_INCOMPLETE_TOKEN;
    } else if (groupN) {
      regexContext = RegexContext.GROUP_NAME;
    } else if (enclosedPU) {
      regexContext = token[1] === 'u' ? RegexContext.ENCLOSED_U : RegexContext.ENCLOSED_P;
    } else if (intervalQ) {
      regexContext = RegexContext.INTERVAL_QUANTIFIER;
    } else if (closesGroupName || closesBracedToken || wasIncompleteToken) {
      regexContext = RegexContext.DEFAULT;
    }
  }

  return {
    regexContext,
    charClassContext,
    charClassDepth,
    lastPos: incompleteExpression.length,
  };
}
|
||||
|
||||
/**
Finds the first delimiter that leaves `leftChar`/`rightChar` pairs unbalanced in the expression.
There's no special handling for escaped versions of the characters; callers in this file strip
escapes first.

Scans the string directly rather than compiling a regex for the two delimiters on every call.
@param {string} expression
@param {string} leftChar Opening delimiter; expected to be a single character
@param {string} rightChar Closing delimiter; expected to be a single character
@returns {string} `rightChar` as soon as one appears without a matching opener, else `leftChar` if
  any opener is left unclosed, else an empty string
*/
function getUnbalancedChar(expression, leftChar, rightChar) {
  let numOpen = 0;
  for (const char of expression) {
    // The opener is checked first so that identical delimiters always count as openers
    if (char === leftChar) {
      numOpen++;
    } else if (char === rightChar) {
      numOpen--;
      if (numOpen < 0) {
        return rightChar;
      }
    }
  }
  return numOpen > 0 ? leftChar : '';
}
|
||||
|
||||
/**
@typedef {import('./regex.js').InterpolatedValue} InterpolatedValue
@typedef {import('./regex.js').RawTemplate} RawTemplate
@typedef {import('./regex.js').RegexTagOptions} RegexTagOptions
@typedef {(
  value: InterpolatedValue,
  runningContext: RunningContext,
  options: Required<RegexTagOptions>
) => {
  transformed: string;
  runningContext: RunningContext;
}} Preprocessor
*/
/**
Returns transformed versions of a template and substitutions, using the given preprocessor. Only
processes substitutions that are instanceof `Pattern`; other substitutions are passed through
untouched. The running context returned by each preprocessor call is handed to the next call, with
`lastPos` reset to `0`.
@param {RawTemplate} template
@param {ReadonlyArray<InterpolatedValue>} substitutions
@param {Preprocessor} preprocessor
@param {Required<RegexTagOptions>} options
@returns {{template: RawTemplate; substitutions: ReadonlyArray<InterpolatedValue>;}}
*/
function preprocess(template, substitutions, preprocessor, options) {
  const transformedRaw = [];
  const transformedSubstitutions = [];
  let runningContext;

  // Runs one value through the preprocessor, carrying the running context forward
  const transform = (value) => {
    const {transformed, runningContext: nextContext} = preprocessor(
      value,
      {...runningContext, lastPos: 0},
      options
    );
    runningContext = nextContext;
    return transformed;
  };

  for (const [index, raw] of template.raw.entries()) {
    transformedRaw.push(transform(raw));
    // Every raw string except the last is followed by a substitution
    if (index >= template.raw.length - 1) {
      continue;
    }
    const substitution = substitutions[index];
    transformedSubstitutions.push(
      substitution instanceof Pattern ? pattern(transform(substitution)) : substitution
    );
  }

  return {
    template: {raw: transformedRaw},
    substitutions: transformedSubstitutions,
  };
}
|
||||
|
||||
/**
Sandbox `^` if relevant, done so it can't change the meaning of the surrounding character class if
we happen to be at the first position. A leading `^` becomes `\^^`; any other string is returned
unchanged. See `sandboxLoneDoublePunctuatorChar` for more details.
@param {string} str
@returns {string}
*/
function sandboxLoneCharClassCaret(str) {
  if (!str.startsWith('^')) {
    return str;
  }
  return `\\^^${str.slice(1)}`;
}
|
||||
|
||||
/**
Sandbox a leading double punctuator character without escaping it, by repeating the character and
escaping only the first one. The second one is so that, if followed by the same symbol, the
resulting double punctuator will still throw as expected. Details:
- Only need to check the first position because, if it's part of an implicit union, interpolation
  handling will wrap it in nested `[…]`.
- Can't just wrap in nested `[…]` here, since the value might be used in a range.
- Can't add a second unescaped symbol if a lone symbol is the entire string because it might be
  followed by the same unescaped symbol outside an interpolation, and since it won't be wrapped,
  the second symbol wouldn't be sandboxed from the one following it.

Strings that start with an already-doubled punctuator, or with any other character, are returned
unchanged.
@param {string} str
@returns {string}
*/
function sandboxLoneDoublePunctuatorChar(str) {
  const first = str[0];
  const startsWithLonePunctuator =
    first !== undefined &&
    doublePunctuatorChars.includes(first) &&
    str[1] !== first;
  if (!startsWithLonePunctuator) {
    return str;
  }
  const repeated = str.length === 1 ? '' : first;
  return `\\${first}${repeated}${str.slice(1)}`;
}
|
||||
|
||||
/**
Converts `\0` tokens to `\x00` in the given context.

regex`[\0${0}]` and regex`[${pattern`\0`}0]` can't be guarded against via nested `[…]` sandboxing
in character classes if the interpolated value doesn't contain union (since it might be placed on
a range boundary). So escape `\0` in character classes as `\x00`.
@param {string} str
@param {'DEFAULT' | 'CHAR_CLASS'} [context] `Context` option from lib `regex-utilities`
@returns {string}
*/
function sandboxUnsafeNulls(str, context) {
  // `\0` that isn't followed by another digit
  const nullToken = String.raw`\\0(?!\d)`;
  const safeNull = '\\x00';
  return replaceUnescaped(str, nullToken, safeNull, context);
}
|
||||
|
||||
export {
|
||||
adjustNumberedBackrefs,
|
||||
capturingDelim,
|
||||
CharClassContext,
|
||||
containsCharClassUnion,
|
||||
countCaptures,
|
||||
doublePunctuatorChars,
|
||||
enclosedTokenCharClassContexts,
|
||||
enclosedTokenRegexContexts,
|
||||
envSupportsFlagGroups,
|
||||
envSupportsFlagV,
|
||||
escapeV,
|
||||
getBreakoutChar,
|
||||
getEndContextForIncompleteExpression,
|
||||
namedCapturingDelim,
|
||||
preprocess,
|
||||
RegexContext,
|
||||
sandboxLoneCharClassCaret,
|
||||
sandboxLoneDoublePunctuatorChar,
|
||||
sandboxUnsafeNulls,
|
||||
};
|
||||
Reference in New Issue
Block a user