Files
Library/node_modules/regex/src/utils.js
2026-01-09 23:05:52 -05:00

399 lines
13 KiB
JavaScript

import {Pattern, pattern} from './pattern.js';
import {Context, forEachUnescaped, replaceUnescaped} from 'regex-utilities';
/**
Names of the contexts tracked for positions outside of character classes (plus `CHAR_CLASS`, which
marks that the position is inside one). Each value is identical to its key. The object is frozen so
that an accidental assignment can't silently change the values that context checks compare against.
@readonly
*/
const RegexContext = Object.freeze({
  DEFAULT: 'DEFAULT',
  CHAR_CLASS: 'CHAR_CLASS',
  ENCLOSED_P: 'ENCLOSED_P',
  ENCLOSED_U: 'ENCLOSED_U',
  GROUP_NAME: 'GROUP_NAME',
  INTERVAL_QUANTIFIER: 'INTERVAL_QUANTIFIER',
  INVALID_INCOMPLETE_TOKEN: 'INVALID_INCOMPLETE_TOKEN',
});
/**
Names of the contexts tracked for positions inside of character classes. Each value is identical to
its key. The object is frozen so that an accidental assignment can't silently change the values that
context checks compare against.
@readonly
*/
const CharClassContext = Object.freeze({
  DEFAULT: 'DEFAULT',
  ENCLOSED_P: 'ENCLOSED_P',
  ENCLOSED_Q: 'ENCLOSED_Q',
  ENCLOSED_U: 'ENCLOSED_U',
  INVALID_INCOMPLETE_TOKEN: 'INVALID_INCOMPLETE_TOKEN',
  RANGE: 'RANGE',
});
// Regex contexts that `getBreakoutChar` treats as closed by an unescaped `}` (alongside
// `RegexContext.INTERVAL_QUANTIFIER`)
const enclosedTokenRegexContexts = new Set([
RegexContext.ENCLOSED_P,
RegexContext.ENCLOSED_U,
]);
// Character class contexts that `getBreakoutChar` treats as closed by an unescaped `}` rather than
// checking for unbalanced `[`/`]`
const enclosedTokenCharClassContexts = new Set([
CharClassContext.ENCLOSED_P,
CharClassContext.ENCLOSED_Q,
CharClassContext.ENCLOSED_U,
]);
// `true` if constructing a `RegExp` from the pattern `(?i:)` doesn't throw in this environment.
// Evaluated once, when the module loads
const envSupportsFlagGroups = (() => {
  let isSupported = true;
  try {
    new RegExp('(?i:)');
  } catch {
    isSupported = false;
  }
  return isSupported;
})();
// `true` if constructing an empty `RegExp` with flag `v` doesn't throw in this environment.
// Evaluated once, when the module loads
const envSupportsFlagV = (() => {
  let isSupported = true;
  try {
    new RegExp('', 'v');
  } catch {
    isSupported = false;
  }
  return isSupported;
})();
// Characters handled as double punctuators by `escapeV` (character class context) and
// `sandboxLoneDoublePunctuatorChar`. Interpolated into character classes in those functions, so the
// order matters: `^` must not come first and there must be no unescaped `-`
const doublePunctuatorChars = '&!#$%*+,.:;<=>?@^`~';
// Regex source for the opening of a named group `(?<name>`; the `(?![=!])` excludes lookbehind
// openers `(?<=` and `(?<!`. The name is captured as `captureName`
const namedCapturingDelim = String.raw`\(\?<(?![=!])(?<captureName>[^>]+)>`;
// Regex source for the opening of any capturing group: either `(` that isn't followed by `?` (and
// isn't the `(` that starts `(DEFINE)` directly after `(?`), or a named group opener
const capturingDelim = String.raw`\((?!\?)(?!(?<=\(\?\()DEFINE\))|${namedCapturingDelim}`;
/**
Shifts every unescaped numbered backreference (`\1`, `\2`, …) found in the default context upward
by the given number of captures.
@param {string} expression
@param {number} precedingCaptures Amount added to each backreference number
@returns {string}
*/
function adjustNumberedBackrefs(expression, precedingCaptures) {
  // `\0` is excluded since the number must start with 1-9
  const numberedBackref = String.raw`\\(?<num>[1-9]\d*)`;
  const renumber = (match) => `\\${Number(match.groups.num) + precedingCaptures}`;
  return replaceUnescaped(expression, numberedBackref, renumber, Context.DEFAULT);
}
// Properties of strings as of ES2024, as a regex alternation
const stringPropertyNames = [
  'Basic_Emoji',
  'Emoji_Keycap_Sequence',
  'RGI_Emoji_Modifier_Sequence',
  'RGI_Emoji_Flag_Sequence',
  'RGI_Emoji_Tag_Sequence',
  'RGI_Emoji_ZWJ_Sequence',
  'RGI_Emoji',
].join('|');
// Alternatives for what can follow a backslash; order matters since the first match wins
const escapedCharClassToken = [
  String.raw`c[A-Za-z]`,
  // Lowercase `\p` with the name of a property of strings
  String.raw`p\{(?<pStrProp>${stringPropertyNames})\}`,
  String.raw`[pP]\{[^\}]+\}`,
  String.raw`(?<qStrProp>q)`,
  String.raw`u(?:[A-Fa-f\d]{4}|\{[A-Fa-f\d]+\})`,
  String.raw`x[A-Fa-f\d]{2}`,
  '.',
].join('|');
// Tokenizer used by `containsCharClassUnion`: an escape sequence, `--`, `&&`, or any single
// character
const charClassUnionToken = new RegExp(
  [String.raw`\\(?:${escapedCharClassToken})`, '--', '&&', '.'].join('|'),
  'gsu'
);
// Reports whether the contents of a character class include an implicit union. Assumes flag v and
// leaves syntax errors to be caught by it. The result is `true` when the pattern contains any of:
// - `\p` (lowercase only) naming a property of strings (case sensitive).
// - `\q`.
// - Two single-char-matching tokens in a row.
// - A single-char-matching token directly followed by unescaped `[`.
// - A single-char-matching token directly preceded by unescaped `]`.
// Otherwise the result is `false`. A range (`-`), subtraction (`--`), or intersection (`&&`) joins
// its operands into a single token. Nested classes are supported to any depth
function containsCharClassUnion(charClassPattern) {
  // Whether a single-char-matching token is waiting to be joined by an operator
  let hasPendingOperand = false;
  let previousToken;
  for (const match of charClassPattern.matchAll(charClassUnionToken)) {
    const token = match[0];
    const {pStrProp, qStrProp} = match.groups;
    if (pStrProp || qStrProp) {
      return true;
    }
    switch (token) {
      case '[':
        if (hasPendingOperand) {
          return true;
        }
        break;
      case ']':
        break;
      case '-':
      case '--':
      case '&&':
        // The operator consumes the pending operand; the next token completes the same unit
        hasPendingOperand = false;
        break;
      default:
        if (hasPendingOperand || previousToken === ']') {
          return true;
        }
        hasPendingOperand = true;
    }
    previousToken = token;
  }
  return false;
}
/**
Counts the unescaped capturing group openers (see `capturingDelim`) that appear in the default
context of the expression.
@param {string} expression
@returns {number}
*/
function countCaptures(expression) {
  let total = 0;
  const tally = () => total++;
  forEachUnescaped(expression, capturingDelim, tally, Context.DEFAULT);
  return total;
}
/**
Backslash-escapes the characters that are special in the given context, assuming flag v.
@param {string} str String to escape
@param {'DEFAULT' | 'CHAR_CLASS'} context `Context` option from lib `regex-utilities`
@returns {string} Escaped string
*/
function escapeV(str, context) {
  const specialChars = context === Context.CHAR_CLASS ?
    // Every double punctuator char is escaped (including ^, which is also special by itself in the
    // first position) since it might sit next to the same character, inside or outside of the
    // escaped string
    new RegExp(String.raw`[()\[\]{}|\\/\-${doublePunctuatorChars}]`, 'g') :
    /[()\[\]{}|\\^$*+?.]/g;
  return str.replace(specialChars, '\\$&');
}
// Finds a character in an interpolated value that would alter how the tokens after the value are
// read. Returns that character, or an empty string if there is none
function getBreakoutChar(expression, regexContext, charClassContext) {
  const unescaped = expression.replace(/\\./gsu, '');
  // With all escaped pairs removed, a trailing `\` is necessarily unescaped (testing
  // `.includes('\\')` would work as well)
  if (unescaped.endsWith('\\')) {
    return '\\';
  }
  if (regexContext === RegexContext.DEFAULT) {
    // Unbalanced `[` or `]` are errors too, but they don't break out; the wrapper catches them
    return getUnbalancedChar(unescaped, '(', ')');
  }
  const inEnclosedCharClassToken = enclosedTokenCharClassContexts.has(charClassContext);
  if (regexContext === RegexContext.CHAR_CLASS && !inEnclosedCharClassToken) {
    return getUnbalancedChar(unescaped, '[', ']');
  }
  const isClosedByBrace =
    regexContext === RegexContext.INTERVAL_QUANTIFIER ||
    enclosedTokenRegexContexts.has(regexContext) ||
    inEnclosedCharClassToken;
  if (isClosedByBrace) {
    return unescaped.includes('}') ? '}' : '';
  }
  if (regexContext === RegexContext.GROUP_NAME && unescaped.includes('>')) {
    return '>';
  }
  return '';
}
// Tokenizer used by `getEndContextForIncompleteExpression`. Alternatives are tried in order, so the
// context-changing tokens come before the generic fallbacks
const contextToken = new RegExp([
  // `(?<` that doesn't start a lookbehind, or the start of `\g<…>` / `\k<…>`
  String.raw`(?<groupN>\(\?<(?![=!])|\\[gk]<)`,
  String.raw`(?<enclosedPU>\\[pPu]\{)`,
  String.raw`(?<enclosedQ>\\q\{)`,
  String.raw`(?<intervalQ>\{)`,
  // A trailing `\`, or a `\c`, `\u`, or `\x` escape that is missing required characters
  String.raw`(?<incompleteT>\\(?:$|c(?![A-Za-z])|u(?![A-Fa-f\d]{4})[A-Fa-f\d]{0,3}|x(?![A-Fa-f\d]{2})[A-Fa-f\d]?))`,
  '--',
  // Any other character, along with a backslash that escapes it
  String.raw`\\?.`,
].join('|'), 'gsu');
/**
@typedef {{
regexContext: string;
charClassContext: string;
charClassDepth: number;
lastPos: number;
}} RunningContext
*/
/**
Determines the context at the end of a partial expression. The complete state is both accepted and
returned, so a later call can resume from `lastPos` without reprocessing what has already been
seen. Assumes flag v and leaves syntax errors to be caught by it.
@param {string} incompleteExpression
@param {Partial<RunningContext>} [runningContext] State returned by an earlier call, if any
@returns {RunningContext}
*/
function getEndContextForIncompleteExpression(incompleteExpression, runningContext) {
  const start = {
    regexContext: RegexContext.DEFAULT,
    charClassContext: CharClassContext.DEFAULT,
    charClassDepth: 0,
    lastPos: 0,
    ...runningContext,
  };
  let {regexContext, charClassContext, charClassDepth} = start;
  // `contextToken` is a shared global regex, so always set where the scan starts
  contextToken.lastIndex = start.lastPos;
  for (
    let token = contextToken.exec(incompleteExpression);
    token;
    token = contextToken.exec(incompleteExpression)
  ) {
    const {0: text, groups} = token;
    const inCharClass = regexContext === RegexContext.CHAR_CLASS;
    if (text === '[') {
      charClassDepth++;
      regexContext = RegexContext.CHAR_CLASS;
      charClassContext = CharClassContext.DEFAULT;
    } else if (inCharClass && text === ']') {
      if (charClassDepth) {
        charClassDepth--;
      }
      // Only the outermost `]` leaves the character class
      if (!charClassDepth) {
        regexContext = RegexContext.DEFAULT;
      }
      charClassContext = CharClassContext.DEFAULT;
    } else if (inCharClass) {
      charClassContext = advanceCharClassContext(charClassContext, text, groups);
    } else {
      regexContext = advanceRegexContext(regexContext, text, groups);
    }
  }
  return {
    regexContext,
    charClassContext,
    charClassDepth,
    lastPos: incompleteExpression.length,
  };
}

// Gives the character class context that follows a token (other than `[` or `]`) found inside a
// character class
function advanceCharClassContext(current, text, {enclosedPU, enclosedQ, incompleteT}) {
  if (incompleteT) {
    return CharClassContext.INVALID_INCOMPLETE_TOKEN;
  }
  if (text === '-') {
    return CharClassContext.RANGE;
  }
  if (enclosedPU) {
    return text[1] === 'u' ? CharClassContext.ENCLOSED_U : CharClassContext.ENCLOSED_P;
  }
  if (enclosedQ) {
    return CharClassContext.ENCLOSED_Q;
  }
  const closesEnclosedToken = text === '}' && enclosedTokenCharClassContexts.has(current);
  // These contexts don't continue, since another token has now been passed
  const isOneTokenContext =
    current === CharClassContext.INVALID_INCOMPLETE_TOKEN ||
    current === CharClassContext.RANGE;
  return closesEnclosedToken || isOneTokenContext ? CharClassContext.DEFAULT : current;
}

// Gives the regex context that follows a token (other than `[`) found outside of character classes
function advanceRegexContext(current, text, {groupN, enclosedPU, intervalQ, incompleteT}) {
  if (incompleteT) {
    return RegexContext.INVALID_INCOMPLETE_TOKEN;
  }
  if (groupN) {
    return RegexContext.GROUP_NAME;
  }
  if (enclosedPU) {
    return text[1] === 'u' ? RegexContext.ENCLOSED_U : RegexContext.ENCLOSED_P;
  }
  if (intervalQ) {
    return RegexContext.INTERVAL_QUANTIFIER;
  }
  const closesGroupName = text === '>' && current === RegexContext.GROUP_NAME;
  const closesBraces = text === '}' && (
    current === RegexContext.INTERVAL_QUANTIFIER ||
    enclosedTokenRegexContexts.has(current)
  );
  // This context doesn't continue, since another token has now been passed
  const isOneTokenContext = current === RegexContext.INVALID_INCOMPLETE_TOKEN;
  return closesGroupName || closesBraces || isOneTokenContext ? RegexContext.DEFAULT : current;
}
// Scans left to right for the first closing char without a matching opener (returning `rightChar`)
// and otherwise reports any opener left unclosed (returning `leftChar`). Returns an empty string
// if balanced. Escaped versions of the characters get no special handling
function getUnbalancedChar(expression, leftChar, rightChar) {
  const delimiter = new RegExp(`[${escapeV(leftChar + rightChar, Context.CHAR_CLASS)}]`, 'g');
  let depth = 0;
  for (const [char] of expression.matchAll(delimiter)) {
    if (char === leftChar) {
      depth += 1;
    } else {
      depth -= 1;
    }
    if (depth < 0) {
      return rightChar;
    }
  }
  return depth > 0 ? leftChar : '';
}
/**
@typedef {import('./regex.js').InterpolatedValue} InterpolatedValue
@typedef {import('./regex.js').RawTemplate} RawTemplate
@typedef {import('./regex.js').RegexTagOptions} RegexTagOptions
@typedef {(
value: InterpolatedValue,
runningContext: RunningContext,
options: Required<RegexTagOptions>
) => {
transformed: string;
runningContext: RunningContext;
}} Preprocessor
*/
/**
Runs the given preprocessor over each raw template string and each substitution that is an instance
of `Pattern`, in source order, threading the running context from one call to the next. Other
substitutions are passed through as-is.
@param {RawTemplate} template
@param {ReadonlyArray<InterpolatedValue>} substitutions
@param {Preprocessor} preprocessor
@param {Required<RegexTagOptions>} options
@returns {{template: RawTemplate; substitutions: ReadonlyArray<InterpolatedValue>;}}
*/
function preprocess(template, substitutions, preprocessor, options) {
  /** @type {RawTemplate} */
  const transformedTemplate = {raw: []};
  const transformedSubstitutions = [];
  let runningContext;
  // Transforms one value; `lastPos` is reset since every value is a separate string
  const transform = (value) => {
    const {transformed, runningContext: nextContext} = preprocessor(
      value,
      {...runningContext, lastPos: 0},
      options
    );
    runningContext = nextContext;
    return transformed;
  };
  template.raw.forEach((raw, i) => {
    transformedTemplate.raw.push(transform(raw));
    // No substitution follows the final raw string
    if (i >= template.raw.length - 1) {
      return;
    }
    const substitution = substitutions[i];
    transformedSubstitutions.push(
      substitution instanceof Pattern ? pattern(transform(substitution)) : substitution
    );
  });
  return {
    template: transformedTemplate,
    substitutions: transformedSubstitutions,
  };
}
// If the string starts with `^`, replaces that caret with `\^^` so it can't change the meaning of
// the surrounding character class when the value lands in the first position. See
// `sandboxLoneDoublePunctuatorChar` for more details
function sandboxLoneCharClassCaret(str) {
  const leadingCaret = /^\^/;
  return str.replace(leadingCaret, (caret) => `\\${caret}${caret}`);
}
// Sandboxes a leading double punctuator char (one that isn't already doubled) without escaping it
// away: the char is repeated and only the first copy is escaped. The unescaped second copy means
// that, if the same symbol follows, the resulting double punctuator still throws as expected.
// Details:
// - Only the first position needs checking because, if the value is part of an implicit union,
//   interpolation handling will wrap it in nested `[…]`.
// - Wrapping in nested `[…]` here isn't an option, since the value might be used in a range.
// - The unescaped second copy is left out when the lone symbol is the entire string. It might be
//   followed by the same unescaped symbol outside the interpolation and, since it won't be
//   wrapped, the second copy wouldn't be sandboxed from the one following it.
function sandboxLoneDoublePunctuatorChar(str) {
  const loneLeadingPunctuator = new RegExp(`^([${doublePunctuatorChars}])(?!\\1)`);
  return str.replace(loneLeadingPunctuator, (punctuator) => {
    // The match is a single char at position 0, so it's the whole string only if the length is 1
    const isEntireString = str.length === 1;
    return isEntireString ? `\\${punctuator}` : `\\${punctuator}${punctuator}`;
  });
}
/**
Rewrites unescaped `\0` tokens (when not followed by a digit) as `\x00` in the given context.
@param {string} str
@param {'DEFAULT' | 'CHAR_CLASS'} [context] `Context` option from lib `regex-utilities`
@returns {string}
*/
function sandboxUnsafeNulls(str, context) {
  // Nested `[…]` sandboxing can't protect regex`[\0${0}]` or regex`[${pattern`\0`}0]` inside
  // character classes when the interpolated value has no union, because the value might sit on a
  // range boundary. Spelling `\0` as `\x00` in character classes avoids the problem
  const nullToken = String.raw`\\0(?!\d)`;
  const hexNull = '\\x00';
  return replaceUnescaped(str, nullToken, hexNull, context);
}
// Everything this module exports, listed alphabetically (ignoring case)
export {
adjustNumberedBackrefs,
capturingDelim,
CharClassContext,
containsCharClassUnion,
countCaptures,
doublePunctuatorChars,
enclosedTokenCharClassContexts,
enclosedTokenRegexContexts,
envSupportsFlagGroups,
envSupportsFlagV,
escapeV,
getBreakoutChar,
getEndContextForIncompleteExpression,
namedCapturingDelim,
preprocess,
RegexContext,
sandboxLoneCharClassCaret,
sandboxLoneDoublePunctuatorChar,
sandboxUnsafeNulls,
};