// File: Library/node_modules/regex/src/subroutines.js
// Last modified: 2026-01-09 23:05:52 -05:00
// Size: 373 lines, 14 KiB (JavaScript)

import {capturingDelim, countCaptures, namedCapturingDelim} from './utils.js';
import {incrementIfAtLeast, spliceStr} from './utils-internals.js';
import {Context, execUnescaped, forEachUnescaped, getGroupContents, hasUnescaped, replaceUnescaped} from 'regex-utilities';
/**
Plugin entry point for subroutine support. Collects the expression's named capturing groups,
expands every `\g<name>` via `processSubroutines`, then hands the result to
`processDefinitionGroup` to deal with a `(?(DEFINE)…)` group.
@param {string} expression
@param {import('./regex.js').PluginData} [data] Optional; only its `hiddenCaptures` is read here
@returns {import('./regex.js').PluginResult}
*/
function subroutines(expression, data) {
  // NOTE: subroutines and definition groups fully support numbered backreferences and unnamed
  // captures (from interpolated regexes or from turning implicit flag n off), and all of the
  // complex forward and backward backreference adjustments that can result
  const groupsByName = getNamedCapturingGroups(expression, {includeContents: true});
  // Start from an empty list when the caller supplied no hidden captures
  const priorHiddenCaptures = data?.hiddenCaptures ?? [];
  const {pattern: expanded, hiddenCaptures} = processSubroutines(
    expression,
    groupsByName,
    priorHiddenCaptures
  );
  const pattern = processDefinitionGroup(expanded, groupsByName);
  return {pattern, hiddenCaptures};
}
// Source for matching a subroutine call `\g<name>`, capturing the name as `subroutineName`.
// Explicitly exclude `&` from subroutine name chars because it's used by extension
// `regex-recursion` for recursive subroutines via `\g<name&R=N>`
const subroutinePattern = String.raw`\\g<(?<subroutineName>[^>&]+)>`;
// Tokenizer that `processSubroutines` steps through the expression with. The template is spread
// over several lines for readability only: `.replace(/\s+/g, '')` removes all whitespace before
// the regex is compiled. Alternatives, in order of priority:
// - A subroutine call (`subroutinePattern`)
// - The opening delimiter of a capturing group (`capturingDelim`, imported from './utils.js'),
//   captured as `capturingStart`
// - A numbered backreference such as `\1`, with the number captured as `backrefNum`
// - A named backreference `\k<name>`, with the name captured as `backrefName`
// - Any other single character, optionally preceded by a backslash
// Flags: `g` makes matching stateful via `lastIndex` (which `processSubroutines` resets and
// adjusts), `s` lets `.` match line terminators, and `u` enables Unicode mode
const token = new RegExp(String.raw`
${subroutinePattern}
| (?<capturingStart>${capturingDelim})
| \\(?<backrefNum>[1-9]\d*)
| \\k<(?<backrefName>[^>]+)>
| \\?.
`.replace(/\s+/g, ''), 'gsu');
/**
@typedef {
Map<string, {
isUnique: boolean;
contents?: string;
groupNum?: number;
numCaptures?: number;
}>} NamedCapturingGroupsMap
*/
/**
Apply transformations for subroutines: `\g<name>`.

Each `\g<name>` found outside of a character class is replaced, in place, by the contents of the
named group it references, wrapped in a group. The expanded contents are then scanned by the same
loop, so subroutine calls nested in them are expanded as well. If the expression contains any
backreferences, a second pass rewrites them to account for the captures added by expansion.

NOTE: `hiddenCaptures` is modified in place (the capture numbers added by expansion are appended
to it) and the same array is returned in the result.
@param {string} expression
@param {NamedCapturingGroupsMap} namedGroups Entries must include `contents`, `groupNum`, and
`numCaptures`, since all three are read here
@param {Array<number>} hiddenCaptures
@returns {import('./regex.js').PluginResult}
@throws {Error} If a subroutine references a group name that isn't in `namedGroups`, if a
subroutine is encountered while a subroutine of the same name is still being expanded, or if a
numbered backreference is greater than the number of captures tracked outside of subroutines
*/
function processSubroutines(expression, namedGroups, hiddenCaptures) {
// Nothing to do if the text `\g<` doesn't appear anywhere in the expression
if (!/\\g</.test(expression)) {
return {
pattern: expression,
hiddenCaptures,
};
}
// Can skip a lot of processing and avoid adding captures if there are no backrefs
const hasBackrefs = hasUnescaped(expression, '\\\\(?:[1-9]|k<[^>]+>)', Context.DEFAULT);
const subroutineWrapper = hasBackrefs ? '(' : '(?:';
// Subroutines currently being expanded, keyed by name; used to detect recursion and to track how
// many groups remain unclosed within each expansion
const openSubroutines = new Map();
// Names of the subroutines currently being expanded, outermost first
const openSubroutinesStack = [];
// Index: capture number in the original expression; value: capture number after expansion.
// Index 0 is a seed so that the first real entry can be derived from `lastOf(captureNumMap)`
const captureNumMap = [0];
const addedHiddenCaptures = [];
let numCapturesPassedOutsideSubroutines = 0;
let numCapturesPassedInsideSubroutines = 0;
let numCapturesPassedInsideThisSubroutine = 0;
let numSubroutineCapturesTrackedInRemap = 0;
let numCharClassesOpen = 0;
let match;
// `token` is a shared regex with flag g, so its position must be reset before each use
token.lastIndex = 0;
while (match = token.exec(expression)) {
const {0: m, index, groups: {subroutineName, capturingStart, backrefNum, backrefName}} = match;
if (m === '[') {
numCharClassesOpen++;
} else if (!numCharClassesOpen) {
if (subroutineName) {
if (!namedGroups.has(subroutineName)) {
throw new Error(`Invalid named capture referenced by subroutine ${m}`);
}
if (openSubroutines.has(subroutineName)) {
throw new Error(`Subroutine ${m} followed a recursive reference`);
}
const contents = namedGroups.get(subroutineName).contents;
// Wrap value in case it has top-level alternation or is followed by a quantifier. The
// wrapper also marks the end of the expanded contents, which we'll track using
// `unclosedGroupCount`. If there are any backrefs in the expression, wrap with `()`
// instead of `(?:)` in case there are backrefs inside the subroutine that refer to their
// containing capturing group
const subroutineValue = `${subroutineWrapper}${contents})`;
if (hasBackrefs) {
// The wrapper itself is a capture in this case, so count it and record it as hidden
numCapturesPassedInsideThisSubroutine = 0;
numCapturesPassedInsideSubroutines++;
updateHiddenCaptureTracking(
hiddenCaptures,
addedHiddenCaptures,
numCapturesPassedOutsideSubroutines + numCapturesPassedInsideSubroutines
);
}
openSubroutines.set(subroutineName, {
// Incrementally decremented to track when we've left the group
unclosedGroupCount: countOpenParens(subroutineValue),
});
openSubroutinesStack.push(subroutineName);
// Expand the subroutine's contents into the pattern we're looping over
expression = spliceStr(expression, index, m, subroutineValue);
// Rewind so that scanning resumes immediately after the inserted wrapper, which means the
// expanded contents are processed by subsequent iterations of this loop
token.lastIndex -= m.length - subroutineWrapper.length;
} else if (capturingStart) {
// Somewhere within an expanded subroutine
if (openSubroutines.size) {
if (hasBackrefs) {
numCapturesPassedInsideThisSubroutine++;
numCapturesPassedInsideSubroutines++;
updateHiddenCaptureTracking(
hiddenCaptures,
addedHiddenCaptures,
numCapturesPassedOutsideSubroutines + numCapturesPassedInsideSubroutines
);
}
// Named capturing group
if (m !== '(') {
// Replace named with unnamed capture. Subroutines ideally wouldn't create any new
// captures, but it can't be helped since we need any backrefs to this capture to work.
// Given that flag n prevents unnamed capture and thereby requires you to rely on named
// backrefs and `groups`, switching to unnamed essentially accomplishes not creating a
// capture. Can fully avoid capturing if there are no backrefs in the expression
expression = spliceStr(expression, index, m, subroutineWrapper);
// Resume scanning right after the replacement delimiter
token.lastIndex -= m.length - subroutineWrapper.length;
}
} else if (hasBackrefs) {
// A capture outside of any subroutine: its new number is the previous mapped number plus one,
// plus however many subroutine captures were added since the last entry was recorded
captureNumMap.push(
lastOf(captureNumMap) + 1 +
numCapturesPassedInsideSubroutines -
numSubroutineCapturesTrackedInRemap
);
numSubroutineCapturesTrackedInRemap = numCapturesPassedInsideSubroutines;
numCapturesPassedOutsideSubroutines++;
}
} else if ((backrefNum || backrefName) && openSubroutines.size) {
// Unify handling for named and unnamed by always using the backref num
// (`num` is `undefined` for a named backref whose name isn't in `namedGroups`, in which case
// the range check below never passes and the backref is left as is)
const num = backrefNum ? +backrefNum : namedGroups.get(backrefName)?.groupNum;
let isGroupFromThisSubroutine = false;
// Search for the group in the contents of the subroutine stack
for (const s of openSubroutinesStack) {
const group = namedGroups.get(s);
if (num >= group.groupNum && num <= (group.groupNum + group.numCaptures)) {
isGroupFromThisSubroutine = true;
break;
}
}
if (isGroupFromThisSubroutine) {
const group = namedGroups.get(lastOf(openSubroutinesStack));
// Replace the backref with metadata we'll need to rewrite it later, using
// `\k<$$bNsNrNcN>` as a temporary wrapper:
// - b: The unmodified matched backref num, or the corresponding num of a named backref
// - s: The capture num of the subroutine we're most deeply nested in, including captures
// added by expanding the contents of preceding subroutines
// - r: The original capture num of the group that the subroutine we're most deeply
// nested in references, not counting the effects of subroutines
// - c: The number of captures within `r`, not counting the effects of subroutines
const subroutineNum = numCapturesPassedOutsideSubroutines + numCapturesPassedInsideSubroutines - numCapturesPassedInsideThisSubroutine;
const metadata = `\\k<$$b${num}s${subroutineNum}r${group.groupNum}c${group.numCaptures}>`;
expression = spliceStr(expression, index, m, metadata);
// Resume scanning right after the inserted placeholder
token.lastIndex += metadata.length - m.length;
}
} else if (m === ')') {
if (openSubroutines.size) {
// Closing a group inside the innermost open subroutine; once all of its groups (including
// its wrapper) are closed, the subroutine's expansion has ended
const subroutine = openSubroutines.get(lastOf(openSubroutinesStack));
subroutine.unclosedGroupCount--;
if (!subroutine.unclosedGroupCount) {
openSubroutines.delete(openSubroutinesStack.pop());
}
}
}
} else if (m === ']') {
numCharClassesOpen--;
}
}
// Appends to the caller-provided array
hiddenCaptures.push(...addedHiddenCaptures);
if (hasBackrefs) {
// Second pass to adjust backrefs
expression = replaceUnescaped(
expression,
String.raw`\\(?:(?<bNum>[1-9]\d*)|k<\$\$b(?<bNumSub>\d+)s(?<subNum>\d+)r(?<refNum>\d+)c(?<refCaps>\d+)>)`,
({0: m, groups: {bNum, bNumSub, subNum, refNum, refCaps}}) => {
// Plain numbered backref: remap from the original capture number to the new one
if (bNum) {
const backrefNum = +bNum;
if (backrefNum > captureNumMap.length - 1) {
throw new Error(`Backref "${m}" greater than number of captures`);
}
return `\\${captureNumMap[backrefNum]}`;
}
// Otherwise this is a `\k<$$bNsNrNcN>` placeholder written by the first pass
const backrefNumInSubroutine = +bNumSub;
const subroutineGroupNum = +subNum;
const refGroupNum = +refNum;
const numCapturesInRef = +refCaps;
// The backref points outside of the group referenced by the innermost subroutine, so use
// the regular remapping
if (backrefNumInSubroutine < refGroupNum || backrefNumInSubroutine > (refGroupNum + numCapturesInRef)) {
return `\\${captureNumMap[backrefNumInSubroutine]}`;
}
// The backref points into the referenced group, so offset it to the matching capture within
// this expansion
return `\\${subroutineGroupNum - refGroupNum + backrefNumInSubroutine}`;
},
Context.DEFAULT
);
}
return {
pattern: expression,
hiddenCaptures,
};
}
// Tokenizer that `processDefinitionGroup` runs over the contents of a DEFINE group. It matches
// the opening delimiter of a named capturing group (`namedCapturingDelim`, imported from
// './utils.js'), an empty noncapturing group, or else any single character (optionally preceded
// by a backslash), which is captured as `invalid`. Has flag g, so it's stateful via `lastIndex`.
// `(?:)` allowed because it can be added by flag x's preprocessing of whitespace and comments
const defineGroupToken = new RegExp(String.raw`${namedCapturingDelim}|\(\?:\)|(?<invalid>\\?.)`, 'gsu');
/**
Validate and strip a subroutine definition group: `(?(DEFINE)…)`.

If the expression has no unescaped `(?(DEFINE)`, it is returned unchanged. Otherwise the group
must be the last thing in the expression and may contain only named groups (and empty `(?:)`) at
its top level; the returned string is everything that precedes the group.
@param {string} expression
@param {NamedCapturingGroupsMap} namedGroups
IMPORTANT: Avoid using the `contents` property of `namedGroups` objects, because at this point
subroutine substitution has been performed on the corresponding substrings in `expression`
@returns {string}
@throws {Error} If the DEFINE group is not at the end of the expression, is unclosed, contains
anything other than named groups at its top level, or contains a group name that is not unique
*/
function processDefinitionGroup(expression, namedGroups) {
  const defineStart = execUnescaped(expression, String.raw`\(\?\(DEFINE\)`, 0, Context.DEFAULT);
  if (!defineStart) {
    return expression;
  }
  const {contents: defineContents, afterPos: defineEnd} = getGroup(expression, defineStart);
  if (defineEnd < expression.length) {
    // Supporting DEFINE at positions other than the end would complicate backref handling.
    // NOTE: Flag x's preprocessing permits trailing whitespace and comments
    throw new Error('DEFINE group allowed only at the end of a regex');
  }
  if (defineEnd > expression.length) {
    throw new Error('DEFINE group is unclosed');
  }
  const isDuplicated = name => !namedGroups.get(name).isUnique;
  let tokenMatch;
  defineGroupToken.lastIndex = 0;
  while ((tokenMatch = defineGroupToken.exec(defineContents)) !== null) {
    const {captureName, invalid} = tokenMatch.groups;
    if (captureName) {
      const {contents: groupContents, afterPos: groupEnd} = getGroup(defineContents, tokenMatch);
      // Check the group's own name first, then the names of any groups nested within it
      const duplicateName = isDuplicated(captureName) ?
        captureName :
        [...getNamedCapturingGroups(groupContents, {includeContents: false}).keys()].find(isDuplicated);
      if (duplicateName) {
        throw new Error(`Duplicate group name "${duplicateName}" within DEFINE`);
      }
      // Jump past the whole group so only top-level syntax is tokenized
      defineGroupToken.lastIndex = groupEnd;
    } else if (invalid) {
      // Since a DEFINE group is stripped from its expression, we can't easily determine whether
      // unreferenced top-level syntax within it is valid. Such syntax serves no purpose, so it's
      // easiest to not allow it
      throw new Error(`DEFINE group includes unsupported syntax at top level`);
    }
  }
  return expression.slice(0, defineStart.index);
}
/**
Tallies every match of an opening paren that `forEachUnescaped` reports for the expression in the
default context (i.e. unescaped and outside of character classes), whatever kind of group it opens
@param {string} expression
@returns {number}
*/
function countOpenParens(expression) {
  let openParens = 0;
  const tally = () => openParens++;
  forEachUnescaped(expression, '\\(', tally, Context.DEFAULT);
  return openParens;
}
/**
Finds the capture number of the first capturing group whose name is `groupName`, counting
capturing groups from the start of the expression. If no group has that name, the result is the
total number of capturing groups found.
@param {string} expression
@param {string} groupName
@returns {number}
*/
function getCaptureNum(expression, groupName) {
  let captureCount = 0;
  let searchFrom = 0;
  for (;;) {
    const delim = execUnescaped(expression, capturingDelim, searchFrom, Context.DEFAULT);
    if (!delim) {
      return captureCount;
    }
    const {0: delimText, index, groups: {captureName}} = delim;
    captureCount += 1;
    if (captureName === groupName) {
      return captureCount;
    }
    // Continue searching right after this group's opening delimiter
    searchFrom = index + delimText.length;
  }
}
/**
Given the match of a group's opening delimiter, returns the group's contents (as reported by
`getGroupContents`) along with the position that follows the contents plus one more character,
which accounts for the group's closing paren.
@param {string} expression
@param {RegExpExecArray} delimMatch
@returns {{contents: string; afterPos: number}}
*/
function getGroup(expression, delimMatch) {
  const {index, 0: delim} = delimMatch;
  const bodyStart = index + delim.length;
  const contents = getGroupContents(expression, bodyStart);
  return {
    contents,
    afterPos: bodyStart + contents.length + 1,
  };
}
/**
Builds a map of the expression's named capturing groups, keyed by name. A name that appears more
than once keeps the data of its first occurrence and is flagged with `isUnique: false`.
@param {string} expression
@param {{includeContents: boolean}} options When `includeContents` is true, each entry also gets
`contents`, `groupNum`, and `numCaptures`
@returns {NamedCapturingGroupsMap}
*/
function getNamedCapturingGroups(expression, {includeContents}) {
  const groupsByName = new Map();
  const recordGroup = ({0: delim, index, groups: {captureName}}) => {
    // If there are duplicate capture names, subroutines refer to the first instance of the given
    // group (matching the behavior of PCRE and Perl)
    const known = groupsByName.get(captureName);
    if (known) {
      known.isUnique = false;
      return;
    }
    const entry = {isUnique: true};
    if (includeContents) {
      const contents = getGroupContents(expression, index + delim.length);
      const groupNum = getCaptureNum(expression, captureName);
      const numCaptures = countCaptures(contents);
      entry.contents = contents;
      entry.groupNum = groupNum;
      entry.numCaptures = numCaptures;
    }
    groupsByName.set(captureName, entry);
  };
  forEachUnescaped(expression, namedCapturingDelim, recordGroup, Context.DEFAULT);
  return groupsByName;
}
/**
Returns the final element of an array, or `undefined` if the array is empty
@param {Array<any>} arr
@returns {any}
*/
function lastOf(arr) {
  // Remove when support for ES2022 array method `at` (Node.js 16.6) is no longer an issue:
  // <https://caniuse.com/mdn-javascript_builtins_array_at>
  const {length} = arr;
  return arr[length - 1];
}
/**
Registers a capture number that was introduced by subroutine expansion. The number is appended to
`addedHiddenCaptures`, and `hiddenCaptures` is passed along with it to `incrementIfAtLeast`
(imported from './utils-internals.js'; presumably it shifts the existing entries that are at or
above the new number, which should be confirmed against that module). Both arrays are modified in
place.
@param {Array<number>} hiddenCaptures
@param {Array<number>} addedHiddenCaptures
@param {number} addedCaptureNum
*/
function updateHiddenCaptureTracking(hiddenCaptures, addedHiddenCaptures, addedCaptureNum) {
  // Record the new capture first, then let the existing hidden captures be adjusted for it
  addedHiddenCaptures.push(addedCaptureNum);
  incrementIfAtLeast(hiddenCaptures, addedCaptureNum);
}
export {
subroutines,
};