Library/node_modules/regex/src/utils.js

import {Pattern, pattern} from './pattern.js';
import {Context, forEachUnescaped, replaceUnescaped} from 'regex-utilities';

const RegexContext = {
  DEFAULT: 'DEFAULT',
  CHAR_CLASS: 'CHAR_CLASS',
  ENCLOSED_P: 'ENCLOSED_P',
  ENCLOSED_U: 'ENCLOSED_U',
  GROUP_NAME: 'GROUP_NAME',
  INTERVAL_QUANTIFIER: 'INTERVAL_QUANTIFIER',
  INVALID_INCOMPLETE_TOKEN: 'INVALID_INCOMPLETE_TOKEN',
};

const CharClassContext = {
  DEFAULT: 'DEFAULT',
  ENCLOSED_P: 'ENCLOSED_P',
  ENCLOSED_Q: 'ENCLOSED_Q',
  ENCLOSED_U: 'ENCLOSED_U',
  INVALID_INCOMPLETE_TOKEN: 'INVALID_INCOMPLETE_TOKEN',
  RANGE: 'RANGE',
};

const enclosedTokenRegexContexts = new Set([
  RegexContext.ENCLOSED_P,
  RegexContext.ENCLOSED_U,
]);

const enclosedTokenCharClassContexts = new Set([
  CharClassContext.ENCLOSED_P,
  CharClassContext.ENCLOSED_Q,
  CharClassContext.ENCLOSED_U,
]);

const envSupportsFlagGroups = (() => {
  try {
    new RegExp('(?i:)');
  } catch {
    return false;
  }
  return true;
})();

const envSupportsFlagV = (() => {
  try {
    new RegExp('', 'v');
  } catch {
    return false;
  }
  return true;
})();

const doublePunctuatorChars = '&!#$%*+,.:;<=>?@^`~';
const namedCapturingDelim = String.raw`\(\?<(?![=!])(?<captureName>[^>]+)>`;
const capturingDelim = String.raw`\((?!\?)(?!(?<=\(\?\()DEFINE\))|${namedCapturingDelim}`;

/**
@param {string} expression
@param {number} precedingCaptures
@returns {string}
*/
function adjustNumberedBackrefs(expression, precedingCaptures) {
  return replaceUnescaped(
    expression,
    String.raw`\\(?<num>[1-9]\d*)`,
    ({groups: {num}}) => `\\${+num + precedingCaptures}`,
    Context.DEFAULT
  );
}

// Properties of strings as of ES2024
const stringPropertyNames = [
  'Basic_Emoji',
  'Emoji_Keycap_Sequence',
  'RGI_Emoji_Modifier_Sequence',
  'RGI_Emoji_Flag_Sequence',
  'RGI_Emoji_Tag_Sequence',
  'RGI_Emoji_ZWJ_Sequence',
  'RGI_Emoji',
].join('|');
const charClassUnionToken = new RegExp(String.raw`
\\(?: c[A-Za-z]
  | p\{(?<pStrProp>${stringPropertyNames})\}
  | [pP]\{[^\}]+\}
  | (?<qStrProp>q)
  | u(?:[A-Fa-f\d]{4}|\{[A-Fa-f\d]+\})
  | x[A-Fa-f\d]{2}
  | .
)
| --
| &&
| .
`.replace(/\s+/g, ''), 'gsu');

// Assumes flag v and doesn't worry about syntax errors that are caught by it
function containsCharClassUnion(charClassPattern) {
  // Return `true` if it contains:
  // - `\p` (lowercase only) and the name is a property of strings (case sensitive).
  // - `\q`.
  // - Two single-char-matching tokens in sequence.
  // - One single-char-matching token followed immediately by unescaped `[`.
  // - One single-char-matching token preceded immediately by unescaped `]`.
  // Else, `false`.
  // Ranges with `-` create a single token.
  // Subtraction and intersection with `--` and `&&` create a single token.
  // Supports any number of nested classes
  let hasFirst = false;
  let lastM;
  for (const {0: m, groups} of charClassPattern.matchAll(charClassUnionToken)) {
    if (groups.pStrProp || groups.qStrProp) {
      return true;
    }
    if (m === '[' && hasFirst) {
      return true;
    }
    if (['-', '--', '&&'].includes(m)) {
      hasFirst = false;
    } else if (m !== '[' && m !== ']') {
      if (hasFirst || lastM === ']') {
        return true;
      }
      hasFirst = true;
    }
    lastM = m;
  }
  return false;
}

/**
@param {string} expression
@returns {number}
*/
function countCaptures(expression) {
  let num = 0;
  forEachUnescaped(expression, capturingDelim, () => num++, Context.DEFAULT);
  return num;
}

/**
Escape special characters for the given context, assuming flag v.
@param {string} str String to escape
@param {'DEFAULT' | 'CHAR_CLASS'} context `Context` option from lib `regex-utilities`
@returns {string} Escaped string
*/
function escapeV(str, context) {
  if (context === Context.CHAR_CLASS) {
    // Escape all double punctuators (including ^, which is special on its own in the first
    // position) in case they're bordered by the same character in or outside of the escaped string
    return str.replace(new RegExp(String.raw`[()\[\]{}|\\/\-${doublePunctuatorChars}]`, 'g'), '\\$&');
  }
  return str.replace(/[()\[\]{}|\\^$*+?.]/g, '\\$&');
}

// Look for characters that would change the meaning of subsequent tokens outside an interpolated value
function getBreakoutChar(expression, regexContext, charClassContext) {
  const escapesRemoved = expression.replace(/\\./gsu, '');
  // Trailing unescaped `\`; checking `.includes('\\')` would also work
  if (escapesRemoved.endsWith('\\')) {
    return '\\';
  }
  if (regexContext === RegexContext.DEFAULT) {
    // Unbalanced `[` or `]` are also errors but don't breakout; they're caught by the wrapper
    return getUnbalancedChar(escapesRemoved, '(', ')');
  } else if (
    regexContext === RegexContext.CHAR_CLASS &&
    !enclosedTokenCharClassContexts.has(charClassContext)
  ) {
    return getUnbalancedChar(escapesRemoved, '[', ']');
  } else if (
    regexContext === RegexContext.INTERVAL_QUANTIFIER ||
    enclosedTokenRegexContexts.has(regexContext) ||
    enclosedTokenCharClassContexts.has(charClassContext)
  ) {
    if (escapesRemoved.includes('}')) {
      return '}';
    }
  } else if (regexContext === RegexContext.GROUP_NAME) {
    if (escapesRemoved.includes('>')) {
      return '>';
    }
  }
  return '';
}

const contextToken = new RegExp(String.raw`
(?<groupN>\(\?<(?![=!])|\\[gk]<)
| (?<enclosedPU>\\[pPu]\{)
| (?<enclosedQ>\\q\{)
| (?<intervalQ>\{)
| (?<incompleteT>\\(?: $
  | c(?![A-Za-z])
  | u(?![A-Fa-f\d]{4})[A-Fa-f\d]{0,3}
  | x(?![A-Fa-f\d]{2})[A-Fa-f\d]?
  )
)
| --
| \\?.
`.replace(/\s+/g, ''), 'gsu');

/**
@typedef {{
  regexContext: string;
  charClassContext: string;
  charClassDepth: number;
  lastPos: number;
}} RunningContext
*/
/**
Accepts and returns its full state so it doesn't have to reprocess parts that have already been
seen. Assumes flag v and doesn't worry about syntax errors that are caught by it.
@param {string} incompleteExpression
@param {Partial<RunningContext>} [runningContext]
@returns {RunningContext}
*/
function getEndContextForIncompleteExpression(incompleteExpression, runningContext) {
  let {regexContext, charClassContext, charClassDepth, lastPos} = {
    regexContext: RegexContext.DEFAULT,
    charClassContext: CharClassContext.DEFAULT,
    charClassDepth: 0,
    lastPos: 0,
    ...runningContext,
  };
  contextToken.lastIndex = lastPos;
  let match;
  while (match = contextToken.exec(incompleteExpression)) {
    const {0: m, groups: {groupN, enclosedPU, enclosedQ, intervalQ, incompleteT}} = match;
    if (m === '[') {
      charClassDepth++;
      regexContext = RegexContext.CHAR_CLASS;
      charClassContext = CharClassContext.DEFAULT;
    } else if (m === ']' && regexContext === RegexContext.CHAR_CLASS) {
      if (charClassDepth) {
        charClassDepth--;
      }
      if (!charClassDepth) {
        regexContext = RegexContext.DEFAULT;
      }
      charClassContext = CharClassContext.DEFAULT;
    } else if (regexContext === RegexContext.CHAR_CLASS) {
      if (incompleteT) {
        charClassContext = CharClassContext.INVALID_INCOMPLETE_TOKEN;
      } else if (m === '-') {
        charClassContext = CharClassContext.RANGE;
      } else if (enclosedPU) {
        charClassContext = m[1] === 'u' ? CharClassContext.ENCLOSED_U : CharClassContext.ENCLOSED_P;
      } else if (enclosedQ) {
        charClassContext = CharClassContext.ENCLOSED_Q;
      } else if (
        (m === '}' && enclosedTokenCharClassContexts.has(charClassContext)) ||
        // Don't continue in these contexts since we've advanced another token
        charClassContext === CharClassContext.INVALID_INCOMPLETE_TOKEN ||
        charClassContext === CharClassContext.RANGE
      ) {
        charClassContext = CharClassContext.DEFAULT;
      }
    } else {
      if (incompleteT) {
        regexContext = RegexContext.INVALID_INCOMPLETE_TOKEN;
      } else if (groupN) {
        regexContext = RegexContext.GROUP_NAME;
      } else if (enclosedPU) {
        regexContext = m[1] === 'u' ? RegexContext.ENCLOSED_U : RegexContext.ENCLOSED_P;
      } else if (intervalQ) {
        regexContext = RegexContext.INTERVAL_QUANTIFIER;
      } else if (
        (m === '>' && regexContext === RegexContext.GROUP_NAME) ||
        (m === '}' && (regexContext === RegexContext.INTERVAL_QUANTIFIER || enclosedTokenRegexContexts.has(regexContext))) ||
        // Don't continue in this context since we've advanced another token
        regexContext === RegexContext.INVALID_INCOMPLETE_TOKEN
       ) {
        regexContext = RegexContext.DEFAULT;
      }
    }
  }
  return {
    regexContext,
    charClassContext,
    charClassDepth,
    lastPos: incompleteExpression.length,
  };
}

// No special handling for escaped versions of the characters
function getUnbalancedChar(expression, leftChar, rightChar) {
  let numOpen = 0;
  for (const [m] of expression.matchAll(new RegExp(`[${escapeV(leftChar + rightChar, Context.CHAR_CLASS)}]`, 'g'))) {
    numOpen += m === leftChar ? 1 : -1;
    if (numOpen < 0) {
      return rightChar;
    }
  }
  if (numOpen > 0) {
    return leftChar;
  }
  return '';
}

/**
@typedef {import('./regex.js').InterpolatedValue} InterpolatedValue
@typedef {import('./regex.js').RawTemplate} RawTemplate
@typedef {import('./regex.js').RegexTagOptions} RegexTagOptions
@typedef {(
  value: InterpolatedValue,
  runningContext: RunningContext,
  options: Required<RegexTagOptions>
) => {
  transformed: string;
  runningContext: RunningContext;
}} Preprocessor
*/
/**
Returns transformed versions of a template and substitutions, using the given preprocessor. Only
processes substitutions that are instanceof `Pattern`.
@param {RawTemplate} template
@param {ReadonlyArray<InterpolatedValue>} substitutions
@param {Preprocessor} preprocessor
@param {Required<RegexTagOptions>} options
@returns {{template: RawTemplate; substitutions: ReadonlyArray<InterpolatedValue>;}}
*/
function preprocess(template, substitutions, preprocessor, options) {
  let /** @type {RawTemplate} */ newTemplate = {raw: []};
  let newSubstitutions = [];
  let runningContext;
  template.raw.forEach((raw, i) => {
    const result = preprocessor(raw, {...runningContext, lastPos: 0}, options);
    newTemplate.raw.push(result.transformed);
    runningContext = result.runningContext;
    if (i < template.raw.length - 1) {
      const substitution = substitutions[i];
      if (substitution instanceof Pattern) {
        const result = preprocessor(substitution, {...runningContext, lastPos: 0}, options);
        newSubstitutions.push(pattern(result.transformed));
        runningContext = result.runningContext;
      } else {
        newSubstitutions.push(substitution);
      }
    }
  });
  return {
    template: newTemplate,
    substitutions: newSubstitutions,
  };
}

// Sandbox `^` if relevant, done so it can't change the meaning of the surrounding character class
// if we happen to be at the first position. See `sandboxLoneDoublePunctuatorChar` for more details
function sandboxLoneCharClassCaret(str) {
  return str.replace(/^\^/, '\\^^');
}

// Sandbox without escaping by repeating the character and escaping only the first one. The second
// one is so that, if followed by the same symbol, the resulting double punctuator will still throw
// as expected. Details:
// - Only need to check the first position because, if it's part of an implicit union,
//   interpolation handling will wrap it in nested `[…]`.
// - Can't just wrap in nested `[…]` here, since the value might be used in a range.
// - Can't add a second unescaped symbol if a lone symbol is the entire string because it might be
//   followed by the same unescaped symbol outside an interpolation, and since it won't be wrapped,
//   the second symbol wouldn't be sandboxed from the one following it.
function sandboxLoneDoublePunctuatorChar(str) {
  return str.replace(new RegExp(`^([${doublePunctuatorChars}])(?!\\1)`), (m, _, pos) => {
    return `\\${m}${pos + 1 === str.length ? '' : m}`;
  });
}

/**
Converts `\0` tokens to `\x00` in the given context.
@param {string} str
@param {'DEFAULT' | 'CHAR_CLASS'} [context] `Context` option from lib `regex-utilities`
@returns {string}
*/
function sandboxUnsafeNulls(str, context) {
  // regex`[\0${0}]` and regex`[${pattern`\0`}0]` can't be guarded against via nested `[…]`
  // sandboxing in character classes if the interpolated value doesn't contain union (since it
  // might be placed on a range boundary). So escape `\0` in character classes as `\x00`
  return replaceUnescaped(str, String.raw`\\0(?!\d)`, '\\x00', context);
}

export {
  adjustNumberedBackrefs,
  capturingDelim,
  CharClassContext,
  containsCharClassUnion,
  countCaptures,
  doublePunctuatorChars,
  enclosedTokenCharClassContexts,
  enclosedTokenRegexContexts,
  envSupportsFlagGroups,
  envSupportsFlagV,
  escapeV,
  getBreakoutChar,
  getEndContextForIncompleteExpression,
  namedCapturingDelim,
  preprocess,
  RegexContext,
  sandboxLoneCharClassCaret,
  sandboxLoneDoublePunctuatorChar,
  sandboxUnsafeNulls,
};