The part of <string> that matches the <regexp> is replaced by the
evaluated <replacement>, with $<number> in <replacement> expanded to the
corresponding matching sub-expression of <regexp>, with $0 the entire
- matched section. regedit() only replaces the first match.
- regeditall() replaces all matches. The versions ending in i are
- case insensitive. The <replacement> argument is evaluated once for
- each match, allowing for more complex transformations than is
- possible with straight replacement.
+ matched section. If you use named sub-expressions (?P<foo>subexpr), they are
+ referred to with $<foo> (note that the <>'s are literal).
+
+ regedit() only replaces the first match. regeditall() replaces all matches.
+ The versions ending in i are case insensitive. The <replacement>
+ argument is evaluated once for each match, allowing for more complex
+ transformations than is possible with straight replacement.
Example:
- > say regedit(this test is the best string, (.)est, $1rash)
+ > say regedit(this test is the best string, (?P<char>.)est, $<char>rash)
You say "this trash is the best string"
> say regeditall(this test is the best string, (.)est, [capstr($1)]rash)
You say "this Trash is the Brash string"
If <register list> is specified, there is a side-effect: any
parenthesized substrings within the regular expression will be set
- into the specified local registers, in the order they were specified
- in the list. <register list> can be a list of one through nine numbers.
- If the specified register is -1, the substring is not copied into a
- register. Under regmatchi, case of the substring may be modified.
-
+ into the specified local registers. The syntax for this is X:Y, where
+ X is the number (0 is the entire matched text) or name of the substring,
+ and Y is the q-register to save it in. If X: isn't given, the substring
+ is chosen by the register's position in the list, counting from 0: the
+ first element gets the complete matched text, the second gets the first
+ substring, and so on. This is to maintain compatibility with old code; it's
+ recommended that new code use the X:Y syntax.
+
For example, in regmatch( cookies=30 , (.+)=(\[0-9\]*) )
(note use of escaping for MUSH parser), then the 0th substring
matched is 'cookies=30', the 1st substring is 'cookies', and the 2nd
- substring is '30'. If <register list> is '0 3 5', then %q0 will become
+ substring is '30'. If <register list> is '0:0 1:3 2:5', then %q0 will become
"cookies=30", %q3 will become "cookies", and %q5 will become "30".
- If <register list> was '0 -1 5', then the "cookies" substring would
+ If <register list> was '0:0 2:5', then the "cookies" substring would
simply be discarded.
See 'help regexp syntax' for an explanation of regular expressions.
#endif
/* From cque.c */
+struct real_pcre;
struct eval_context {
char *wenv[10]; /**< working environment (%0-%9) */
char renv[NUMQ][BUFFER_LEN]; /**< working registers q0-q9,qa-qz */
char ucom[BUFFER_LEN]; /**< evaluated command */
int break_called; /**< Has the break command been called? */
char break_replace[BUFFER_LEN]; /**< What to replace the break with */
+ struct real_pcre *re_code; /**< The compiled re */
int re_subpatterns; /**< The number of re subpatterns */
int *re_offsets; /**< The offsets for the subpatterns */
char *re_from; /**< The positions of the subpatterns */
char abuf[BUFFER_LEN], *abp;
char prebuf[BUFFER_LEN], *prep;
char postbuf[BUFFER_LEN], *postp;
+ pcre *old_re_code;
int flags = 0, all = 0, match_offset = 0, len, funccount;
int i;
/* Now copy in the replacement, putting in captured sub-expressions */
obp = args[i + 1];
+ global_eval_context.re_code = re;
global_eval_context.re_from = prebuf;
global_eval_context.re_offsets = offsets;
global_eval_context.re_subpatterns = subpatterns;
if (study)
mush_free((Malloc_t) study, "pcre.extra");
+ global_eval_context.re_code = old_re_code;
global_eval_context.re_offsets = old_re_offsets;
global_eval_context.re_subpatterns = old_re_subpatterns;
global_eval_context.re_from = old_re_from;
* the results of a regexp pattern match into a set of r()-registers.
*
* regmatch(string, pattern, list of registers)
- * If the number of matches exceeds the registers, those bits are tossed
- * out.
- * If -1 is specified as a register number, the matching bit is tossed.
- * Therefore, if the list is "-1 0 3 5", the regexp $0 is tossed, and
- * the regexp $1, $2, and $3 become r(0), r(3), and r(5), respectively.
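+ * Each element of the register list is either a bare register, matched to
+ * a subpattern by its position in the list (old way), or subexpr:register,
+ * where subexpr is a subpattern number or name (new way).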
*
*/
int i, nqregs, curq;
subpatterns = 33;
nqregs = list2arr(qregs, NUMQ, args[2], ' ');
for (i = 0; i < nqregs; i++) {
- if (qregs[i] && qregs[i][0] && !qregs[i][1] &&
- ((qindex = qreg_indexes[(unsigned char) qregs[i][0]]) != -1))
+ char *regname;
+ char *named_subpattern = NULL;
+ int subpattern = 0;
+ if ((regname = strchr(qregs[i], ':'))) {
+ /* subexpr:register */
+ *regname++ = '\0';
+ if (is_strict_integer(qregs[i]))
+ subpattern = parse_integer(qregs[i]);
+ else
+ named_subpattern = qregs[i];
+ } else {
+ /* Get subexpr by position in list */
+ subpattern = i;
+ regname = qregs[i];
+ }
+
+ if (regname && regname[0] && !regname[1] &&
+ ((qindex = qreg_indexes[(unsigned char) regname[0]]) != -1))
curq = qindex;
else
curq = -1;
if (curq < 0 || curq >= NUMQ)
continue;
+
if (subpatterns < 0)
global_eval_context.renv[curq][0] = '\0';
+ else if (named_subpattern)
+ pcre_copy_named_substring(re, args[0], offsets, subpatterns,
+ named_subpattern,
+ global_eval_context.renv[curq], BUFFER_LEN);
else
- pcre_copy_substring(args[0], offsets, subpatterns, i, global_eval_context.renv[curq],
- BUFFER_LEN);
+ pcre_copy_substring(args[0], offsets, subpatterns, subpattern,
+ global_eval_context.renv[curq], BUFFER_LEN);
}
mush_free((Malloc_t) re, "pcre");
}
global_eval_context.rnxt[j] = NULL;
/* Initialize the regexp patterns to nothing */
+ global_eval_context.re_code = NULL;
global_eval_context.re_subpatterns = -1;
global_eval_context.re_offsets = NULL;
global_eval_context.re_from = NULL;
global_eval_context.re_subpatterns >= 0) {
char obuf[BUFFER_LEN];
int p = 0;
+ char subspace[BUFFER_LEN];
+ char *named_substring = NULL;
+ obuf[0] = '\0';
(*str)++;
/* Check the first two characters after the $ for a number */
if (isdigit((unsigned char) **str)) {
safe_number(p, buff, bp);
}
}
+ /* Look for a named subexpression */
+ } else if (**str == '<') {
+ char *nbuf = subspace;
+ (*str)++;
+ for (; *str && **str != '>'; (*str)++)
+ safe_chr(**str, subspace, &nbuf);
+ *nbuf = '\0';
+ if (*str)
+ (*str)++;
+ if (is_strict_integer(subspace))
+ p = abs(parse_integer(subspace));
+ else
+ named_substring = subspace;
} else {
safe_chr('$', buff, bp);
break;
}
- if (p >= global_eval_context.re_subpatterns ||
+ if ((!named_substring && p >= global_eval_context.re_subpatterns) ||
global_eval_context.re_offsets == NULL ||
global_eval_context.re_from == NULL) {
/* It's out of bounds, return */
safe_chr('$', buff, bp);
- safe_number(p, buff, bp);
+ if (named_substring)
+ safe_format(buff, bp, "<%s>", named_substring);
+ else
+ safe_integer(p, buff, bp);
break;
}
- pcre_copy_substring(global_eval_context.re_from,
- global_eval_context.re_offsets,
- global_eval_context.re_subpatterns,
- p, obuf, BUFFER_LEN);
+ if (named_substring) {
+ pcre_copy_named_substring(global_eval_context.re_code,
+ global_eval_context.re_from,
+ global_eval_context.re_offsets,
+ global_eval_context.re_subpatterns,
+ named_substring, obuf, BUFFER_LEN);
+ } else {
+ pcre_copy_substring(global_eval_context.re_from,
+ global_eval_context.re_offsets,
+ global_eval_context.re_subpatterns,
+ p, obuf, BUFFER_LEN);
+ }
safe_str(obuf, buff, bp);
} else {
safe_chr('$', buff, bp);