From 87d0951b6969d83c858c114ac0be3fdffd5049f7 Mon Sep 17 00:00:00 2001 From: Ari Johnson Date: Sat, 3 Mar 2007 00:19:58 +0000 Subject: [PATCH] regmatch()/regedit() improved subpattern-to-%q syntax (cherry picked from commit 46e8858b626c7c364b1af70ddd2a90b11500652c) --- game/txt/hlp/cobra_func.hlp | 31 ++++++++++++++++------------- hdrs/externs.h | 2 ++ src/funlist.c | 38 +++++++++++++++++++++++++++--------- src/game.c | 1 + src/parse.c | 39 +++++++++++++++++++++++++++++++------ 5 files changed, 83 insertions(+), 28 deletions(-) diff --git a/game/txt/hlp/cobra_func.hlp b/game/txt/hlp/cobra_func.hlp index af86e61..2fcf9a4 100644 --- a/game/txt/hlp/cobra_func.hlp +++ b/game/txt/hlp/cobra_func.hlp @@ -2890,14 +2890,16 @@ for an object named "Test", preferring a thing over other types. The part of <string> that matches the <regexp> is replaced by the evaluated <replacement>, with $<number> in <replacement> expanded to the corresponding matching sub-expression of <regexp>, with $0 the entire - matched section. regedit() only replaces the first match. - regeditall() replaces all matches. The versions ending in i are - case insensitive. The <replacement> argument is evaluated once for - each match, allowing for more complex transformations than is - possible with straight replacement. + matched section. If you use named sub-expressions (?P<name>subexpr), they are + referred to with $<name> (Note that the <>'s are literal). + + regedit() only replaces the first match. regeditall() replaces all matches. + The versions ending in i are case insensitive. The <replacement> + argument is evaluated once for each match, allowing for more complex + transformations than is possible with straight replacement. Example: - > say regedit(this test is the best string, (.)est, $1rash) + > say regedit(this test is the best string, (?P<char>.)est, $<char>rash) You say "this trash is the best string" > say regeditall(this test is the best string, (.)est, [capstr($1)]rash) You say "this Trash is the Brash string" @@ -2915,17 +2917,20 @@ for an object named "Test", preferring a thing over other types. 
If <register list> is specified, there is a side-effect: any parenthesized substrings within the regular expression will be set - into the specified local registers, in the order they were specified - in the list. <register list> can be a list of one through nine numbers. - If the specified register is -1, the substring is not copied into a - register. Under regmatchi, case of the substring may be modified. - + into the specified local registers. The syntax for this is X:Y, where + X is the number (0 is the entire matched text) or name of the substring, + and Y is the q-register to save it in. If X: isn't given, the nth substring + based on the register's position in the list minus one is used. The first + element will have the complete matched text, the second the first substring, + and so on. This is to maintain compatibility with old code; it's recommended + for new uses that the X:Y syntax be used. + For example, in regmatch( cookies=30 , (.+)=(\[0-9\]*) ) (note use of escaping for MUSH parser), then the 0th substring matched is 'cookies=30', the 1st substring is 'cookies', and the 2nd - substring is '30'. If <register list> is '0 3 5', then %q0 will become + substring is '30'. If <register list> is '0:0 1:3 2:5', then %q0 will become "cookies=30", %q3 will become "cookies", and %q5 will become "30". - If <register list> was '0 -1 5', then the "cookies" substring would + If <register list> was '0:0 2:5', then the "cookies" substring would simply be discarded. See 'help regexp syntax' for an explanation of regular expressions. diff --git a/hdrs/externs.h b/hdrs/externs.h index 03770eb..50da19a 100644 --- a/hdrs/externs.h +++ b/hdrs/externs.h @@ -199,6 +199,7 @@ extern char ucbuff[]; #endif /* From cque.c */ +struct real_pcre; struct eval_context { char *wenv[10]; /**< working environment (%0-%9) */ char renv[NUMQ][BUFFER_LEN]; /**< working registers q0-q9,qa-qz */ @@ -210,6 +211,7 @@ struct eval_context { char ucom[BUFFER_LEN]; /**< evaluated command */ int break_called; /**< Has the break command been called? 
*/ char break_replace[BUFFER_LEN]; /**< What to replace the break with */ + struct real_pcre *re_code; /**< The compiled re */ int re_subpatterns; /**< The number of re subpatterns */ int *re_offsets; /**< The offsets for the subpatterns */ char *re_from; /**< The positions of the subpatterns */ diff --git a/src/funlist.c b/src/funlist.c index 9d14a7a..9ae1e5f 100644 --- a/src/funlist.c +++ b/src/funlist.c @@ -3043,6 +3043,7 @@ FUNCTION(fun_regreplace) char abuf[BUFFER_LEN], *abp; char prebuf[BUFFER_LEN], *prep; char postbuf[BUFFER_LEN], *postp; + pcre *old_re_code; int flags = 0, all = 0, match_offset = 0, len, funccount; int i; @@ -3128,6 +3129,7 @@ FUNCTION(fun_regreplace) /* Now copy in the replacement, putting in captured sub-expressions */ obp = args[i + 1]; + global_eval_context.re_code = re; global_eval_context.re_from = prebuf; global_eval_context.re_offsets = offsets; global_eval_context.re_subpatterns = subpatterns; @@ -3158,6 +3160,7 @@ FUNCTION(fun_regreplace) if (study) mush_free((Malloc_t) study, "pcre.extra"); + global_eval_context.re_code = old_re_code; global_eval_context.re_offsets = old_re_offsets; global_eval_context.re_subpatterns = old_re_subpatterns; global_eval_context.re_from = old_re_from; @@ -3177,11 +3180,7 @@ FUNCTION(fun_regmatch) * the results of a regexp pattern match into a set of r()-registers. * * regmatch(string, pattern, list of registers) - * If the number of matches exceeds the registers, those bits are tossed - * out. - * If -1 is specified as a register number, the matching bit is tossed. - * Therefore, if the list is "-1 0 3 5", the regexp $0 is tossed, and - * the regexp $1, $2, and $3 become r(0), r(3), and r(5), respectively. 
+ * Registers are by position (old way) or name:register (new way) * */ int i, nqregs, curq; @@ -3221,18 +3220,39 @@ FUNCTION(fun_regmatch) subpatterns = 33; nqregs = list2arr(qregs, NUMQ, args[2], ' '); for (i = 0; i < nqregs; i++) { - if (qregs[i] && qregs[i][0] && !qregs[i][1] && - ((qindex = qreg_indexes[(unsigned char) qregs[i][0]]) != -1)) + char *regname; + char *named_subpattern = NULL; + int subpattern = 0; + if ((regname = strchr(qregs[i], ':'))) { + /* subexpr:register */ + *regname++ = '\0'; + if (is_strict_integer(qregs[i])) + subpattern = parse_integer(qregs[i]); + else + named_subpattern = qregs[i]; + } else { + /* Get subexpr by position in list */ + subpattern = i; + regname = qregs[i]; + } + + if (regname && regname[0] && !regname[1] && + ((qindex = qreg_indexes[(unsigned char) regname[0]]) != -1)) curq = qindex; else curq = -1; if (curq < 0 || curq >= NUMQ) continue; + if (subpatterns < 0) global_eval_context.renv[curq][0] = '\0'; + else if (named_subpattern) + pcre_copy_named_substring(re, args[0], offsets, subpatterns, + named_subpattern, + global_eval_context.renv[curq], BUFFER_LEN); else - pcre_copy_substring(args[0], offsets, subpatterns, i, global_eval_context.renv[curq], - BUFFER_LEN); + pcre_copy_substring(args[0], offsets, subpatterns, subpattern, + global_eval_context.renv[curq], BUFFER_LEN); } mush_free((Malloc_t) re, "pcre"); } diff --git a/src/game.c b/src/game.c index f010f0d..fd79b95 100644 --- a/src/game.c +++ b/src/game.c @@ -710,6 +710,7 @@ do_restart(void) global_eval_context.rnxt[j] = NULL; /* Initialize the regexp patterns to nothing */ + global_eval_context.re_code = NULL; global_eval_context.re_subpatterns = -1; global_eval_context.re_offsets = NULL; global_eval_context.re_from = NULL; diff --git a/src/parse.c b/src/parse.c index 5f7c98a..e33ff73 100644 --- a/src/parse.c +++ b/src/parse.c @@ -656,7 +656,10 @@ process_expression(char *buff, char **bp, char const **str, global_eval_context.re_subpatterns >= 0) { char 
obuf[BUFFER_LEN]; int p = 0; + char subspace[BUFFER_LEN]; + char *named_substring = NULL; + obuf[0] = '\0'; (*str)++; /* Check the first two characters after the $ for a number */ if (isdigit((unsigned char) **str)) { @@ -672,24 +675,48 @@ process_expression(char *buff, char **bp, char const **str, safe_number(p, buff, bp); } } + /* Look for a named subexpression */ + } else if (**str == '<') { + char *nbuf = subspace; + (*str)++; + for (; *str && **str != '>'; (*str)++) + safe_chr(**str, subspace, &nbuf); + *nbuf = '\0'; + if (*str) + (*str)++; + if (is_strict_integer(subspace)) + p = abs(parse_integer(subspace)); + else + named_substring = subspace; } else { safe_chr('$', buff, bp); break; } - if (p >= global_eval_context.re_subpatterns || + if ((!named_substring && p >= global_eval_context.re_subpatterns) || global_eval_context.re_offsets == NULL || global_eval_context.re_from == NULL) { /* It's out of bounds, return */ safe_chr('$', buff, bp); - safe_number(p, buff, bp); + if (named_substring) + safe_format(buff, bp, "<%s>", named_substring); + else + safe_integer(p, buff, bp); break; } - pcre_copy_substring(global_eval_context.re_from, - global_eval_context.re_offsets, - global_eval_context.re_subpatterns, - p, obuf, BUFFER_LEN); + if (named_substring) { + pcre_copy_named_substring(global_eval_context.re_code, + global_eval_context.re_from, + global_eval_context.re_offsets, + global_eval_context.re_subpatterns, + named_substring, obuf, BUFFER_LEN); + } else { + pcre_copy_substring(global_eval_context.re_from, + global_eval_context.re_offsets, + global_eval_context.re_subpatterns, + p, obuf, BUFFER_LEN); + } safe_str(obuf, buff, bp); } else { safe_chr('$', buff, bp); -- 2.30.2