regmatch()/regedit() improved subpattern-to-%q syntax

author Ari Johnson <ari@cobramush.org>

Sat, 3 Mar 2007 00:19:58 +0000 (00:19 +0000)

committer Ari Johnson <ari@cobramush.org>

Sat, 3 Mar 2007 00:19:58 +0000 (00:19 +0000)
author Ari Johnson <ari@cobramush.org>
Sat, 3 Mar 2007 00:19:58 +0000 (00:19 +0000)
committer Ari Johnson <ari@cobramush.org>
Sat, 3 Mar 2007 00:19:58 +0000 (00:19 +0000)
diff --git a/game/txt/hlp/cobra_func.hlp b/game/txt/hlp/cobra_func.hlp

index af86e616f1ab65f692104b4c27ad9e73dd4f4ec0..2fcf9a4e3a4b84c4d02246d84eb19721ec42c8fa 100644 (file)
--- a/game/txt/hlp/cobra_func.hlp
+++ b/game/txt/hlp/cobra_func.hlp
@@ -2890,14 +2890,16 @@ for an object named "Test", preferring a thing over other types.
    The part of <string> that matches the <regexp> is replaced by the
    evaluated <replacement>, with $<number> in <replacement> expanded to the
    corresponding matching sub-expression of <regexp>, with $0 the entire
-  matched section. regedit() only replaces the first match.
-  regeditall() replaces all matches. The versions ending in i are
-  case insensitive. The <replacement> argument is evaluated once for
-  each match, allowing for more complex transformations than is
-  possible with straight replacement.
+  matched section. If you use named sub-expressions (?P<foo>subexpr), they are
+  referred to with $<foo> (note that the <> characters are literal).
+
+  regedit() only replaces the first match. regeditall() replaces all matches.
+  The versions ending in i are case insensitive. The <replacement>
+  argument is evaluated once for each match, allowing for more complex
+  transformations than is possible with straight replacement.
  
    Example:
-  > say regedit(this test is the best string, (.)est, $1rash)
+  > say regedit(this test is the best string, (?P<char>.)est, $<char>rash)
    You say "this trash is the best string"
    > say regeditall(this test is the best string, (.)est, [capstr($1)]rash)
    You say "this Trash is the Brash string"
@@ -2915,17 +2917,20 @@ for an object named "Test", preferring a thing over other types.
   
    If <register list> is specified, there is a side-effect: any
    parenthesized substrings within the regular expression will be set
-  into the specified local registers, in the order they were specified
-  in the list. <register list> can be a list of one through nine numbers.
-  If the specified register is -1, the substring is not copied into a
-  register. Under regmatchi, case of the substring may be modified.
- 
+  into the specified local registers. The syntax for this is X:Y, where
+  X is the number (0 is the entire matched text) or name of the substring,
+  and Y is the q-register to save it in. If X: isn't given, the substring is
+  chosen by the register's position in the list, counting from zero: the first
+  element will have the complete matched text, the second the first substring,
+  and so on. This is to maintain compatibility with old code; it's recommended
+  for new uses that the X:Y syntax be used.
+
    For example, in regmatch( cookies=30 , (.+)=(\[0-9\]*) )
    (note use of escaping for MUSH parser), then the 0th substring
    matched is 'cookies=30', the 1st substring is 'cookies', and the 2nd
-  substring is '30'. If <register list> is '0 3 5', then %q0 will become
+  substring is '30'. If <register list> is '0:0 1:3 2:5', then %q0 will become
    "cookies=30", %q3 will become "cookies", and %q5 will become "30".
-  If <register list> was '0 -1 5', then the "cookies" substring would
+  If <register list> was '0:0 2:5', then the "cookies" substring would
    simply be discarded.
   
    See 'help regexp syntax' for an explanation of regular expressions.
diff --git a/hdrs/externs.h b/hdrs/externs.h

index 03770ebd9078d913144433dbf4e8c2c4166ba2cf..50da19acfa992dd020337fa1abdb4ae64954c351 100644 (file)
--- a/hdrs/externs.h
+++ b/hdrs/externs.h
@@ -199,6 +199,7 @@ extern char ucbuff[];
  #endif
  
  /* From cque.c */
+struct real_pcre;
  struct eval_context {
    char *wenv[10];                 /**< working environment (%0-%9) */
    char renv[NUMQ][BUFFER_LEN];    /**< working registers q0-q9,qa-qz */
@@ -210,6 +211,7 @@ struct eval_context {
    char ucom[BUFFER_LEN];      /**< evaluated command */
    int break_called;           /**< Has the break command been called? */
    char break_replace[BUFFER_LEN];  /**< What to replace the break with */
+  struct real_pcre *re_code;             /**< The compiled re */
    int re_subpatterns;        /**< The number of re subpatterns */
    int *re_offsets;           /**< The offsets for the subpatterns */
    char *re_from;             /**< The positions of the subpatterns */
diff --git a/src/funlist.c b/src/funlist.c

index 9d14a7a834cbe3aa9ced8496e38355c2f30fb22d..9ae1e5f5f490515090ef128d8d254eb5110a383a 100644 (file)
--- a/src/funlist.c
+++ b/src/funlist.c
@@ -3043,6 +3043,7 @@ FUNCTION(fun_regreplace)
    char abuf[BUFFER_LEN], *abp;
    char prebuf[BUFFER_LEN], *prep;
    char postbuf[BUFFER_LEN], *postp;
+  pcre *old_re_code;
    int flags = 0, all = 0, match_offset = 0, len, funccount;
    int i;
  
@@ -3128,6 +3129,7 @@ FUNCTION(fun_regreplace)
  
        /* Now copy in the replacement, putting in captured sub-expressions */
        obp = args[i + 1];
+      global_eval_context.re_code = re;
        global_eval_context.re_from = prebuf;
        global_eval_context.re_offsets = offsets;
        global_eval_context.re_subpatterns = subpatterns;
@@ -3158,6 +3160,7 @@ FUNCTION(fun_regreplace)
      if (study)
        mush_free((Malloc_t) study, "pcre.extra");
  
+    global_eval_context.re_code = old_re_code;
      global_eval_context.re_offsets = old_re_offsets;
      global_eval_context.re_subpatterns = old_re_subpatterns;
      global_eval_context.re_from = old_re_from;
@@ -3177,11 +3180,7 @@ FUNCTION(fun_regmatch)
   * the results of a regexp pattern match into a set of r()-registers.
   *
   * regmatch(string, pattern, list of registers)
- * If the number of matches exceeds the registers, those bits are tossed
- * out.
- * If -1 is specified as a register number, the matching bit is tossed.
- * Therefore, if the list is "-1 0 3 5", the regexp $0 is tossed, and
- * the regexp $1, $2, and $3 become r(0), r(3), and r(5), respectively.
+ * Registers are by position (old way) or name:register (new way)
   *
   */
    int i, nqregs, curq;
@@ -3221,18 +3220,39 @@ FUNCTION(fun_regmatch)
      subpatterns = 33;
    nqregs = list2arr(qregs, NUMQ, args[2], ' ');
    for (i = 0; i < nqregs; i++) {
-    if (qregs[i] && qregs[i][0] && !qregs[i][1] &&
-       ((qindex = qreg_indexes[(unsigned char) qregs[i][0]]) != -1))
+    char *regname;
+    char *named_subpattern = NULL;
+    int subpattern = 0;
+    if ((regname = strchr(qregs[i], ':'))) {
+      /* subexpr:register */
+      *regname++ = '\0';
+      if (is_strict_integer(qregs[i]))
+       subpattern = parse_integer(qregs[i]);
+      else
+       named_subpattern = qregs[i];
+    } else {
+      /* Get subexpr by position in list */
+      subpattern = i;
+      regname = qregs[i];
+    }
+
+    if (regname && regname[0] && !regname[1] &&
+       ((qindex = qreg_indexes[(unsigned char) regname[0]]) != -1))
        curq = qindex;
      else
        curq = -1;
      if (curq < 0 || curq >= NUMQ)
        continue;
+
      if (subpatterns < 0)
        global_eval_context.renv[curq][0] = '\0';
+    else if (named_subpattern)
+      pcre_copy_named_substring(re, args[0], offsets, subpatterns,
+                               named_subpattern,
+                               global_eval_context.renv[curq], BUFFER_LEN);
      else
-      pcre_copy_substring(args[0], offsets, subpatterns, i, global_eval_context.renv[curq],
-                         BUFFER_LEN);
+      pcre_copy_substring(args[0], offsets, subpatterns, subpattern,
+                         global_eval_context.renv[curq], BUFFER_LEN);
    }
    mush_free((Malloc_t) re, "pcre");
  }
diff --git a/src/game.c b/src/game.c

index f010f0dc7761c8adb3a14bf68c7963e037edcd58..fd79b9583cc09a1ed87d81c81d479bed1bb32eee 100644 (file)
--- a/src/game.c
+++ b/src/game.c
@@ -710,6 +710,7 @@ do_restart(void)
      global_eval_context.rnxt[j] = NULL;
  
    /* Initialize the regexp patterns to nothing */
+  global_eval_context.re_code = NULL;
    global_eval_context.re_subpatterns = -1;
    global_eval_context.re_offsets = NULL;
    global_eval_context.re_from = NULL;
diff --git a/src/parse.c b/src/parse.c

index 5f7c98aab6d5fbb4fabf834db42261ee07cfa0f1..e33ff73f2e389cf91184c87ec6d24f6819dfb353 100644 (file)
--- a/src/parse.c
+++ b/src/parse.c
@@ -656,7 +656,10 @@ process_expression(char *buff, char **bp, char const **str,
           global_eval_context.re_subpatterns >= 0) {
         char obuf[BUFFER_LEN];
         int p = 0;
+        char subspace[BUFFER_LEN];
+        char *named_substring = NULL;
  
+        obuf[0] = '\0';
         (*str)++;
         /* Check the first two characters after the $ for a number */
         if (isdigit((unsigned char) **str)) {
@@ -672,24 +675,48 @@ process_expression(char *buff, char **bp, char const **str,
               safe_number(p, buff, bp);
             }
           }
+         /* Look for a named subexpression */
+       } else if (**str == '<') {
+         char *nbuf = subspace;
+         (*str)++;
+         for (; *str && **str != '>'; (*str)++)
+           safe_chr(**str, subspace, &nbuf);
+         *nbuf = '\0';
+         if (*str)
+           (*str)++;
+         if (is_strict_integer(subspace))
+           p = abs(parse_integer(subspace));
+         else
+           named_substring = subspace;
         } else {
           safe_chr('$', buff, bp);
           break;
         }
  
-       if (p >= global_eval_context.re_subpatterns ||
+       if ((!named_substring && p >= global_eval_context.re_subpatterns) ||
             global_eval_context.re_offsets == NULL ||
             global_eval_context.re_from == NULL) {
           /* It's out of bounds, return */
           safe_chr('$', buff, bp);
-         safe_number(p, buff, bp);
+         if (named_substring)
+           safe_format(buff, bp, "<%s>", named_substring);
+         else
+           safe_integer(p, buff, bp);
           break;
         }
  
-       pcre_copy_substring(global_eval_context.re_from,
-                           global_eval_context.re_offsets,
-                           global_eval_context.re_subpatterns,
-                           p, obuf, BUFFER_LEN);
+       if (named_substring) {
+         pcre_copy_named_substring(global_eval_context.re_code,
+                                   global_eval_context.re_from,
+                                   global_eval_context.re_offsets,
+                                   global_eval_context.re_subpatterns,
+                                   named_substring, obuf, BUFFER_LEN);
+       } else {
+         pcre_copy_substring(global_eval_context.re_from,
+                             global_eval_context.re_offsets,
+                             global_eval_context.re_subpatterns,
+                             p, obuf, BUFFER_LEN);
+       }
         safe_str(obuf, buff, bp);
        } else {
         safe_chr('$', buff, bp);
author	Ari Johnson <ari@cobramush.org>
	Sat, 3 Mar 2007 00:19:58 +0000 (00:19 +0000)
committer	Ari Johnson <ari@cobramush.org>
	Sat, 3 Mar 2007 00:19:58 +0000 (00:19 +0000)
game/txt/hlp/cobra_func.hlp		patch \| blob \| history
hdrs/externs.h		patch \| blob \| history
src/funlist.c		patch \| blob \| history
src/game.c		patch \| blob \| history
src/parse.c		patch \| blob \| history