From: Ari Johnson Date: Sat, 3 Mar 2007 00:19:50 +0000 (+0000) Subject: Updated pcre X-Git-Url: https://git.theari.com/?a=commitdiff_plain;h=e62bfefbb986bffda31b09e9d286ee48e3435ead;p=cobramush.git Updated pcre (cherry picked from commit 0e3f61a9d1920cf4d538f2aba3c1ec433998e84e) --- diff --git a/hdrs/pcre.h b/hdrs/pcre.h index 38e9e00..65f7a85 100644 --- a/hdrs/pcre.h +++ b/hdrs/pcre.h @@ -2,7 +2,39 @@ * Perl-Compatible Regular Expressions * *************************************************/ -/* Copyright (c) 1997-2003 University of Cambridge */ +/* In its original form, this is the .in file that is transformed by +"configure" into pcre.h. + + Copyright (c) 1997-2005 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ /* Modified a bit by Shawn Wagner for inclusion in PennMUSH. See pcre.c for details. */ @@ -10,9 +42,9 @@ #ifndef _PCRE_H #define _PCRE_H -#define PCRE_MAJOR 4 -#define PCRE_MINOR 2 -#define PCRE_DATE 14-Apr-2003 +#define PCRE_MAJOR 6 +#define PCRE_MINOR 4 +#define PCRE_DATE 05-Sep-2005 #ifndef PCRE_DATA_SCOPE # define PCRE_DATA_SCOPE extern @@ -31,32 +63,49 @@ extern "C" { /* Options */ -#define PCRE_CASELESS 0x0001 -#define PCRE_MULTILINE 0x0002 -#define PCRE_DOTALL 0x0004 -#define PCRE_EXTENDED 0x0008 -#define PCRE_ANCHORED 0x0010 -#define PCRE_DOLLAR_ENDONLY 0x0020 -#define PCRE_EXTRA 0x0040 -#define PCRE_NOTBOL 0x0080 -#define PCRE_NOTEOL 0x0100 -#define PCRE_UNGREEDY 0x0200 -#define PCRE_NOTEMPTY 0x0400 -#define PCRE_UTF8 0x0800 -#define PCRE_NO_AUTO_CAPTURE 0x1000 -#define PCRE_NO_UTF8_CHECK 0x2000 +#define PCRE_CASELESS 0x00000001 +#define PCRE_MULTILINE 0x00000002 +#define PCRE_DOTALL 0x00000004 +#define PCRE_EXTENDED 0x00000008 +#define PCRE_ANCHORED 0x00000010 +#define PCRE_DOLLAR_ENDONLY 0x00000020 +#define PCRE_EXTRA 0x00000040 +#define PCRE_NOTBOL 0x00000080 +#define PCRE_NOTEOL 0x00000100 +#define PCRE_UNGREEDY 0x00000200 +#define PCRE_NOTEMPTY 0x00000400 +#define PCRE_UTF8 0x00000800 +#define PCRE_NO_AUTO_CAPTURE 0x00001000 +#define PCRE_NO_UTF8_CHECK 0x00002000 +#define PCRE_AUTO_CALLOUT 0x00004000 +#define PCRE_PARTIAL 0x00008000 +#define PCRE_DFA_SHORTEST 0x00010000 +#define PCRE_DFA_RESTART 0x00020000 +#define PCRE_FIRSTLINE 0x00040000 + /* Exec-time and get/set-time error codes */ -#define PCRE_ERROR_NOMATCH (-1) -#define PCRE_ERROR_NULL (-2) -#define PCRE_ERROR_BADOPTION (-3) -#define PCRE_ERROR_BADMAGIC (-4) -#define PCRE_ERROR_UNKNOWN_NODE (-5) -#define PCRE_ERROR_NOMEMORY (-6) -#define PCRE_ERROR_NOSUBSTRING (-7) -#define PCRE_ERROR_MATCHLIMIT (-8) -#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */ +#define PCRE_ERROR_NOMATCH (-1) +#define PCRE_ERROR_NULL (-2) +#define PCRE_ERROR_BADOPTION (-3) +#define PCRE_ERROR_BADMAGIC (-4) +#define PCRE_ERROR_UNKNOWN_NODE (-5) +#define PCRE_ERROR_NOMEMORY (-6) +#define PCRE_ERROR_NOSUBSTRING (-7) +#define PCRE_ERROR_MATCHLIMIT (-8) +#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */ +#define PCRE_ERROR_BADUTF8 (-10) +#define PCRE_ERROR_BADUTF8_OFFSET (-11) +#define PCRE_ERROR_PARTIAL (-12) +#define PCRE_ERROR_BADPARTIAL (-13) +#define PCRE_ERROR_INTERNAL (-14) +#define PCRE_ERROR_BADCOUNT (-15) +#define PCRE_ERROR_DFA_UITEM (-16) +#define PCRE_ERROR_DFA_UCOND (-17) +#define PCRE_ERROR_DFA_UMLIMIT (-18) +#define PCRE_ERROR_DFA_WSSIZE (-19) +#define PCRE_ERROR_DFA_RECURSE (-20) /* Request types for pcre_fullinfo() */ @@ -72,6 +121,7 @@ extern "C" { #define PCRE_INFO_NAMECOUNT 8 #define PCRE_INFO_NAMETABLE 9 #define PCRE_INFO_STUDYSIZE 10 +#define PCRE_INFO_DEFAULT_TABLES 11 /* Request types for pcre_config() */ @@ -86,6 +136,7 @@ extern "C" { #define PCRE_EXTRA_STUDY_DATA 0x0001 #define PCRE_EXTRA_MATCH_LIMIT 0x0002 #define PCRE_EXTRA_CALLOUT_DATA 0x0004 +#define PCRE_EXTRA_TABLES 0x0008 /* Types */ @@ -100,6 +151,7 @@ such as way as to be extensible. */ void *study_data; /* Opaque data from pcre_study() */ unsigned long int match_limit; /* Maximum number of calls to match() */ void *callout_data; /* Data passed back in callouts */ + const unsigned char *tables; /* Pointer to character tables */ } pcre_extra; /* The structure for passing out data via the pcre_callout_function. We use a @@ -115,10 +167,13 @@ without modification. */ const char *subject; /* The subject being matched */ int subject_length; /* The length of the subject */ int start_match; /* Offset to start of this match attempt */ - int current_position; /* Where we currently are */ + int current_position; /* Where we currently are in the subject */ int capture_top; /* Max current capture */ int capture_last; /* Most recently closed capture */ void *callout_data; /* Data passed in with the call */ + /* ------------------- Added for Version 1 -------------------------- */ + int pattern_position; /* Offset to next item in the pattern */ + int next_item_length; /* Length of next item in the pattern */ /* ------------------------------------------------------------------ */ } pcre_callout_block; @@ -132,6 +187,14 @@ without modification. */ const char *, int, int, int, int *, int); extern const unsigned char *pcre_maketables(void); extern pcre_extra *pcre_study(const pcre *, int, const char **); + extern int pcre_fullinfo(const pcre * argument_re, + const pcre_extra * extra_data, int what, + void *where); + extern int pcre_get_stringnumber(const pcre * code, const char *stringname); + extern int + pcre_copy_named_substring(const pcre * code, const char *subject, + int *ovector, int stringcount, + const char *stringname, char *buffer, int size); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/pcre.c b/src/pcre.c index c5bd811..6d66d2e 100644 --- a/src/pcre.c +++ b/src/pcre.c @@ -3,32 +3,38 @@ *************************************************/ -/* This is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. See -the file Tech.Notes for some information on the internals. +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. -Written by: Philip Hazel - - Copyright (c) 1997-2003 University of Cambridge + Written by Philip Hazel + Copyright (c) 1997-2005 University of Cambridge ----------------------------------------------------------------------------- -Permission is granted to anyone to use this software for any purpose on any -computer system, and to redistribute it freely, subject to the following -restrictions: - -1. This software is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - -2. The origin of this software must not be misrepresented, either by - explicit claim or by omission. - -3. Altered versions must be plainly marked as such, and must not be - misrepresented as being the original software. - -4. If PCRE is embedded in any software that is released under the GNU - General Purpose Licence (GPL), then the terms of that licence shall - supersede any condition above with which it is incompatible. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ @@ -39,12 +45,14 @@ restrictions: * 'isblank' as a variable (reported to Philip Hazel for pcre 4.5) */ #include "config.h" -#include -#include #include -#include -#include +#include #include +#include +#include +#include +#include +#include #include "pcre.h" #include "confmagic.h" #undef min @@ -55,26 +63,54 @@ restrictions: #define LINK_SIZE 2 #define MATCH_LIMIT 100000 #define NO_RECURSE +#define EBCDIC 0 + +/* Bits of pcre_internal.h */ + -/* Bits of internal.h */ -/* This header contains definitions that are shared between the different -modules, but which are not relevant to the outside. */ #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */ -#define EXPORT -/* PCRE keeps offsets in its compiled code as 2-byte quantities by default. -These are used, for example, to link from the start of a subpattern to its -alternatives and its end. The use of 2 bytes per offset limits the size of the -compiled regex to around 64K, which is big enough for almost everybody. -However, I received a request for an even bigger limit. For this reason, and -also to make the code easier to maintain, the storing and loading of offsets -from the byte string is now handled by the macros that are defined here. +typedef unsigned char uschar; + +/* We need to have types that specify unsigned 16-bit and 32-bit integers. We +cannot determine these outside the compilation (e.g. by running a program as +part of "configure") because PCRE is often cross-compiled for use on other +systems. Instead we make use of the maximum sizes that are available at +preprocessor time in standard C environments. */ + +#if USHRT_MAX == 65535 +typedef unsigned short pcre_uint16; +#elif UINT_MAX == 65535 +typedef unsigned int pcre_uint16; +#else +#error Cannot determine a type for 16-bit unsigned integers +#endif + +#if UINT_MAX == 4294967295 +typedef unsigned int pcre_uint32; +#elif ULONG_MAX == 4294967295 +typedef unsigned long int pcre_uint32; +#else +#error Cannot determine a type for 32-bit unsigned integers +#endif + + +/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored +in big-endian order) by default. These are used, for example, to link from the +start of a subpattern to its alternatives and its end. The use of 2 bytes per +offset limits the size of the compiled regex to around 64K, which is big enough +for almost everybody. However, I received a request for an even bigger limit. +For this reason, and also to make the code easier to maintain, the storing and +loading of offsets from the byte string is now handled by the macros that are +defined here. The macros are controlled by the value of LINK_SIZE. This defaults to 2 in the config.h file, but can be overridden by using -D on the command line. This is automated on Unix systems via the "configure" command. */ +#define LINK_SIZE 2 + #define PUT(a,n,d) \ (a[n] = (d) >> 8), \ (a[(n)+1] = (d) & 255) @@ -103,6 +139,13 @@ capturing parenthesis numbers in back references. */ #define PUT2INC(a,n,d) PUT2(a,n,d), a += 2 +#define GETCHAR(c, eptr) c = *eptr; +#define GETCHARTEST(c, eptr) c = *eptr; +#define GETCHARINC(c, eptr) c = *eptr++; +#define GETCHARINCTEST(c, eptr) c = *eptr++; +#define GETCHARLEN(c, eptr, len) c = *eptr; +#define BACKCHAR(eptr) + /* These are the public options that can change during matching. */ #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) @@ -110,32 +153,38 @@ capturing parenthesis numbers in back references. */ /* Private options flags start at the most significant end of the four bytes, but skip the top bit so we can use ints for convenience without getting tangled with negative values. The public options defined in pcre.h start at the least -significant end. Make sure they don't overlap, though now that we have expanded -to four bytes there is plenty of space. */ +significant end. Make sure they don't overlap! */ #define PCRE_FIRSTSET 0x40000000 /* first_byte is set */ #define PCRE_REQCHSET 0x20000000 /* req_byte is set */ #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */ #define PCRE_ICHANGED 0x08000000 /* i option changes within regex */ +#define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */ /* Options for the "extra" block produced by pcre_study(). */ #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ -/* Masks for identifying the public options which are permitted at compile -time, run time or study time, respectively. */ +/* Masks for identifying the public options that are permitted at compile +time, run time, or study time, respectively. */ #define PUBLIC_OPTIONS \ (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ - PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK) + PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE) #define PUBLIC_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK) + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ + PCRE_PARTIAL) + +#define PUBLIC_DFA_EXEC_OPTIONS \ + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ + PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART) #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ -/* Magic number to provide a small check against being handed junk. */ +/* Magic number to provide a small check against being handed junk. Also used +to detect whether a pattern was compiled on a host of different endianness. */ #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ @@ -144,6 +193,11 @@ time, run time or study time, respectively. */ #define REQ_UNSET (-2) #define REQ_NONE (-1) +/* The maximum remaining length of subject we are prepared to search for a +req_byte match. */ + +#define REQ_BYTE_MAX 1000 + /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a variable-length repeat, or a anything other than literal characters. */ @@ -191,12 +245,13 @@ definitions below, up to ESC_z. There's a dummy for OP_ANY because it corresponds to "." rather than an escape sequence. The final one must be ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two tests in the code for an escape greater than ESC_b and less than ESC_Z to -detect the types that may be repeated. These are the types that consume a -character. If any new escapes are put in between that don't consume a +detect the types that may be repeated. These are the types that consume +characters. If any new escapes are put in between that don't consume a character, that code will have to change. */ enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, - ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF + ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E, + ESC_Q, ESC_REF }; /* Flag bits and data types for the extended class (OP_XCLASS) for classes that @@ -208,6 +263,8 @@ contain UTF-8 characters with values greater than 255. */ #define XCL_END 0 /* Marks end of individual items */ #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ #define XCL_RANGE 2 /* A range (two multibyte chars) follows */ +#define XCL_PROP 3 /* Unicode property (one property code) follows */ +#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets @@ -233,110 +290,123 @@ enum { OP_WORDCHAR, /* 10 \w */ OP_ANY, /* 11 Match any character */ OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */ - OP_EODN, /* 13 End of data or \n at end of data: \Z. */ - OP_EOD, /* 14 End of data: \z */ - - OP_OPT, /* 15 Set runtime options */ - OP_CIRC, /* 16 Start of line - varies with multiline switch */ - OP_DOLL, /* 17 End of line - varies with multiline switch */ - OP_CHARS, /* 18 Match string of characters */ - OP_NOT, /* 19 Match anything but the following char */ - - OP_STAR, /* 20 The maximizing and minimizing versions of */ - OP_MINSTAR, /* 21 all these opcodes must come in pairs, with */ - OP_PLUS, /* 22 the minimizing one second. */ - OP_MINPLUS, /* 23 This first set applies to single characters */ - OP_QUERY, /* 24 */ - OP_MINQUERY, /* 25 */ - OP_UPTO, /* 26 From 0 to n matches */ - OP_MINUPTO, /* 27 */ - OP_EXACT, /* 28 Exactly n matches */ - - OP_NOTSTAR, /* 29 The maximizing and minimizing versions of */ - OP_NOTMINSTAR, /* 30 all these opcodes must come in pairs, with */ - OP_NOTPLUS, /* 31 the minimizing one second. */ - OP_NOTMINPLUS, /* 32 This set applies to "not" single characters */ - OP_NOTQUERY, /* 33 */ - OP_NOTMINQUERY, /* 34 */ - OP_NOTUPTO, /* 35 From 0 to n matches */ - OP_NOTMINUPTO, /* 36 */ - OP_NOTEXACT, /* 37 Exactly n matches */ - - OP_TYPESTAR, /* 38 The maximizing and minimizing versions of */ - OP_TYPEMINSTAR, /* 39 all these opcodes must come in pairs, with */ - OP_TYPEPLUS, /* 40 the minimizing one second. These codes must */ - OP_TYPEMINPLUS, /* 41 be in exactly the same order as those above. */ - OP_TYPEQUERY, /* 42 This set applies to character types such as \d */ - OP_TYPEMINQUERY, /* 43 */ - OP_TYPEUPTO, /* 44 From 0 to n matches */ - OP_TYPEMINUPTO, /* 45 */ - OP_TYPEEXACT, /* 46 Exactly n matches */ - - OP_CRSTAR, /* 47 The maximizing and minimizing versions of */ - OP_CRMINSTAR, /* 48 all these opcodes must come in pairs, with */ - OP_CRPLUS, /* 49 the minimizing one second. These codes must */ - OP_CRMINPLUS, /* 50 be in exactly the same order as those above. */ - OP_CRQUERY, /* 51 These are for character classes and back refs */ - OP_CRMINQUERY, /* 52 */ - OP_CRRANGE, /* 53 These are different to the three seta above. */ - OP_CRMINRANGE, /* 54 */ - - OP_CLASS, /* 55 Match a character class, chars < 256 only */ - OP_NCLASS, /* 56 Same, but the bitmap was created from a negative + OP_NOTPROP, /* 13 \P (not Unicode property) */ + OP_PROP, /* 14 \p (Unicode property) */ + OP_EXTUNI, /* 15 \X (extended Unicode sequence */ + OP_EODN, /* 16 End of data or \n at end of data: \Z. */ + OP_EOD, /* 17 End of data: \z */ + + OP_OPT, /* 18 Set runtime options */ + OP_CIRC, /* 19 Start of line - varies with multiline switch */ + OP_DOLL, /* 20 End of line - varies with multiline switch */ + OP_CHAR, /* 21 Match one character, casefully */ + OP_CHARNC, /* 22 Match one character, caselessly */ + OP_NOT, /* 23 Match anything but the following char */ + + OP_STAR, /* 24 The maximizing and minimizing versions of */ + OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */ + OP_PLUS, /* 26 the minimizing one second. */ + OP_MINPLUS, /* 27 This first set applies to single characters */ + OP_QUERY, /* 28 */ + OP_MINQUERY, /* 29 */ + OP_UPTO, /* 30 From 0 to n matches */ + OP_MINUPTO, /* 31 */ + OP_EXACT, /* 32 Exactly n matches */ + + OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */ + OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */ + OP_NOTPLUS, /* 35 the minimizing one second. */ + OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */ + OP_NOTQUERY, /* 37 */ + OP_NOTMINQUERY, /* 38 */ + OP_NOTUPTO, /* 39 From 0 to n matches */ + OP_NOTMINUPTO, /* 40 */ + OP_NOTEXACT, /* 41 Exactly n matches */ + + OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */ + OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */ + OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */ + OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */ + OP_TYPEQUERY, /* 46 This set applies to character types such as \d */ + OP_TYPEMINQUERY, /* 47 */ + OP_TYPEUPTO, /* 48 From 0 to n matches */ + OP_TYPEMINUPTO, /* 49 */ + OP_TYPEEXACT, /* 50 Exactly n matches */ + + OP_CRSTAR, /* 51 The maximizing and minimizing versions of */ + OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */ + OP_CRPLUS, /* 53 the minimizing one second. These codes must */ + OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */ + OP_CRQUERY, /* 55 These are for character classes and back refs */ + OP_CRMINQUERY, /* 56 */ + OP_CRRANGE, /* 57 These are different to the three sets above. */ + OP_CRMINRANGE, /* 58 */ + + OP_CLASS, /* 59 Match a character class, chars < 256 only */ + OP_NCLASS, /* 60 Same, but the bitmap was created from a negative class - the difference is relevant only when a UTF-8 character > 255 is encountered. */ - OP_XCLASS, /* 57 Extended class for handling UTF-8 chars within the + OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the class. This does both positive and negative. */ - OP_REF, /* 58 Match a back reference */ - OP_RECURSE, /* 59 Match a numbered subpattern (possibly recursive) */ - OP_CALLOUT, /* 60 Call out to external function if provided */ + OP_REF, /* 62 Match a back reference */ + OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */ + OP_CALLOUT, /* 64 Call out to external function if provided */ - OP_ALT, /* 61 Start of alternation */ - OP_KET, /* 62 End of group that doesn't have an unbounded repeat */ - OP_KETRMAX, /* 63 These two must remain together and in this */ - OP_KETRMIN, /* 64 order. They are for groups the repeat for ever. */ + OP_ALT, /* 65 Start of alternation */ + OP_KET, /* 66 End of group that doesn't have an unbounded repeat */ + OP_KETRMAX, /* 67 These two must remain together and in this */ + OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */ /* The assertions must come before ONCE and COND */ - OP_ASSERT, /* 65 Positive lookahead */ - OP_ASSERT_NOT, /* 66 Negative lookahead */ - OP_ASSERTBACK, /* 67 Positive lookbehind */ - OP_ASSERTBACK_NOT, /* 68 Negative lookbehind */ - OP_REVERSE, /* 69 Move pointer back - used in lookbehind assertions */ + OP_ASSERT, /* 69 Positive lookahead */ + OP_ASSERT_NOT, /* 70 Negative lookahead */ + OP_ASSERTBACK, /* 71 Positive lookbehind */ + OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */ + OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */ /* ONCE and COND must come after the assertions, with ONCE first, as there's a test for >= ONCE for a subpattern that isn't an assertion. */ - OP_ONCE, /* 70 Once matched, don't back up into the subpattern */ - OP_COND, /* 71 Conditional group */ - OP_CREF, /* 72 Used to hold an extraction string number (cond ref) */ + OP_ONCE, /* 74 Once matched, don't back up into the subpattern */ + OP_COND, /* 75 Conditional group */ + OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */ - OP_BRAZERO, /* 73 These two must remain together and in this */ - OP_BRAMINZERO, /* 74 order. */ + OP_BRAZERO, /* 77 These two must remain together and in this */ + OP_BRAMINZERO, /* 78 order. */ - OP_BRANUMBER, /* 75 Used for extracting brackets whose number is greater + OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater than can fit into an opcode. */ - OP_BRA /* 76 This and greater values are used for brackets that - extract substrings up to a basic limit. After that, - use is made of OP_BRANUMBER. */ + OP_BRA /* 80 This and greater values are used for brackets that + extract substrings up to EXTRACT_BASIC_MAX. After + that, use is made of OP_BRANUMBER. */ }; -/* WARNING: There is an implicit assumption in study.c that all opcodes are -less than 128 in value. This makes handling UTF-8 character sequences easier. -*/ +/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and +study.c that all opcodes are less than 128 in value. This makes handling UTF-8 +character sequences easier. */ + +/* The highest extraction number before we have to start using additional +bytes. (Originally PCRE didn't have support for extraction counts highter than +this number.) The value is limited by the number of opcodes left after OP_BRA, +i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional +opcodes. */ +#define EXTRACT_BASIC_MAX 100 -/* This macro defines textual names for all the opcodes. There are used only -for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The -macro is referenced only in printint.c. */ + +/* This macro defines textual names for all the opcodes. These are used only +for debugging. The macro is referenced only in pcre_printint.c. */ #define OP_NAME_LIST \ "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \ - "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z", \ - "Opt", "^", "$", "chars", "not", \ + "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ + "notprop", "prop", "extuni", \ + "\\Z", "\\z", \ + "Opt", "^", "$", "char", "charnc", "not", \ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ @@ -350,7 +420,7 @@ macro is referenced only in printint.c. */ /* This macro defines the length of fixed length operations in the compiled regex. The lengths are used when searching for specific things, and also in the debugging printing of a compiled regex. We use a macro so that it can be -incorporated both into pcre.c and pcretest.c without being publicly exposed. +defined close to the definitions of the opcodes themselves. As things have been extended, some of these are no longer fixed lenths, but are minima instead. For example, the length of a single-character repeat may vary @@ -359,8 +429,11 @@ in UTF-8 mode. The code that uses this table must know about such things. */ #define OP_LENGTHS \ 1, /* End */ \ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \ - 1, 1, 1, 1, 2, 1, 1, /* Any, Anybyte, \Z, \z, Opt, ^, $ */ \ - 2, /* Chars - the minimum length */ \ + 1, 1, /* Any, Anybyte */ \ + 2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \ + 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ + 2, /* Char - the minimum length */ \ + 2, /* Charnc - the minimum length */ \ 2, /* not */ \ /* Positive single-char repeats ** These are */ \ 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ @@ -379,7 +452,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 0, /* XCLASS - variable length */ \ 3, /* REF */ \ 1+LINK_SIZE, /* RECURSE */ \ - 2, /* CALLOUT */ \ + 2+2*LINK_SIZE, /* CALLOUT */ \ 1+LINK_SIZE, /* Alt */ \ 1+LINK_SIZE, /* Ket */ \ 1+LINK_SIZE, /* KetRmax */ \ @@ -397,98 +470,62 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 1+LINK_SIZE /* BRA */ \ -/* The highest extraction number before we have to start using additional -bytes. (Originally PCRE didn't have support for extraction counts highter than -this number.) The value is limited by the number of opcodes left after OP_BRA, -i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional -opcodes. */ - -#define EXTRACT_BASIC_MAX 150 - /* A magic value for OP_CREF to indicate the "in recursion" condition. */ #define CREF_RECURSE 0xffff -/* The texts of compile-time error messages are defined as macros here so that -they can be accessed by the POSIX wrapper and converted into error codes. Yes, -I could have used error codes in the first place, but didn't feel like changing -just to accommodate the POSIX wrapper. */ - -#define ERR1 "\\ at end of pattern" -#define ERR2 "\\c at end of pattern" -#define ERR3 "unrecognized character follows \\" -#define ERR4 "numbers out of order in {} quantifier" -#define ERR5 "number too big in {} quantifier" -#define ERR6 "missing terminating ] for character class" -#define ERR7 "invalid escape sequence in character class" -#define ERR8 "range out of order in character class" -#define ERR9 "nothing to repeat" -#define ERR10 "operand of unlimited repeat could match the empty string" -#define ERR11 "internal error: unexpected repeat" -#define ERR12 "unrecognized character after (?" -#define ERR13 "POSIX named classes are supported only within a class" -#define ERR14 "missing )" -#define ERR15 "reference to non-existent subpattern" -#define ERR16 "erroffset passed as NULL" -#define ERR17 "unknown option bit(s) set" -#define ERR18 "missing ) after comment" -#define ERR19 "parentheses nested too deeply" -#define ERR20 "regular expression too large" -#define ERR21 "failed to get memory" -#define ERR22 "unmatched parentheses" -#define ERR23 "internal error: code overflow" -#define ERR24 "unrecognized character after (?<" -#define ERR25 "lookbehind assertion is not fixed length" -#define ERR26 "malformed number after (?(" -#define ERR27 "conditional group contains more than two branches" -#define ERR28 "assertion expected after (?(" -#define ERR29 "(?R or (?digits must be followed by )" -#define ERR30 "unknown POSIX class name" -#define ERR31 "POSIX collating elements are not supported" -#define ERR32 "this version of PCRE is not compiled with PCRE_UTF8 support" -#define ERR33 "spare error" -#define ERR34 "character value in \\x{...} sequence is too large" -#define ERR35 "invalid condition (?(0)" -#define ERR36 "\\C not allowed in lookbehind assertion" -#define ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X" -#define ERR38 "number after (?C is > 255" -#define ERR39 "closing ) for (?C expected" -#define ERR40 "recursive call could loop indefinitely" -#define ERR41 "unrecognized character after (?P" -#define ERR42 "syntax error after (?P" -#define ERR43 "two named groups have the same name" -#define ERR44 "invalid UTF-8 string" - -/* All character handling must be done as unsigned characters. Otherwise there -are problems with top-bit-set characters and functions such as isspace(). -However, we leave the interface to the outside world as char *, because that -should make things easier for callers. We define a short type for unsigned char -to save lots of typing. I tried "uchar", but it causes problems on Digital -Unix, where it is defined in sys/types, so use "uschar" instead. */ +/* Error code numbers. They are given names so that they can more easily be +tracked. */ -typedef unsigned char uschar; +enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, + ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, + ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, + ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, + ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 +}; /* The real format of the start of the pcre block; the index of names and the -code vector run on as long as necessary after the end. */ +code vector run on as long as necessary after the end. We store an explicit +offset to the name table so that if a regex is compiled on one host, saved, and +then run on another where the size of pointers is different, all might still +be well. For the case of compiled-on-4 and run-on-8, we include an extra +pointer that is always NULL. For future-proofing, a few dummy fields were +originally included - even though you can never get this planning right - but +there is only one left now. + +NOTE NOTE NOTE: +Because people can now save and re-use compiled patterns, any additions to this +structure should be made at the end, and something earlier (e.g. a new +flag in the options or one of the dummy fields) should indicate that the new +fields are present. Currently PCRE always sets the dummy fields to zero. +NOTE NOTE NOTE: +*/ typedef struct real_pcre { - unsigned long int magic_number; - size_t size; /* Total that was malloced */ - const unsigned char *tables; /* Pointer to tables */ - unsigned long int options; - unsigned short int top_bracket; - unsigned short int top_backref; - unsigned short int first_byte; - unsigned short int req_byte; - unsigned short int name_entry_size; /* Size of any name items; 0 => none */ - unsigned short int name_count; /* Number of name items */ + pcre_uint32 magic_number; + pcre_uint32 size; /* Total that was malloced */ + pcre_uint32 options; + pcre_uint32 dummy1; /* For future use, maybe */ + + pcre_uint16 top_bracket; + pcre_uint16 top_backref; + pcre_uint16 first_byte; + pcre_uint16 req_byte; + pcre_uint16 name_table_offset; /* Offset to name table that follows */ + pcre_uint16 name_entry_size; /* Size of any name items */ + pcre_uint16 name_count; /* Number of name items */ + pcre_uint16 ref_count; /* Reference count */ + + const unsigned char *tables; /* Pointer to tables or NULL for std */ + const unsigned char *nullpad; /* NULL padding */ } real_pcre; -/* The format of the block used to store data from pcre_study(). */ +/* The format of the block used to store data from pcre_study(). The same +remark (see NOTE above) about extending this structure applies. */ typedef struct pcre_study_data { - size_t size; /* Total that was malloced */ - uschar options; + pcre_uint32 size; /* Total that was malloced */ + pcre_uint32 options; uschar start_bits[32]; } pcre_study_data; @@ -501,12 +538,14 @@ typedef struct compile_data { const uschar *cbits; /* Points to character type table */ const uschar *ctypes; /* Points to table of type maps */ const uschar *start_code; /* The start of the compiled code */ + const uschar *start_pattern; /* The start of the pattern */ uschar *name_table; /* The name/number table */ int names_found; /* Number of entries so far */ int name_entry_size; /* Size of each entry */ int top_backref; /* Maximum back reference */ unsigned int backref_map; /* Bitmap of low back refs */ int req_varyopt; /* "After variable item" flag for reqbyte */ + BOOL nopartial; /* Set TRUE if partial won't work */ } compile_data; /* Structure for maintaining a chain of pointers to the currently incomplete @@ -540,7 +579,7 @@ NOTE: This isn't used for a "normal" compilation of pcre. */ struct heapframe; /* Structure for passing "static" information around between the functions -doing the matching, so that they are thread-safe. */ +doing traditional NFA matching, so that they are thread-safe. */ typedef struct match_data { unsigned long int match_call_count; /* As it says */ @@ -556,6 +595,8 @@ typedef struct match_data { BOOL utf8; /* UTF8 flag */ BOOL endonly; /* Dollar not before final \n */ BOOL notempty; /* Empty string match not wanted */ + BOOL partial; /* PARTIAL flag */ + BOOL hitend; /* Hit the end of the subject at some point */ const uschar *start_code; /* For use when recursing */ const uschar *start_subject; /* Start of the subject string */ const uschar *end_subject; /* End of the subject string */ @@ -569,6 +610,19 @@ typedef struct match_data { struct heapframe *thisframe; /* Used only when compiling for no recursion */ } match_data; +/* A similar structure is used for the same purpose by the DFA matching +functions. */ + +typedef struct dfa_match_data { + const uschar *start_code; /* Start of the compiled pattern */ + const uschar *start_subject; /* Start of the subject string */ + const uschar *end_subject; /* End of subject string */ + const uschar *tables; /* Character tables */ + int moptions; /* Match options */ + int poptions; /* Pattern options */ + void *callout_data; /* To pass back to callouts */ +} dfa_match_data; + /* Bit definitions for entries in the pcre_ctypes table. */ #define ctype_space 0x01 @@ -602,8 +656,19 @@ total length. */ #define ctypes_offset (cbits_offset + cbit_length) #define tables_length (ctypes_offset + 256) -/* End of internal.h */ -/* chartables.c */ +/* Layout of the UCP type table that translates property names into codes for +pcre_ucp_findchar(). */ + +typedef struct { + const char *name; + int value; +} ucp_type_table; + + +/* Bits of pcre_globals.c */ +int (*pcre_callout) (pcre_callout_block *) = NULL; + +/* pcre_chartables.c */ /************************************************* * Perl-Compatible Regular Expressions * *************************************************/ @@ -612,11 +677,11 @@ total length. */ program. If you edit it by hand, you might like to edit the Makefile to prevent its ever being regenerated. -This file is #included in the compilation of pcre.c to build the default -character tables which are used when no tables are passed to the compile -function. */ +This file contains the default tables for characters with codes less than +128 (ASCII characters). These tables are used when no external tables are +passed to PCRE. */ -static unsigned char pcre_default_tables[] = { +const unsigned char _pcre_default_tables[] = { /* This table is a lower casing table. */ @@ -788,10 +853,293 @@ print, punct, and cntrl. Other classes are built from combinations. */ }; /* 248-255 */ /* End of chartables.c */ -/* get.c */ + +/* Bits of pcre_tables.c */ + +/* This module contains some fixed tables that are used by more than one of the +PCRE code modules. The tables are also #included by the pcretest program, which +uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name +clashes with the library. */ + + +/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that +the definition is next to the definition of the opcodes in internal.h. */ + +const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; + + + + +/* End of pcre_tables.c */ + + +/* pcre_try_flipped.c */ + +/* This module contains an internal function that tests a compiled pattern to +see if it was compiled with the opposite endianness. If so, it uses an +auxiliary local function to flip the appropriate bytes. */ + + +/************************************************* +* Flip bytes in an integer * +*************************************************/ + +/* This function is called when the magic number in a regex doesn't match, in +order to flip its bytes to see if we are dealing with a pattern that was +compiled on a host of different endianness. If so, this function is used to +flip other byte values. + +Arguments: + value the number to flip + n the number of bytes to flip (assumed to be 2 or 4) + +Returns: the flipped value +*/ + +static long int +byteflip(long int value, int n) +{ + if (n == 2) + return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8); + return ((value & 0x000000ff) << 24) | + ((value & 0x0000ff00) << 8) | + ((value & 0x00ff0000) >> 8) | ((value & 0xff000000) >> 24); +} + + + +/************************************************* +* Test for a byte-flipped compiled regex * +*************************************************/ + +/* This function is called from pcre_exec(), pcre_dfa_exec(), and also from +pcre_fullinfo(). Its job is to test whether the regex is byte-flipped - that +is, it was compiled on a system of opposite endianness. The function is called +only when the native MAGIC_NUMBER test fails. If the regex is indeed flipped, +we flip all the relevant values into a different data block, and return it. + +Arguments: + re points to the regex + study points to study data, or NULL + internal_re points to a new regex block + internal_study points to a new study block + +Returns: the new block if is is indeed a byte-flipped regex + NULL if it is not +*/ + +real_pcre *_pcre_try_flipped(const real_pcre * re, real_pcre * internal_re, + const pcre_study_data * study, + pcre_study_data * internal_study); + +real_pcre * +_pcre_try_flipped(const real_pcre * re, real_pcre * internal_re, + const pcre_study_data * study, + pcre_study_data * internal_study) +{ + if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER) + return NULL; + + *internal_re = *re; /* To copy other fields */ + internal_re->size = byteflip(re->size, sizeof(re->size)); + internal_re->options = byteflip(re->options, sizeof(re->options)); + internal_re->top_bracket = + (pcre_uint16) byteflip(re->top_bracket, sizeof(re->top_bracket)); + internal_re->top_backref = + (pcre_uint16) byteflip(re->top_backref, sizeof(re->top_backref)); + internal_re->first_byte = + (pcre_uint16) byteflip(re->first_byte, sizeof(re->first_byte)); + internal_re->req_byte = + (pcre_uint16) byteflip(re->req_byte, sizeof(re->req_byte)); + internal_re->name_table_offset = + (pcre_uint16) byteflip(re->name_table_offset, + sizeof(re->name_table_offset)); + internal_re->name_entry_size = + (pcre_uint16) byteflip(re->name_entry_size, sizeof(re->name_entry_size)); + internal_re->name_count = + (pcre_uint16) byteflip(re->name_count, sizeof(re->name_count)); + + if (study != NULL) { + *internal_study = *study; /* To copy other fields */ + internal_study->size = byteflip(study->size, sizeof(study->size)); + internal_study->options = byteflip(study->options, sizeof(study->options)); + } + + return internal_re; +} + +/* End of pcre_tryflipped.c */ + + +/* pcre_fullinfo.c */ + +/* This module contains the external function pcre_fullinfo(), which returns +information about a compiled pattern. */ + + + +/************************************************* +* Return info about compiled pattern * +*************************************************/ + +/* This is a newer "info" function which has an extensible interface so +that additional items can be added compatibly. + +Arguments: + argument_re points to compiled code + extra_data points extra data, or NULL + what what information is required + where where to put the information + +Returns: 0 if data returned, negative on error +*/ + +int +pcre_fullinfo(const pcre * argument_re, const pcre_extra * extra_data, int what, + void *where) +{ + real_pcre internal_re; + pcre_study_data internal_study; + const real_pcre *re = (const real_pcre *) argument_re; + const pcre_study_data *study = NULL; + + if (re == NULL || where == NULL) + return PCRE_ERROR_NULL; + + if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0) + study = (const pcre_study_data *) extra_data->study_data; + + if (re->magic_number != MAGIC_NUMBER) { + re = _pcre_try_flipped(re, &internal_re, study, &internal_study); + if (re == NULL) + return PCRE_ERROR_BADMAGIC; + if (study != NULL) + study = &internal_study; + } + + switch (what) { + case PCRE_INFO_OPTIONS: + *((unsigned long int *) where) = re->options & PUBLIC_OPTIONS; + break; + + case PCRE_INFO_SIZE: + *((size_t *) where) = re->size; + break; + + case PCRE_INFO_STUDYSIZE: + *((size_t *) where) = (study == NULL) ? 0 : study->size; + break; + + case PCRE_INFO_CAPTURECOUNT: + *((int *) where) = re->top_bracket; + break; + + case PCRE_INFO_BACKREFMAX: + *((int *) where) = re->top_backref; + break; + + case PCRE_INFO_FIRSTBYTE: + *((int *) where) = + ((re->options & PCRE_FIRSTSET) != 0) ? re->first_byte : + ((re->options & PCRE_STARTLINE) != 0) ? -1 : -2; + break; + + /* Make sure we pass back the pointer to the bit vector in the external + block, not the internal copy (with flipped integer fields). */ + + case PCRE_INFO_FIRSTTABLE: + *((const uschar **) where) = + (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0) ? + ((const pcre_study_data *) extra_data->study_data)->start_bits : NULL; + break; + + case PCRE_INFO_LASTLITERAL: + *((int *) where) = ((re->options & PCRE_REQCHSET) != 0) ? re->req_byte : -1; + break; + + case PCRE_INFO_NAMEENTRYSIZE: + *((int *) where) = re->name_entry_size; + break; + + case PCRE_INFO_NAMECOUNT: + *((int *) where) = re->name_count; + break; + + case PCRE_INFO_NAMETABLE: + *((const uschar **) where) = (const uschar *) re + re->name_table_offset; + break; + + case PCRE_INFO_DEFAULT_TABLES: + *((const uschar **) where) = (const uschar *) (_pcre_default_tables); + break; + + default: + return PCRE_ERROR_BADOPTION; + } + + return 0; +} + +/* End of pcre_fullinfo.c */ + + +/* pcre_get.c */ + /* This module contains some convenience functions for extracting substrings from the subject string after a regex match has succeeded. The original idea -for these functions came from Scott Wimer . */ +for these functions came from Scott Wimer. */ + + +/************************************************* +* Find number for named string * +*************************************************/ + +/* This function is used by the two extraction functions below, as well +as being generally available. + +Arguments: + code the compiled regex + stringname the name whose number is required + +Returns: the number of the named parentheses, or a negative number + (PCRE_ERROR_NOSUBSTRING) if not found +*/ + +int +pcre_get_stringnumber(const pcre * code, const char *stringname) +{ + int rc; + int entrysize; + int top, bot; + uschar *nametable; + + if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0) + return rc; + if (top <= 0) + return PCRE_ERROR_NOSUBSTRING; + + if ((rc = + pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0) + return rc; + if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0) + return rc; + + bot = 0; + while (top > bot) { + int mid = (top + bot) / 2; + uschar *entry = nametable + entrysize * mid; + int c = strcmp(stringname, (char *) (entry + 2)); + if (c == 0) + return (entry[0] << 8) + entry[1]; + if (c > 0) + bot = mid + 1; + else + top = mid; + } + + return PCRE_ERROR_NOSUBSTRING; +} + /************************************************* @@ -839,8 +1187,251 @@ pcre_copy_substring(const char *subject, int *ovector, int stringcount, -/* End of get.c */ -/* maketables.c */ +/************************************************* +* Copy named captured string to given buffer * +*************************************************/ + +/* This function copies a single captured substring into a given buffer, +identifying it by name. + +Arguments: + code the compiled regex + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + stringname the name of the required substring + buffer where to put the substring + size the size of the buffer + +Returns: if successful: + the length of the copied string, not including the zero + that is put on the end; can be zero + if not successful: + PCRE_ERROR_NOMEMORY (-6) buffer too small + PCRE_ERROR_NOSUBSTRING (-7) no such captured substring +*/ + +int +pcre_copy_named_substring(const pcre * code, const char *subject, int *ovector, + int stringcount, const char *stringname, char *buffer, + int size) +{ + int n = pcre_get_stringnumber(code, stringname); + if (n <= 0) + return n; + return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size); +} + + + +/************************************************* +* Copy all captured strings to new store * +*************************************************/ + +/* This function gets one chunk of store and builds a list of pointers and all +of the captured substrings in it. A NULL pointer is put on the end of the list. + +Arguments: + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + listptr set to point to the list of pointers + +Returns: if successful: 0 + if not successful: + PCRE_ERROR_NOMEMORY (-6) failed to get store +*/ + +int + pcre_get_substring_list(const char *subject, int *ovector, int stringcount, + const char ***listptr); + +int +pcre_get_substring_list(const char *subject, int *ovector, int stringcount, + const char ***listptr) +{ + int i; + int size = sizeof(char *); + int double_count = stringcount * 2; + char **stringlist; + char *p; + + for (i = 0; i < double_count; i += 2) + size += sizeof(char *) + ovector[i + 1] - ovector[i] + 1; + + stringlist = (char **) malloc(size); + if (stringlist == NULL) + return PCRE_ERROR_NOMEMORY; + + *listptr = (const char **) stringlist; + p = (char *) (stringlist + stringcount + 1); + + for (i = 0; i < double_count; i += 2) { + int len = ovector[i + 1] - ovector[i]; + memcpy(p, subject + ovector[i], len); + *stringlist++ = p; + p += len; + *p++ = 0; + } + + *stringlist = NULL; + return 0; +} + + + +/************************************************* +* Free store obtained by get_substring_list * +*************************************************/ + +/* This function exists for the benefit of people calling PCRE from non-C +programs that can call its functions, but not free() or free() directly. + +Argument: the result of a previous pcre_get_substring_list() +Returns: nothing +*/ +void + pcre_free_substring_list(const char **pointer); + +void +pcre_free_substring_list(const char **pointer) +{ + free((void *) pointer); +} + + + +/************************************************* +* Copy captured string to new store * +*************************************************/ + +/* This function copies a single captured substring into a piece of new +store + +Arguments: + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + stringnumber the number of the required substring + stringptr where to put a pointer to the substring + +Returns: if successful: + the length of the string, not including the zero that + is put on the end; can be zero + if not successful: + PCRE_ERROR_NOMEMORY (-6) failed to get store + PCRE_ERROR_NOSUBSTRING (-7) substring not present +*/ + +int + pcre_get_substring(const char *subject, int *ovector, int stringcount, + int stringnumber, const char **stringptr); + +int +pcre_get_substring(const char *subject, int *ovector, int stringcount, + int stringnumber, const char **stringptr) +{ + int yield; + char *substring; + if (stringnumber < 0 || stringnumber >= stringcount) + return PCRE_ERROR_NOSUBSTRING; + stringnumber *= 2; + yield = ovector[stringnumber + 1] - ovector[stringnumber]; + substring = (char *) malloc(yield + 1); + if (substring == NULL) + return PCRE_ERROR_NOMEMORY; + memcpy(substring, subject + ovector[stringnumber], yield); + substring[yield] = 0; + *stringptr = substring; + return yield; +} + + + +/************************************************* +* Copy named captured string to new store * +*************************************************/ + +/* This function copies a single captured substring, identified by name, into +new store. + +Arguments: + code the compiled regex + subject the subject string that was matched + ovector pointer to the offsets table + stringcount the number of substrings that were captured + (i.e. the yield of the pcre_exec call, unless + that was zero, in which case it should be 1/3 + of the offset table size) + stringname the name of the required substring + stringptr where to put the pointer + +Returns: if successful: + the length of the copied string, not including the zero + that is put on the end; can be zero + if not successful: + PCRE_ERROR_NOMEMORY (-6) couldn't get memory + PCRE_ERROR_NOSUBSTRING (-7) no such captured substring +*/ + +int + pcre_get_named_substring(const pcre * code, const char *subject, int *ovector, + int stringcount, const char *stringname, + const char **stringptr); + +int +pcre_get_named_substring(const pcre * code, const char *subject, int *ovector, + int stringcount, const char *stringname, + const char **stringptr) +{ + int n = pcre_get_stringnumber(code, stringname); + if (n <= 0) + return n; + return pcre_get_substring(subject, ovector, stringcount, n, stringptr); +} + + + + +/************************************************* +* Free store obtained by get_substring * +*************************************************/ + +/* This function exists for the benefit of people calling PCRE from non-C +programs that can call its functions, but not free() or free() directly. + +Argument: the result of a previous pcre_get_substring() +Returns: nothing +*/ + +void + pcre_free_substring(const char *pointer); + +void +pcre_free_substring(const char *pointer) +{ + free((void *) pointer); +} + +/* End of pcre_get.c */ + +/* pcre_maketables.c */ + +/* This module contains the external function pcre_maketables(), which builds +character tables for PCRE in the current locale. The file is compiled on its +own as part of the PCRE library. However, it is also included in the +compilation of dftables.c, in which case the macro DFTABLES is defined. */ + + /************************************************* * Create PCRE character tables * *************************************************/ @@ -861,11 +1452,7 @@ pcre_maketables(void) unsigned char *yield, *p; int i; -#ifndef DFTABLES yield = (unsigned char *) malloc(tables_length); -#else - yield = (unsigned char *) malloc(tables_length); -#endif if (yield == NULL) return NULL; @@ -947,8 +1534,16 @@ within regexes. */ return yield; } -/* End of maketables.c */ -/* study.c */ +/* End of pcre_maketables.c */ + +/* pcre_study.c */ + + +/* This module contains the external function pcre_study(), along with local +supporting functions. */ + + + /************************************************* * Set a bit and maybe its alternate case * *************************************************/ @@ -966,7 +1561,7 @@ Returns: nothing */ static void -set_bit(uschar * start_bits, int c, BOOL caseless, compile_data * cd) +set_bit(uschar * start_bits, unsigned int c, BOOL caseless, compile_data * cd) { start_bits[c / 8] |= (1 << (c & 7)); if (caseless && (cd->ctypes[c] & ctype_letter) != 0) @@ -1029,7 +1624,7 @@ the pcre module can use all the optimization it can get). */ /* Skip over callout */ case OP_CALLOUT: - tcode += 2; + tcode += 2 + 2 * LINK_SIZE; break; /* Skip over extended extraction bracket number */ @@ -1090,11 +1685,10 @@ the pcre module can use all the optimization it can get). */ /* At least one single char sets the bit and stops */ case OP_EXACT: /* Fall through */ - tcode++; - - case OP_CHARS: /* Fall through */ - tcode++; + tcode += 2; + case OP_CHAR: + case OP_CHARNC: case OP_PLUS: case OP_MINPLUS: set_bit(start_bits, tcode[1], caseless, cd); @@ -1298,14 +1892,15 @@ Returns: pointer to a pcre_extra block, with study_data filled in and the NULL on error or if no optimization possible */ -EXPORT pcre_extra * +pcre_extra * pcre_study(const pcre * external_re, int options, const char **errorptr) { uschar start_bits[32]; pcre_extra *extra; pcre_study_data *study; + const uschar *tables; const real_pcre *re = (const real_pcre *) external_re; - uschar *code = (uschar *) re + sizeof(real_pcre) + + uschar *code = (uschar *) re + re->name_table_offset + (re->name_count * re->name_entry_size); compile_data compile_block; @@ -1328,12 +1923,17 @@ at present. */ if ((re->options & (PCRE_ANCHORED | PCRE_FIRSTSET | PCRE_STARTLINE)) != 0) return NULL; -/* Set the character tables in the block which is passed around */ +/* Set the character tables in the block that is passed around */ + + tables = re->tables; + if (tables == NULL) + (void) pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, + (void *) (&tables)); - compile_block.lcc = re->tables + lcc_offset; - compile_block.fcc = re->tables + fcc_offset; - compile_block.cbits = re->tables + cbits_offset; - compile_block.ctypes = re->tables + ctypes_offset; + compile_block.lcc = tables + lcc_offset; + compile_block.fcc = tables + fcc_offset; + compile_block.cbits = tables + cbits_offset; + compile_block.ctypes = tables + ctypes_offset; /* See if we can find a fixed set of initial characters for the pattern. */ @@ -1367,9 +1967,18 @@ don't have to change that code. */ return extra; } -/* End of study.c */ -/* pcre.c */ -#define DPRINTF(p) /*nothing */ +/* End of pcre_study.c */ + +/* pcre_compile.c */ + +/* This module contains the external function pcre_compile(), along with +supporting internal functions that are not used by other modules. */ + + + +/************************************************* +* Code parameters and static tables * +*************************************************/ /* Maximum number of items on the nested bracket stacks at compile time. This applies to the nesting of all kinds of parentheses. It does not limit @@ -1380,55 +1989,52 @@ compile time. */ #define BRASTACK_SIZE 200 -/* Maximum number of ints of offset to save on the stack for recursive calls. -If the offset vector is bigger, malloc is used. This should be a multiple of 3, -because the offset vector is always a multiple of 3 long. */ - -#define REC_STACK_SAVE_MAX 30 - - -/* The number of bytes in a literal character string above which we can't add -any more is set at 250 in order to allow for UTF-8 characters. (In theory it -could be 255 when UTF-8 support is excluded, but that means that some of the -test output would be different, which just complicates things.) */ - -#define MAXLIT 250 - - -/* The maximum remaining length of subject we are prepared to search for a -req_byte match. */ - -#define REQ_BYTE_MAX 1000 - - -/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that -the definition is next to the definition of the opcodes in internal.h. */ - -static const uschar OP_lengths[] = { OP_LENGTHS }; - -/* Min and max values for the common repeats; for the maxima, 0 => infinity */ - -static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; -static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; - /* Table for handling escaped characters in the range '0'-'z'. Positive returns are simple data values; negative values are for special things like \d and so on. Zero means further processing is needed (for things like \x), or the escape is invalid. */ +#if !EBCDIC /* This is the "normal" table for ASCII systems */ static const short int escapes[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ - 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ - 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ + -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ + -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ - 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ + -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ 0, 0, -ESC_z /* x - z */ }; +#else /* This is the "abnormal" table for EBCDIC systems */ +static const short int escapes[] = { +/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', +/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, +/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', +/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, +/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', +/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', +/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, +/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, +/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, +/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, +/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, +/* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0, +/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', +/* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, +/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, +/* D8 */ -ESC_Q, 0, 0, 0, 0, 0, 0, 0, +/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, +/* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0, +/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 +}; +#endif /* Tables of names of POSIX character classes and their lengths. The list is @@ -1466,6 +2072,71 @@ static const int posix_class_maps[] = { cbit_xdigit, -1, -1 /* xdigit */ }; + +/* The texts of compile-time error messages. These are "char *" because they +are passed to the outside world. */ + +static const char *error_texts[] = { + "no error", + "\\ at end of pattern", + "\\c at end of pattern", + "unrecognized character follows \\", + "numbers out of order in {} quantifier", + /* 5 */ + "number too big in {} quantifier", + "missing terminating ] for character class", + "invalid escape sequence in character class", + "range out of order in character class", + "nothing to repeat", + /* 10 */ + "operand of unlimited repeat could match the empty string", + "internal error: unexpected repeat", + "unrecognized character after (?", + "POSIX named classes are supported only within a class", + "missing )", + /* 15 */ + "reference to non-existent subpattern", + "erroffset passed as NULL", + "unknown option bit(s) set", + "missing ) after comment", + "parentheses nested too deeply", + /* 20 */ + "regular expression too large", + "failed to get memory", + "unmatched parentheses", + "internal error: code overflow", + "unrecognized character after (?<", + /* 25 */ + "lookbehind assertion is not fixed length", + "malformed number after (?(", + "conditional group contains more than two branches", + "assertion expected after (?(", + "(?R or (?digits must be followed by )", + /* 30 */ + "unknown POSIX class name", + "POSIX collating elements are not supported", + "this version of PCRE is not compiled with PCRE_UTF8 support", + "spare error", + "character value in \\x{...} sequence is too large", + /* 35 */ + "invalid condition (?(0)", + "\\C not allowed in lookbehind assertion", + "PCRE does not support \\L, \\l, \\N, \\U, or \\u", + "number after (?C is > 255", + "closing ) for (?C expected", + /* 40 */ + "recursive call could loop indefinitely", + "unrecognized character after (?P", + "syntax error after (?P", + "two named groups have the same name", + "invalid UTF-8 string", + /* 45 */ + "support for \\P, \\p, and \\X has not been compiled", + "malformed \\P or \\p sequence", + "unknown property name after \\P or \\p" +}; + + /* Table to identify digits and hex digits. This is used when compiling patterns. Note that the tables in chartables are dependent on the locale, and may mark arbitrary characters as digits - but the PCRE compiling code expects @@ -1482,6 +2153,7 @@ For convenience, we use the same bit definitions as in chartables: Then we can use ctype_digit and ctype_xdigit in the code. */ +#if !EBCDIC /* This is the "normal" case, for ASCII systems */ static const unsigned char digitab[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0- 7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8- 15 */ @@ -1517,78 +2189,85 @@ static const unsigned char digitab[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; /* 248-255 */ +#else /* This is the "abnormal" case, for EBCDIC systems */ +static const unsigned char digitab[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0- 7 0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8- 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 16- 23 10 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 24- 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 32- 39 20 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 40- 47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 48- 55 30 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 56- 63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - 71 40 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 72- | */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* & - 87 50 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 88- ¬ */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - -103 60 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 104- ? */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 112-119 70 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 120- " */ + 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, /* 128- g 80 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* h -143 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 144- p 90 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* q -159 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 160- x A0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* y -175 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* ^ -183 B0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 184-191 */ + 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, /* { - G C0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* H -207 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* } - P D0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Q -223 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* \ - X E0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Y -239 */ + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 0 - 7 F0 */ + 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; /* 8 -255 */ + +static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ + 0x80, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, /* 0- 7 */ + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, /* 8- 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, /* 16- 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 24- 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, /* 32- 39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 40- 47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 48- 55 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 56- 63 */ + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - 71 */ + 0x00, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x80, /* 72- | */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* & - 87 */ + 0x00, 0x00, 0x00, 0x80, 0x80, 0x80, 0x00, 0x00, /* 88- ¬ */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - -103 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x80, /* 104- ? */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 112-119 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 120- " */ + 0x00, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x12, /* 128- g */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* h -143 */ + 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* 144- p */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* q -159 */ + 0x00, 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* 160- x */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* y -175 */ + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* ^ -183 */ + 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 184-191 */ + 0x80, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x12, /* { - G */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* H -207 */ + 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* } - P */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Q -223 */ + 0x00, 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* \ - X */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Y -239 */ + 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, /* 0 - 7 */ + 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; /* 8 -255 */ +#endif /* Definition to allow mutual recursion */ static BOOL -compile_regex(int, int, int *, uschar **, const uschar **, const char **, - BOOL, int, int *, int *, branch_chain *, compile_data *); - -/* Structure for building a chain of data that actually lives on the -stack, for holding the values of the subject pointer at the start of each -subpattern, so as to detect when an empty string has been matched by a -subpattern - to break infinite loops. When NO_RECURSE is set, these blocks -are on the heap, not on the stack. */ - -typedef struct eptrblock { - struct eptrblock *epb_prev; - const uschar *epb_saved_eptr; -} eptrblock; - -/* Flag bits for the match() function */ - -#define match_condassert 0x01 /* Called to check a condition assertion */ -#define match_isgroup 0x02 /* Set if start of bracketed group */ - -/* Non-error returns from the match() function. Error returns are externally -defined PCRE_ERROR_xxx codes, which are all negative. */ - -#define MATCH_MATCH 1 -#define MATCH_NOMATCH 0 - - - -/************************************************* -* Global variables * -*************************************************/ - -/* PCRE is thread-clean and doesn't use any global variables in the normal -sense. However, it calls memory allocation and free functions via the four -indirections below, and it can optionally do callouts. These values can be -changed by the caller, but are shared between all threads. However, when -compiling for Virtual Pascal, things are done differently (see pcre.in). */ - -#ifndef VPCOMPAT -#ifdef __cplusplus -extern "C" void *(*pcre_malloc) (size_t) = malloc; -extern "C" void (*pcre_free) (void *) = free; -extern "C" void *(*pcre_stack_malloc) (size_t) = malloc; -extern "C" void (*pcre_stack_free) (void *) = free; -extern "C" int (*pcre_callout) (pcre_callout_block *) = NULL; -#else -void *(*pcre_malloc) (size_t) = malloc; -void (*pcre_free) (void *) = free; -void *(*pcre_stack_malloc) (size_t) = malloc; -void (*pcre_stack_free) (void *) = free; -int (*pcre_callout) (pcre_callout_block *) = NULL; -#endif -#endif - - -/************************************************* -* Macros and tables for character handling * -*************************************************/ +compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int, + int *, int *, branch_chain *, compile_data *); -/* When UTF-8 encoding is being used, a character is no longer just a single -byte. The macros for character handling generate simple sequences when used in -byte-mode, and more complicated ones for UTF-8 characters. */ - -#define GETCHAR(c, eptr) c = *eptr; -#define GETCHARINC(c, eptr) c = *eptr++; -#define GETCHARINCTEST(c, eptr) c = *eptr++; -#define GETCHARLEN(c, eptr, len) c = *eptr; -#define BACKCHAR(eptr) /************************************************* @@ -1602,19 +2281,19 @@ a positive value greater than 255 may be returned. On entry, ptr is pointing at the \. On exit, it is on the final character of the escape sequence. Arguments: - ptrptr points to the pattern position pointer - errorptr points to the pointer to the error message - bracount number of previous extracting brackets - options the options bits - isclass TRUE if inside a character class - -Returns: zero or positive => a data character - negative => a special escape sequence - on error, errorptr is set + ptrptr points to the pattern position pointer + errorcodeptr points to the errorcode variable + bracount number of previous extracting brackets + options the options bits + isclass TRUE if inside a character class + +Returns: zero or positive => a data character + negative => a special escape sequence + on error, errorptr is set */ static int -check_escape(const uschar ** ptrptr, const char **errorptr, int bracount, +check_escape(const uschar ** ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass) { const uschar *ptr = *ptrptr; @@ -1624,17 +2303,24 @@ check_escape(const uschar ** ptrptr, const char **errorptr, int bracount, c = *(++ptr); if (c == 0) - *errorptr = ERR1; + *errorcodeptr = ERR1; /* Non-alphamerics are literals. For digits or letters, do an initial lookup in a table. A non-zero result is something that can be returned immediately. Otherwise further processing may be required. */ +#if !EBCDIC /* ASCII coding */ else if (c < '0' || c > 'z') { } /* Not alphameric */ else if ((i = escapes[c - '0']) != 0) c = i; +#else /* EBCDIC coding */ + else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) { + } /* Not alphameric */ + else if ((i = escapes[c - 0x48]) != 0) + c = i; +#endif /* Escapes that need further processing, or are illegal. */ @@ -1647,12 +2333,9 @@ Otherwise further processing may be required. */ case 'l': case 'L': case 'N': - case 'p': - case 'P': case 'u': case 'U': - case 'X': - *errorptr = ERR37; + *errorcodeptr = ERR37; break; /* The handling of escape sequences consisting of a string of digits @@ -1720,9 +2403,15 @@ Otherwise further processing may be required. */ while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) { int cc; /* Some compilers don't like ++ */ cc = *(++ptr); /* in initializers */ +#if !EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10)); +#else /* EBCDIC coding */ + if (cc <= 'z') + cc += 64; /* Convert to upper case */ + c = c * 16 + cc - ((cc >= '0') ? '0' : ('A' - 10)); +#endif } break; @@ -1731,7 +2420,7 @@ Otherwise further processing may be required. */ case 'c': c = *(++ptr); if (c == 0) { - *errorptr = ERR2; + *errorcodeptr = ERR2; return 0; } @@ -1739,9 +2428,15 @@ Otherwise further processing may be required. */ is ASCII-specific, but then the whole concept of \cx is ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ +#if !EBCDIC /* ASCII coding */ if (c >= 'a' && c <= 'z') c -= 32; c ^= 0x40; +#else /* EBCDIC coding */ + if (c >= 'a' && c <= 'z') + c += 64; + c ^= 0xC0; +#endif break; /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any @@ -1754,7 +2449,7 @@ Otherwise further processing may be required. */ if ((options & PCRE_EXTRA) != 0) switch (c) { default: - *errorptr = ERR3; + *errorcodeptr = ERR3; break; } break; @@ -1767,6 +2462,103 @@ Otherwise further processing may be required. */ +#ifdef SUPPORT_UCP +/************************************************* +* Handle \P and \p * +*************************************************/ + +/* This function is called after \P or \p has been encountered, provided that +PCRE is compiled with support for Unicode properties. On entry, ptrptr is +pointing at the P or p. On exit, it is pointing at the final character of the +escape sequence. + +Argument: + ptrptr points to the pattern position pointer + negptr points to a boolean that is set TRUE for negation else FALSE + errorcodeptr points to the error code variable + +Returns: value from ucp_type_table, or -1 for an invalid type +*/ + +static int +get_ucp(const uschar ** ptrptr, BOOL * negptr, int *errorcodeptr) +{ + int c, i, bot, top; + const uschar *ptr = *ptrptr; + char name[4]; + + c = *(++ptr); + if (c == 0) + goto ERROR_RETURN; + + *negptr = FALSE; + +/* \P or \p can be followed by a one- or two-character name in {}, optionally +preceded by ^ for negation. */ + + if (c == '{') { + if (ptr[1] == '^') { + *negptr = TRUE; + ptr++; + } + for (i = 0; i <= 2; i++) { + c = *(++ptr); + if (c == 0) + goto ERROR_RETURN; + if (c == '}') + break; + name[i] = c; + } + if (c != '}') { /* Try to distinguish error cases */ + while (*(++ptr) != 0 && *ptr != '}') ; + if (*ptr == '}') + goto UNKNOWN_RETURN; + else + goto ERROR_RETURN; + } + name[i] = 0; + } + +/* Otherwise there is just one following character */ + + else { + name[0] = c; + name[1] = 0; + } + + *ptrptr = ptr; + +/* Search for a recognized property name using binary chop */ + + bot = 0; + top = _pcre_utt_size; + + while (bot < top) { + i = (bot + top) / 2; + c = strcmp(name, _pcre_utt[i].name); + if (c == 0) + return _pcre_utt[i].value; + if (c > 0) + bot = i + 1; + else + top = i; + } + +UNKNOWN_RETURN: + *errorcodeptr = ERR47; + *ptrptr = ptr; + return -1; + +ERROR_RETURN: + *errorcodeptr = ERR46; + *ptrptr = ptr; + return -1; +} +#endif + + + + /************************************************* * Check for counted repeat * *************************************************/ @@ -1816,25 +2608,34 @@ after is_counted_repeat() has confirmed that a repeat-count quantifier exists, so the syntax is guaranteed to be correct, but we need to check the values. Arguments: - p pointer to first char after '{' - minp pointer to int for min - maxp pointer to int for max - returned as -1 if no max - errorptr points to pointer to error message - -Returns: pointer to '}' on success; - current ptr on error, with errorptr set + p pointer to first char after '{' + minp pointer to int for min + maxp pointer to int for max + returned as -1 if no max + errorcodeptr points to error code variable + +Returns: pointer to '}' on success; + current ptr on error, with errorcodeptr set non-zero */ static const uschar * -read_repeat_counts(const uschar * p, int *minp, int *maxp, - const char **errorptr) +read_repeat_counts(const uschar * p, int *minp, int *maxp, int *errorcodeptr) { int min = 0; int max = -1; +/* Read the minimum value and do a paranoid check: a negative value indicates +an integer overflow. */ + while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; + if (min < 0 || min > 65535) { + *errorcodeptr = ERR5; + return p; + } + +/* Read the maximum value if there is one, and again do a paranoid on its size. +Also, max must not be less than min. */ if (*p == '}') max = min; @@ -1843,22 +2644,22 @@ read_repeat_counts(const uschar * p, int *minp, int *maxp, max = 0; while ((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; + if (max < 0 || max > 65535) { + *errorcodeptr = ERR5; + return p; + } if (max < min) { - *errorptr = ERR4; + *errorcodeptr = ERR4; return p; } } } -/* Do paranoid checks, then fill in the required variables, and pass back the -pointer to the terminating '}'. */ +/* Fill in the required variables, and pass back the pointer to the terminating +'}'. */ - if (min > 65535 || max > 65535) - *errorptr = ERR5; - else { - *minp = min; - *maxp = max; - } + *minp = min; + *maxp = max; return p; } @@ -1871,18 +2672,22 @@ pointer to the terminating '}'. */ /* This is called by several functions that scan a compiled expression looking for a fixed first character, or an anchoring op code etc. It skips over things that do not influence this. For some calls, a change of option is important. +For some calls, it makes sense to skip negative forward and all backward +assertions, and also the \b assertion; for others it does not. Arguments: - code pointer to the start of the group - options pointer to external options - optbit the option bit whose changing is significant, or - zero if none are + code pointer to the start of the group + options pointer to external options + optbit the option bit whose changing is significant, or + zero if none are + skipassert TRUE if certain assertions are to be skipped -Returns: pointer to the first significant opcode +Returns: pointer to the first significant opcode */ static const uschar * -first_significant_code(const uschar * code, int *options, int optbit) +first_significant_code(const uschar * code, int *options, int optbit, + BOOL skipassert) { for (;;) { switch ((int) *code) { @@ -1895,17 +2700,24 @@ first_significant_code(const uschar * code, int *options, int optbit) case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: + if (!skipassert) + return code; do code += GET(code, 1); while (*code == OP_ALT); + code += _pcre_OP_lengths[*code]; + break; + + case OP_WORD_BOUNDARY: + case OP_NOT_WORD_BOUNDARY: + if (!skipassert) + return code; /* Fall through */ case OP_CALLOUT: case OP_CREF: case OP_BRANUMBER: - case OP_WORD_BOUNDARY: - case OP_NOT_WORD_BOUNDARY: - code += OP_lengths[*code]; + code += _pcre_OP_lengths[*code]; break; default: @@ -2010,17 +2822,15 @@ branch, check the length against that of the other branches. */ case OP_DOLL: case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: - cc += OP_lengths[*cc]; + cc += _pcre_OP_lengths[*cc]; break; - /* Handle char strings. In UTF-8 mode we must count characters, not bytes. - This requires a scan of the string, unfortunately. We assume valid UTF-8 - strings, so all we do is reduce the length by one for every byte whose bits - are 10xxxxxx. */ + /* Handle literal characters */ - case OP_CHARS: - branchlength += *(++cc); - cc += *cc + 1; + case OP_CHAR: + case OP_CHARNC: + branchlength++; + cc += 2; break; /* Handle exact repetitions. The count is already in characters, but we @@ -2038,6 +2848,11 @@ branch, check the length against that of the other branches. */ /* Handle single-char matchers */ + case OP_PROP: + case OP_NOTPROP: + cc++; + /* Fall through */ + case OP_NOT_DIGIT: case OP_DIGIT: case OP_NOT_WHITESPACE: @@ -2117,17 +2932,15 @@ find_bracket(const uschar * code, BOOL utf8, int number) register int c = *code; if (c == OP_END) return NULL; - else if (c == OP_CHARS) - code += code[1] + OP_lengths[c]; else if (c > OP_BRA) { int n = c - OP_BRA; if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2 + LINK_SIZE); if (n == number) return (uschar *) code; - code += OP_lengths[OP_BRA]; + code += _pcre_OP_lengths[OP_BRA]; } else { - code += OP_lengths[c]; + code += _pcre_OP_lengths[c]; } } @@ -2160,12 +2973,10 @@ find_recurse(const uschar * code, BOOL utf8) return NULL; else if (c == OP_RECURSE) return code; - else if (c == OP_CHARS) - code += code[1] + OP_lengths[c]; else if (c > OP_BRA) { - code += OP_lengths[OP_BRA]; + code += _pcre_OP_lengths[OP_BRA]; } else { - code += OP_lengths[c]; + code += _pcre_OP_lengths[c]; } } @@ -2195,9 +3006,10 @@ static BOOL could_be_empty_branch(const uschar * code, const uschar * endcode, BOOL utf8) { register int c; - for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0); + for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE); code < endcode; - code = first_significant_code(code + OP_lengths[c], NULL, 0)) { + code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) + { const uschar *ccode; c = *code; @@ -2226,10 +3038,12 @@ could_be_empty_branch(const uschar * code, const uschar * endcode, BOOL utf8) switch (c) { /* Check for quantifiers after a class */ + case OP_CLASS: case OP_NCLASS: ccode = code + 33; + switch (*ccode) { case OP_CRSTAR: /* These could be empty; continue */ case OP_CRMINSTAR: @@ -2252,6 +3066,9 @@ could_be_empty_branch(const uschar * code, const uschar * endcode, BOOL utf8) /* Opcodes that must match a character */ + case OP_PROP: + case OP_NOTPROP: + case OP_EXTUNI: case OP_NOT_DIGIT: case OP_DIGIT: case OP_NOT_WHITESPACE: @@ -2260,7 +3077,8 @@ could_be_empty_branch(const uschar * code, const uschar * endcode, BOOL utf8) case OP_WORDCHAR: case OP_ANY: case OP_ANYBYTE: - case OP_CHARS: + case OP_CHAR: + case OP_CHARNC: case OP_NOT: case OP_PLUS: case OP_MINPLUS: @@ -2426,6 +3244,109 @@ adjust_recurse(uschar * group, int adjust, BOOL utf8, compile_data * cd) +/************************************************* +* Insert an automatic callout point * +*************************************************/ + +/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert +callout points before each pattern item. + +Arguments: + code current code pointer + ptr current pattern pointer + cd pointers to tables etc + +Returns: new code pointer +*/ + +static uschar * +auto_callout(uschar * code, const uschar * ptr, compile_data * cd) +{ + *code++ = OP_CALLOUT; + *code++ = 255; + PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ + PUT(code, LINK_SIZE, 0); /* Default length */ + return code + 2 * LINK_SIZE; +} + + + +/************************************************* +* Complete a callout item * +*************************************************/ + +/* A callout item contains the length of the next item in the pattern, which +we can't fill in till after we have reached the relevant point. This is used +for both automatic and manual callouts. + +Arguments: + previous_callout points to previous callout item + ptr current pattern pointer + cd pointers to tables etc + +Returns: nothing +*/ + +static void +complete_callout(uschar * previous_callout, const uschar * ptr, + compile_data * cd) +{ + int length = ptr - cd->start_pattern - GET(previous_callout, 2); + PUT(previous_callout, 2 + LINK_SIZE, length); +} + + + +#ifdef SUPPORT_UCP +/************************************************* +* Get othercase range * +*************************************************/ + +/* This function is passed the start and end of a class range, in UTF-8 mode +with UCP support. It searches up the characters, looking for internal ranges of +characters in the "other" case. Each call returns the next one, updating the +start address. + +Arguments: + cptr points to starting character value; updated + d end value + ocptr where to put start of othercase range + odptr where to put end of othercase range + +Yield: TRUE when range returned; FALSE when no more +*/ + +static BOOL +get_othercase_range(int *cptr, int d, int *ocptr, int *odptr) +{ + int c, chartype, othercase, next; + + for (c = *cptr; c <= d; c++) { + if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) + break; + } + + if (c > d) + return FALSE; + + *ocptr = othercase; + next = othercase + 1; + + for (++c; c <= d; c++) { + if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L || + othercase != next) + break; + next++; + } + + *odptr = next - 1; + *cptr = c; + + return TRUE; +} +#endif /* SUPPORT_UCP */ + + /************************************************* * Compile one branch * *************************************************/ @@ -2437,33 +3358,33 @@ bits. Arguments: optionsptr pointer to the option bits brackets points to number of extracting brackets used - code points to the pointer to the current code point + codeptr points to the pointer to the current code point ptrptr points to the current pattern pointer - errorptr points to pointer to error message + errorcodeptr points to error code variable firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) reqbyteptr set to the last literal character required, else < 0 bcptr points to current branch chain cd contains pointers to tables etc. Returns: TRUE on success - FALSE, with *errorptr set on error + FALSE, with *errorcodeptr set non-zero on error */ static BOOL compile_branch(int *optionsptr, int *brackets, uschar ** codeptr, - const uschar ** ptrptr, const char **errorptr, int *firstbyteptr, + const uschar ** ptrptr, int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain * bcptr, compile_data * cd) { int repeat_type, op_type; int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ int bravalue = 0; - int length; int greedy_default, greedy_non_default; int firstbyte, reqbyte; int zeroreqbyte, zerofirstbyte; int req_caseopt, reqvary, tempreqvary; int condcount = 0; int options = *optionsptr; + int after_manual_callout = 0; register int c; register uschar *code = *codeptr; uschar *tempcode; @@ -2472,7 +3393,8 @@ compile_branch(int *optionsptr, int *brackets, uschar ** codeptr, const uschar *ptr = *ptrptr; const uschar *tempptr; uschar *previous = NULL; - uschar class[32]; + uschar *previous_callout = NULL; + uschar classbits[32]; BOOL utf8 = FALSE; @@ -2481,7 +3403,7 @@ compile_branch(int *optionsptr, int *brackets, uschar ** codeptr, greedy_default = ((options & PCRE_UNGREEDY) != 0); greedy_non_default = greedy_default ^ 1; -/* Initialize no first char, no required char. REQ_UNSET means "no char +/* Initialize no first byte, no required byte. REQ_UNSET means "no char matching encountered yet". It gets changed to REQ_NONE if we hit something that matches a non-fixed char first char; reqbyte just remains unset if we never find one. @@ -2496,7 +3418,7 @@ item types that can be repeated set these backoff variables appropriately. */ /* The variable req_caseopt contains either the REQ_CASELESS value or zero, according to the current setting of the caseless flag. REQ_CASELESS is a bit value > 255. It is added into the firstbyte or reqbyte variables to record the -case status of the value. */ +case status of the value. This is used only for ASCII characters. */ req_caseopt = ((options & PCRE_CASELESS) != 0) ? REQ_CASELESS : 0; @@ -2505,6 +3427,7 @@ case status of the value. */ for (;; ptr++) { BOOL negate_class; BOOL possessive_quantifier; + BOOL is_quantifier; int class_charcount; int class_lastchar; int newoptions; @@ -2512,10 +3435,46 @@ case status of the value. */ int skipbytes; int subreqbyte; int subfirstbyte; + int mclength; + uschar mcbuffer[8]; + + /* Next byte in the pattern */ c = *ptr; - if (inescq && c != 0) - goto NORMAL_CHAR; + + /* If in \Q...\E, check for the end; if not, we have a literal */ + + if (inescq && c != 0) { + if (c == '\\' && ptr[1] == 'E') { + inescq = FALSE; + ptr++; + continue; + } else { + if (previous_callout != NULL) { + complete_callout(previous_callout, ptr, cd); + previous_callout = NULL; + } + if ((options & PCRE_AUTO_CALLOUT) != 0) { + previous_callout = code; + code = auto_callout(code, ptr, cd); + } + goto NORMAL_CHAR; + } + } + + /* Fill in length of a previous callout, except when the next thing is + a quantifier. */ + + is_quantifier = c == '*' || c == '+' || c == '?' || + (c == '{' && is_counted_repeat(ptr + 1)); + + if (!is_quantifier && previous_callout != NULL && + after_manual_callout-- <= 0) { + complete_callout(previous_callout, ptr, cd); + previous_callout = NULL; + } + + /* In extended mode, skip white space and comments */ if ((options & PCRE_EXTENDED) != 0) { if ((cd->ctypes[c] & ctype_space) != 0) @@ -2529,6 +3488,13 @@ case status of the value. */ } } + /* No auto callout for quantifiers. */ + + if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) { + previous_callout = code; + code = auto_callout(code, ptr, cd); + } + switch (c) { /* The branch terminates at end of string, |, or ). */ @@ -2590,7 +3556,7 @@ case status of the value. */ if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && check_posix_syntax(ptr, &tempptr, cd)) { - *errorptr = (ptr[1] == ':') ? ERR13 : ERR31; + *errorcodeptr = (ptr[1] == ':') ? ERR13 : ERR31; goto FAILED; } @@ -2616,7 +3582,7 @@ case status of the value. */ character (< 256), because in that case the compiled code doesn't use the bit map. */ - memset(class, 0, 32 * sizeof(uschar)); + memset(classbits, 0, 32 * sizeof(uschar)); /* Process characters until ] is reached. By writing this as a "do" it means that an initial ] is taken as a data character. The first pass @@ -2651,7 +3617,7 @@ case status of the value. */ register const uschar *cbits = cd->cbits; if (ptr[1] != ':') { - *errorptr = ERR31; + *errorcodeptr = ERR31; goto FAILED; } @@ -2663,7 +3629,7 @@ case status of the value. */ posix_class = check_posix_name(ptr, tempptr - ptr); if (posix_class < 0) { - *errorptr = ERR30; + *errorcodeptr = ERR30; goto FAILED; } @@ -2686,15 +3652,19 @@ case status of the value. */ if (taboffset < 0) break; if (local_negate) { - for (c = 0; c < 32; c++) - class[c] |= ~cbits[c + taboffset]; + if (i == 0) + for (c = 0; c < 32; c++) + classbits[c] |= ~cbits[c + taboffset]; + else + for (c = 0; c < 32; c++) + classbits[c] &= ~cbits[c + taboffset]; if (blankclass) - class[1] |= 0x3c; + classbits[1] |= 0x3c; } else { for (c = 0; c < 32; c++) - class[c] |= cbits[c + taboffset]; + classbits[c] |= cbits[c + taboffset]; if (blankclass) - class[1] &= ~0x3c; + classbits[1] &= ~0x3c; } } @@ -2712,11 +3682,13 @@ case status of the value. */ character in them, so set class_charcount bigger than one. */ if (c == '\\') { - c = check_escape(&ptr, errorptr, *brackets, options, TRUE); + c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE); + if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */ - - if (-c == ESC_Q) { /* Handle start of quoted string */ + else if (-c == ESC_X) + c = 'X'; /* \X is literal X in a class */ + else if (-c == ESC_Q) { /* Handle start of quoted string */ if (ptr[1] == '\\' && ptr[2] == 'E') { ptr += 2; /* avoid empty string */ } else @@ -2724,41 +3696,58 @@ case status of the value. */ continue; } - else if (c < 0) { + if (c < 0) { register const uschar *cbits = cd->cbits; - class_charcount = 10; /* Greater than 1 is what matters */ + class_charcount += 2; /* Greater than 1 is what matters */ switch (-c) { case ESC_d: for (c = 0; c < 32; c++) - class[c] |= cbits[c + cbit_digit]; + classbits[c] |= cbits[c + cbit_digit]; continue; case ESC_D: for (c = 0; c < 32; c++) - class[c] |= ~cbits[c + cbit_digit]; + classbits[c] |= ~cbits[c + cbit_digit]; continue; case ESC_w: for (c = 0; c < 32; c++) - class[c] |= cbits[c + cbit_word]; + classbits[c] |= cbits[c + cbit_word]; continue; case ESC_W: for (c = 0; c < 32; c++) - class[c] |= ~cbits[c + cbit_word]; + classbits[c] |= ~cbits[c + cbit_word]; continue; case ESC_s: for (c = 0; c < 32; c++) - class[c] |= cbits[c + cbit_space]; - class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ + classbits[c] |= cbits[c + cbit_space]; + classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ continue; case ESC_S: for (c = 0; c < 32; c++) - class[c] |= ~cbits[c + cbit_space]; - class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ + classbits[c] |= ~cbits[c + cbit_space]; + classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ + continue; + +#ifdef SUPPORT_UCP + case ESC_p: + case ESC_P: + { + BOOL negated; + int property = get_ucp(&ptr, &negated, errorcodeptr); + if (property < 0) + goto FAILED; + class_utf8 = TRUE; + *class_utf8data++ = ((-c == ESC_p) != negated) ? + XCL_PROP : XCL_NOTPROP; + *class_utf8data++ = property; + class_charcount -= 2; /* Not a < 256 character */ + } continue; +#endif /* Unrecognized escapes are faulted if PCRE is running in its strict mode. By default, for compatibility with Perl, they are @@ -2766,10 +3755,11 @@ case status of the value. */ default: if ((options & PCRE_EXTRA) != 0) { - *errorptr = ERR7; + *errorcodeptr = ERR7; goto FAILED; } c = *ptr; /* The final character */ + class_charcount -= 2; /* Undo the default count from above */ } } @@ -2787,7 +3777,7 @@ case status of the value. */ int d; ptr += 2; - d = *ptr; + d = *ptr; /* Not UTF-8 mode */ /* The second part of a range can be a single-character escape, but not any of the other escapes. Perl 5.6 treats a hyphen as a literal @@ -2795,13 +3785,16 @@ case status of the value. */ if (d == '\\') { const uschar *oldptr = ptr; - d = check_escape(&ptr, errorptr, *brackets, options, TRUE); + d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE); - /* \b is backslash; any other special means the '-' was literal */ + /* \b is backslash; \X is literal X; any other special means the '-' + was literal */ if (d < 0) { if (d == -ESC_b) d = '\b'; + else if (d == -ESC_X) + d = 'X'; else { ptr = oldptr - 2; goto LONE_SINGLE_CHARACTER; /* A few lines below */ @@ -2809,27 +3802,27 @@ case status of the value. */ } } - /* Check that the two values are in the correct order */ + /* The check that the two values are in the correct order happens in + the pre-pass. Optimize one-character ranges */ - if (d < c) { - *errorptr = ERR8; - goto FAILED; - } + if (d == c) + goto LONE_SINGLE_CHARACTER; /* A few lines below */ + + /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless + matching, we have to use an XCLASS with extra data items. Caseless + matching for characters > 127 is available only if UCP support is + available. */ - /* If d is greater than 255, we can't just use the bit map, so set up - for the UTF-8 supporting class type. If we are not caseless, we can - just set up a single range. If we are caseless, the characters < 256 - are handled with a bitmap, in order to get the case-insensitive - handling. */ - /* We use the bit map if the range is entirely < 255, or if part of it - is < 255 and matching is caseless. */ + /* We use the bit map for all cases when not in UTF-8 mode; else + ranges that lie entirely within 0-127 when there is UCP support; else + for partial ranges without UCP support. */ for (; c <= d; c++) { - class[c / 8] |= (1 << (c & 7)); + classbits[c / 8] |= (1 << (c & 7)); if ((options & PCRE_CASELESS) != 0) { int uc = cd->fcc[c]; /* flip case */ - class[uc / 8] |= (1 << (uc & 7)); + classbits[uc / 8] |= (1 << (uc & 7)); } class_charcount++; /* in case a one-char range */ class_lastchar = c; @@ -2839,18 +3832,20 @@ case status of the value. */ } /* Handle a lone single character - we can get here for a normal - non-escape char, or after \ that introduces a single character. */ + non-escape char, or after \ that introduces a single character or for an + apparent range that isn't. */ LONE_SINGLE_CHARACTER: - /* Handle a multibyte character */ + /* Handle a character that cannot go in the bit map */ + /* Handle a single-byte character */ { - class[c / 8] |= (1 << (c & 7)); + classbits[c / 8] |= (1 << (c & 7)); if ((options & PCRE_CASELESS) != 0) { c = cd->fcc[c]; /* flip case */ - class[c / 8] |= (1 << (c & 7)); + classbits[c / 8] |= (1 << (c & 7)); } class_charcount++; class_lastchar = c; @@ -2862,12 +3857,15 @@ case status of the value. */ while ((c = *(++ptr)) != ']' || inescq); - /* If class_charcount is 1, we saw precisely one character with a value < - 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and - the one character is < 128. In non-UTF-8 mode we can always optimize. + /* If class_charcount is 1, we saw precisely one character whose value is + less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we + can optimize the negative case only if there were no characters >= 128 + because OP_NOT and the related opcodes like OP_NOTSTAR operate on + single-bytes only. This is an historical hangover. Maybe one day we can + tidy these opcodes to handle multi-byte characters. The optimization throws away the bit map. We turn the item into a - 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note + 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note that OP_NOT does not support multibyte characters. In the positive case, it can cause firstbyte to be set. Otherwise, there can be no first char if this item is first, whatever repeat count may follow. In the case of @@ -2875,31 +3873,34 @@ case status of the value. */ if (class_charcount == 1) { zeroreqbyte = reqbyte; + + /* The OP_NOT opcode works on one-byte characters only. */ + if (negate_class) { if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; - *code++ = OP_NOT; - } else { - if (firstbyte == REQ_UNSET) { - zerofirstbyte = REQ_NONE; - firstbyte = class_lastchar | req_caseopt; - } else { - zerofirstbyte = firstbyte; - reqbyte = class_lastchar | req_caseopt | cd->req_varyopt; - } - *code++ = OP_CHARS; - *code++ = 1; + *code++ = OP_NOT; + *code++ = class_lastchar; + break; + } + + /* For a single, positive character, get the value into mcbuffer, and + then we can handle this with the normal one-character code. */ + + { + mcbuffer[0] = class_lastchar; + mclength = 1; } - *code++ = class_lastchar; - break; /* End of class handling */ + goto ONE_CHAR; } - /* End of 1-byte optimization */ - /* Otherwise, if this is the first thing in the branch, there can be no - first char setting, whatever the repeat count. Any reqbyte setting must - remain unchanged after any kind of repeat. */ + /* End of 1-char optimization */ + /* The general case - not the one-char optimization. If this is the first + thing in the branch, there can be no first char setting, whatever the + repeat count. Any reqbyte setting must remain unchanged after any kind of + repeat. */ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; @@ -2918,21 +3919,22 @@ case status of the value. */ if (negate_class) { *code++ = OP_NCLASS; for (c = 0; c < 32; c++) - code[c] = ~class[c]; + code[c] = ~classbits[c]; } else { *code++ = OP_CLASS; - memcpy(code, class, 32); + memcpy(code, classbits, 32); } code += 32; break; - /* Various kinds of repeat */ + /* Various kinds of repeat; '{' is not necessarily a quantifier, but this + has been tested above. */ case '{': - if (!is_counted_repeat(ptr + 1)) + if (!is_quantifier) goto NORMAL_CHAR; - ptr = read_repeat_counts(ptr + 1, &repeat_min, &repeat_max, errorptr); - if (*errorptr != NULL) + ptr = read_repeat_counts(ptr + 1, &repeat_min, &repeat_max, errorcodeptr); + if (*errorcodeptr != 0) goto FAILED; goto REPEAT; @@ -2952,7 +3954,7 @@ case status of the value. */ REPEAT: if (previous == NULL) { - *errorptr = ERR9; + *errorcodeptr = ERR9; goto FAILED; } @@ -3002,14 +4004,13 @@ case status of the value. */ code += 1 + LINK_SIZE; } - /* If previous was a string of characters, chop off the last one and use it - as the subject of the repeat. If there was only one character, we can - abolish the previous item altogether. If a one-char item has a minumum of - more than one, ensure that it is set in reqbyte - it might not be if a - sequence such as x{3} is the first thing in a branch because the x will - have gone into firstbyte instead. */ + /* If previous was a character match, abolish the item and generate a + repeat item instead. If a char item has a minumum of more than one, ensure + that it is set in reqbyte - it might not be if a sequence such as x{3} is + the first thing in a branch because the x will have gone into firstbyte + instead. */ - if (*previous == OP_CHARS) { + if (*previous == OP_CHAR || *previous == OP_CHARNC) { /* Deal with UTF-8 characters that take up more than one byte. It's easier to write this out separately than try to macrify it. Use c to hold the length of the character in bytes, plus 0x80 to flag that it's a @@ -3020,15 +4021,9 @@ case status of the value. */ with UTF-8 disabled, or for a UTF-8 character < 128. */ { - c = *(--code); - if (code == previous + 2) { /* There was only one character */ - code = previous; /* Abolish the previous item */ - if (repeat_min > 1) - reqbyte = c | req_caseopt | cd->req_varyopt; - } else { - previous[1]--; /* adjust length */ - tempcode = code; /* Adjust position to be moved for '+' */ - } + c = code[-1]; + if (repeat_min > 1) + reqbyte = c | req_caseopt | cd->req_varyopt; } goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ @@ -3042,20 +4037,28 @@ case status of the value. */ else if (*previous == OP_NOT) { op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ c = previous[1]; - code = previous; goto OUTPUT_SINGLE_REPEAT; } /* If previous was a character type match (\d or similar), abolish it and create a suitable repeat item. The code is shared with single-character - repeats by setting op_type to add a suitable offset into repeat_type. */ + repeats by setting op_type to add a suitable offset into repeat_type. Note + the the Unicode property types will be present only when SUPPORT_UCP is + defined, but we don't wrap the little bits of code here because it just + makes it horribly messy. */ else if (*previous < OP_EODN) { + uschar *oldcode; + int prop_type; op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ c = *previous; - code = previous; OUTPUT_SINGLE_REPEAT: + prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP) ? + previous[1] : -1; + + oldcode = code; + code = previous; /* Usually overwrite previous item */ /* If the maximum is zero then the minimum must also be zero; Perl allows this case, so we do too - by simply omitting the item altogether. */ @@ -3063,6 +4066,12 @@ case status of the value. */ if (repeat_max == 0) goto END_REPEAT; + /* All real repeats make it impossible to handle partial matching (maybe + one day we will be able to remove this restriction). */ + + if (repeat_max != 1) + cd->nopartial = TRUE; + /* Combine the op_type with the repeat_type */ repeat_type += op_type; @@ -3081,50 +4090,42 @@ case status of the value. */ } } - /* The case {1,} is handled as the special case + */ + /* A repeat minimum of 1 is optimized into some special cases. If the + maximum is unlimited, we use OP_PLUS. Otherwise, the original item it + left in place and, if the maximum is greater than 1, we use OP_UPTO with + one less than the maximum. */ - else if (repeat_min == 1 && repeat_max == -1) - *code++ = OP_PLUS + repeat_type; + else if (repeat_min == 1) { + if (repeat_max == -1) + *code++ = OP_PLUS + repeat_type; + else { + code = oldcode; /* leave previous item in place */ + if (repeat_max == 1) + goto END_REPEAT; + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max - 1); + } + } /* The case {n,n} is just an EXACT, while the general case {n,m} is - handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */ + handled as an EXACT followed by an UPTO. */ else { - if (repeat_min != 1) { - *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ - PUT2INC(code, 0, repeat_min); - } - - /* If the mininum is 1 and the previous item was a character string, - we either have to put back the item that got cancelled if the string - length was 1, or add the character back onto the end of a longer - string. For a character type nothing need be done; it will just get - put back naturally. Note that the final character is always going to - get added below, so we leave code ready for its insertion. */ - - else if (*previous == OP_CHARS) { - if (code == previous) - code += 2; - else - /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80 - bit set as a flag. The length will always be between 2 and 6. */ - - previous[1]++; - } - - /* For a single negated character we also have to put back the - item that got cancelled. At present this applies only to single byte - characters in any mode. */ - - else if (*previous == OP_NOT) - code++; + *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ + PUT2INC(code, 0, repeat_min); /* If the maximum is unlimited, insert an OP_STAR. Before doing so, - we have to insert the character for the previous code. In UTF-8 mode, - long characters have their length in c, with the 0x80 bit as a flag. */ + we have to insert the character for the previous code. For a repeated + Unicode property match, there is an extra byte that defines the + required property. In UTF-8 mode, long characters have their length in + c, with the 0x80 bit as a flag. */ if (repeat_max < 0) { - *code++ = c; + { + *code++ = c; + if (prop_type >= 0) + *code++ = prop_type; + } *code++ = OP_STAR + repeat_type; } @@ -3133,6 +4134,8 @@ case status of the value. */ else if (repeat_max != repeat_min) { *code++ = c; + if (prop_type >= 0) + *code++ = prop_type; repeat_max -= repeat_min; *code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max); @@ -3141,8 +4144,15 @@ case status of the value. */ /* The character or character type itself comes last in all cases. */ - *code++ = c; + + /* For a repeated Unicode property match, there is an extra byte that + defines the required property. */ + +#ifdef SUPPORT_UCP + if (prop_type >= 0) + *code++ = prop_type; +#endif } /* If previous was a character class or a back reference, we put the repeat @@ -3154,6 +4164,13 @@ case status of the value. */ code = previous; goto END_REPEAT; } + + /* All real repeats make it impossible to handle partial matching (maybe + one day we will be able to remove this restriction). */ + + if (repeat_max != 1) + cd->nopartial = TRUE; + if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; else if (repeat_min == 1 && repeat_max == -1) @@ -3321,7 +4338,7 @@ case status of the value. */ /* Else there's some kind of shambles */ else { - *errorptr = ERR11; + *errorcodeptr = ERR11; goto FAILED; } @@ -3401,7 +4418,7 @@ case status of the value. */ while (*(++ptr) != ')') condref = condref * 10 + *ptr - '0'; if (condref == 0) { - *errorptr = ERR35; + *errorcodeptr = ERR35; goto FAILED; } ptr++; @@ -3442,17 +4459,22 @@ case status of the value. */ ptr++; break; - case 'C': /* Callout - may be followed by digits */ - *code++ = OP_CALLOUT; - { + case 'C': /* Callout - may be followed by digits; */ + previous_callout = code; /* Save for later completion */ + after_manual_callout = 1; /* Skip one item before completing */ + *code++ = OP_CALLOUT; /* Already checked that the terminating */ + { /* closing parenthesis is present. */ int n = 0; while ((digitab[*(++ptr)] & ctype_digit) != 0) n = n * 10 + *ptr - '0'; if (n > 255) { - *errorptr = ERR38; + *errorcodeptr = ERR38; goto FAILED; } *code++ = n; + PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */ + PUT(code, LINK_SIZE, 0); /* Default length */ + code += 2 * LINK_SIZE; } previous = NULL; continue; @@ -3471,7 +4493,7 @@ case status of the value. */ int crc = memcmp(name, slot + 2, namelen); if (crc == 0) { if (slot[2 + namelen] == 0) { - *errorptr = ERR43; + *errorcodeptr = ERR43; goto FAILED; } crc = -1; /* Current name is substring */ @@ -3507,7 +4529,7 @@ case status of the value. */ slot += cd->name_entry_size; } if (i >= cd->names_found) { - *errorptr = ERR15; + *errorcodeptr = ERR15; goto FAILED; } @@ -3566,7 +4588,7 @@ case status of the value. */ cd->start_code : find_bracket(cd->start_code, utf8, recno); if (called == NULL) { - *errorptr = ERR15; + *errorcodeptr = ERR15; goto FAILED; } @@ -3576,7 +4598,7 @@ case status of the value. */ if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8)) { - *errorptr = ERR40; + *errorcodeptr = ERR40; goto FAILED; } @@ -3702,7 +4724,7 @@ case status of the value. */ brackets, /* Extracting bracket count */ &tempcode, /* Where to put code (updated) */ &ptr, /* Input pointer (updated) */ - errorptr, /* Where to put an error message */ + errorcodeptr, /* Where to put an error message */ (bravalue == OP_ASSERTBACK || bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ skipbytes, /* Skip over OP_COND/OP_BRANUMBER */ &subfirstbyte, /* For possible first char */ @@ -3730,7 +4752,7 @@ case status of the value. */ while (*tc != OP_KET); if (condcount > 2) { - *errorptr = ERR27; + *errorcodeptr = ERR27; goto FAILED; } @@ -3799,7 +4821,7 @@ case status of the value. */ /* Error if hit end of pattern */ if (*ptr != ')') { - *errorptr = ERR14; + *errorcodeptr = ERR14; goto FAILED; } break; @@ -3810,7 +4832,7 @@ case status of the value. */ case '\\': tempptr = ptr; - c = check_escape(&ptr, errorptr, *brackets, options, FALSE); + c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE); /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values are arranged to be the negation of the corresponding OP_values. For the @@ -3846,127 +4868,93 @@ case status of the value. */ previous = code; *code++ = OP_REF; PUT2INC(code, 0, number); - } else { - previous = (-c > ESC_b && -c < ESC_Z) ? code : NULL; - *code++ = -c; } - continue; - } - - /* Data character: reset and fall through */ - ptr = tempptr; - c = '\\'; - - /* Handle a run of data characters until a metacharacter is encountered. - The first character is guaranteed not to be whitespace or # when the - extended flag is set. */ - - NORMAL_CHAR: - default: - previous = code; - *code = OP_CHARS; - code += 2; - length = 0; - - do { - /* If in \Q...\E, check for the end; if not, we always have a literal */ + /* So are Unicode property matches, if supported. We know that get_ucp + won't fail because it was tested in the pre-pass. */ - if (inescq) { - if (c == '\\' && ptr[1] == 'E') { - inescq = FALSE; - ptr++; - } else { - *code++ = c; - length++; - } - continue; +#ifdef SUPPORT_UCP + else if (-c == ESC_P || -c == ESC_p) { + BOOL negated; + int value = get_ucp(&ptr, &negated, errorcodeptr); + previous = code; + *code++ = ((-c == ESC_p) != negated) ? OP_PROP : OP_NOTPROP; + *code++ = value; } +#endif - /* Skip white space and comments for /x patterns */ + /* For the rest, we can obtain the OP value by negating the escape + value */ - if ((options & PCRE_EXTENDED) != 0) { - if ((cd->ctypes[c] & ctype_space) != 0) - continue; - if (c == '#') { - /* The space before the ; is to avoid a warning on a silly compiler - on the Macintosh. */ - while ((c = *(++ptr)) != 0 && c != NEWLINE) ; - if (c == 0) - break; - continue; - } + else { + previous = (-c > ESC_b && -c < ESC_Z) ? code : NULL; + *code++ = -c; } + continue; + } - /* Backslash may introduce a data char or a metacharacter. Escaped items - are checked for validity in the pre-compiling pass. Stop the string - before a metaitem. */ + /* We have a data character whose value is in c. In UTF-8 mode it may have + a value > 127. We set its representation in the length/buffer, and then + handle it as a data character. */ - if (c == '\\') { - tempptr = ptr; - c = check_escape(&ptr, errorptr, *brackets, options, FALSE); - if (c < 0) { - ptr = tempptr; - break; - } - /* If a character is > 127 in UTF-8 mode, we have to turn it into - two or more bytes in the UTF-8 encoding. */ + { + mcbuffer[0] = c; + mclength = 1; + } - } + goto ONE_CHAR; - /* Ordinary character or single-char escape */ + /* Handle a literal character. It is guaranteed not to be whitespace or # + when the extended flag is set. If we are in UTF-8 mode, it may be a + multi-byte literal character. */ - *code++ = c; - length++; - } + default: + NORMAL_CHAR: + mclength = 1; + mcbuffer[0] = c; - /* This "while" is the end of the "do" above. */ - while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); + /* At this point we have the character's bytes in mcbuffer, and the length + in mclength. When not in UTF-8 mode, the length is always 1. */ - /* Update the first and last requirements. These are always bytes, even in - UTF-8 mode. However, there is a special case to be considered when there - are only one or two characters. Because this gets messy in UTF-8 mode, the - code is kept separate. When we get here "length" contains the number of - bytes. */ + ONE_CHAR: + previous = code; + *code++ = ((options & PCRE_CASELESS) != 0) ? OP_CHARNC : OP_CHAR; + for (c = 0; c < mclength; c++) + *code++ = mcbuffer[c]; + /* Set the first and required bytes appropriately. If no previous first + byte, set it from this character, but revert to none on a zero repeat. + Otherwise, leave the firstbyte value alone, and don't change it on a zero + repeat. */ - /* This is the code for non-UTF-8 operation, either without UTF-8 support, - or when UTF-8 is not enabled. */ + if (firstbyte == REQ_UNSET) { + zerofirstbyte = REQ_NONE; + zeroreqbyte = reqbyte; - { - /* firstbyte was not previously set; take it from this string */ + /* If the character is more than one byte long, we can set firstbyte + only if it is not to be matched caselessly. */ - if (firstbyte == REQ_UNSET) { - if (length == 1) { - zerofirstbyte = REQ_NONE; - firstbyte = previous[2] | req_caseopt; - zeroreqbyte = reqbyte; - } else { - zerofirstbyte = firstbyte = previous[2] | req_caseopt; - zeroreqbyte = (length > 2) ? - (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte; - reqbyte = code[-1] | req_caseopt | cd->req_varyopt; - } - } + if (mclength == 1 || req_caseopt == 0) { + firstbyte = mcbuffer[0] | req_caseopt; + if (mclength != 1) + reqbyte = code[-1] | cd->req_varyopt; + } else + firstbyte = reqbyte = REQ_NONE; + } - /* firstbyte was previously set */ + /* firstbyte was previously set; we can set reqbyte only the length is + 1 or the matching is caseful. */ - else { - zerofirstbyte = firstbyte; - zeroreqbyte = (length == 1) ? reqbyte : - code[-2] | req_caseopt | cd->req_varyopt; + else { + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + if (mclength == 1 || req_caseopt == 0) reqbyte = code[-1] | req_caseopt | cd->req_varyopt; - } } - /* Set the length in the data vector, and advance to the next state. */ - - previous[1] = length; - if (length < MAXLIT) - ptr--; - break; + break; /* End of literal character handling */ } } /* end of big loop */ @@ -4000,7 +4988,7 @@ Argument: brackets -> int containing the number of extracting brackets used codeptr -> the address of the current code pointer ptrptr -> the address of the current pattern pointer - errorptr -> pointer to error message + errorcodeptr -> pointer to error code variable lookbehind TRUE if this is a lookbehind assertion skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER) firstbyteptr place to put the first required character, or a negative number @@ -4013,7 +5001,7 @@ Returns: TRUE on success static BOOL compile_regex(int options, int oldims, int *brackets, uschar ** codeptr, - const uschar ** ptrptr, const char **errorptr, BOOL lookbehind, + const uschar ** ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr, int *reqbyteptr, branch_chain * bcptr, compile_data * cd) { @@ -4056,7 +5044,7 @@ compile_regex(int options, int oldims, int *brackets, uschar ** codeptr, /* Now compile the branch */ - if (!compile_branch(&options, brackets, &code, &ptr, errorptr, + if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr, &branchfirstbyte, &branchreqbyte, &bc, cd)) { *ptrptr = ptr; return FALSE; @@ -4108,9 +5096,8 @@ compile_regex(int options, int oldims, int *brackets, uschar ** codeptr, int length; *code = OP_END; length = find_fixedlength(last_branch, options); - DPRINTF(("fixed length = %d\n", length)); if (length < 0) { - *errorptr = (length == -2) ? ERR36 : ERR25; + *errorcodeptr = (length == -2) ? ERR36 : ERR25; *ptrptr = ptr; return FALSE; } @@ -4219,7 +5206,8 @@ is_anchored(register const uschar * code, int *options, { do { const uschar *scode = - first_significant_code(code + 1 + LINK_SIZE, options, PCRE_MULTILINE); + first_significant_code(code + 1 + LINK_SIZE, options, PCRE_MULTILINE, + FALSE); register int op = *scode; /* Capturing brackets */ @@ -4289,7 +5277,8 @@ is_startline(const uschar * code, unsigned int bracket_map, unsigned int backref_map) { do { - const uschar *scode = first_significant_code(code + 1 + LINK_SIZE, NULL, 0); + const uschar *scode = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, + FALSE); register int op = *scode; /* Capturing brackets */ @@ -4311,7 +5300,7 @@ is_startline(const uschar * code, unsigned int bracket_map, return FALSE; } - /* .* is not anchored unless DOTALL is set and it isn't in brackets that + /* .* means "start at start or after \n" if it isn't in brackets that may be referenced. */ else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) { @@ -4323,6 +5312,9 @@ is_startline(const uschar * code, unsigned int bracket_map, else if (op != OP_CIRC) return FALSE; + + /* Move on to the next alternative */ + code += GET(code, 1); } while (*code == OP_ALT); /* Loop for each alternative */ @@ -4358,7 +5350,8 @@ find_firstassertedchar(const uschar * code, int *options, BOOL inassert) do { int d; const uschar *scode = - first_significant_code(code + 1 + LINK_SIZE, options, PCRE_CASELESS); + first_significant_code(code + 1 + LINK_SIZE, options, PCRE_CASELESS, + TRUE); register int op = *scode; if (op >= OP_BRA) @@ -4381,11 +5374,10 @@ find_firstassertedchar(const uschar * code, int *options, BOOL inassert) break; case OP_EXACT: /* Fall through */ - scode++; - - case OP_CHARS: /* Fall through */ - scode++; + scode += 2; + case OP_CHAR: + case OP_CHARNC: case OP_PLUS: case OP_MINPLUS: if (!inassert) @@ -4406,9 +5398,8 @@ find_firstassertedchar(const uschar * code, int *options, BOOL inassert) } - - - +pcre *pcre_compile2(const char *, int, int *, const char **, + int *, const unsigned char *); /************************************************* @@ -4416,26 +5407,38 @@ find_firstassertedchar(const uschar * code, int *options, BOOL inassert) *************************************************/ /* This function takes a string and returns a pointer to a block of store -holding a compiled version of the expression. +holding a compiled version of the expression. The original API for this +function had no error code return variable; it is retained for backwards +compatibility. The new function is given a new name. Arguments: - pattern the regular expression - options various option bits - errorptr pointer to pointer to error text - erroroffset ptr offset in pattern where error was detected - tables pointer to character tables or NULL - -Returns: pointer to compiled data block, or NULL on error, - with errorptr and erroroffset set + pattern the regular expression + options various option bits + errorcodeptr pointer to error code variable (pcre_compile2() only) + can be NULL if you don't want a code value + errorptr pointer to pointer to error text + erroroffset ptr offset in pattern where error was detected + tables pointer to character tables or NULL + +Returns: pointer to compiled data block, or NULL on error, + with errorptr and erroroffset set */ -EXPORT pcre * +pcre * pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) +{ + return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); +} + + +pcre * +pcre_compile2(const char *pattern, int options, int *errorcodeptr, + const char **errorptr, int *erroroffset, + const unsigned char *tables) { real_pcre *re; int length = 1 + LINK_SIZE; /* For initial BRA plus length */ - int runlength; int c, firstbyte, reqbyte; int bracount = 0; int branch_extra = 0; @@ -4443,7 +5446,10 @@ pcre_compile(const char *pattern, int options, const char **errorptr, int item_count = -1; int name_count = 0; int max_name_size = 0; + int lastitemlength = 0; + int errorcode = 0; BOOL inescq = FALSE; + BOOL capturing; unsigned int brastackptr = 0; size_t size; uschar *code; @@ -4454,36 +5460,44 @@ pcre_compile(const char *pattern, int options, const char **errorptr, uschar bralenstack[BRASTACK_SIZE]; /* We can't pass back an error message if errorptr is NULL; I guess the best we -can do is just return NULL. */ +can do is just return NULL, but we can set a code value if there is a code +pointer. */ - if (errorptr == NULL) + if (errorptr == NULL) { + if (errorcodeptr != NULL) + *errorcodeptr = 99; return NULL; + } + *errorptr = NULL; + if (errorcodeptr != NULL) + *errorcodeptr = ERR0; /* However, we can give a message for this error */ if (erroroffset == NULL) { - *errorptr = ERR16; - return NULL; + errorcode = ERR16; + goto PCRE_EARLY_ERROR_RETURN; } + *erroroffset = 0; /* Can't support UTF8 unless PCRE has been compiled to include the code. */ if ((options & PCRE_UTF8) != 0) { - *errorptr = ERR32; - return NULL; + errorcode = ERR32; + goto PCRE_EARLY_ERROR_RETURN; } if ((options & ~PUBLIC_OPTIONS) != 0) { - *errorptr = ERR17; - return NULL; + errorcode = ERR17; + goto PCRE_EARLY_ERROR_RETURN; } /* Set up pointers to the individual character tables */ if (tables == NULL) - tables = pcre_default_tables; + tables = _pcre_default_tables; compile_block.lcc = tables + lcc_offset; compile_block.fcc = tables + fcc_offset; compile_block.cbits = tables + cbits_offset; @@ -4499,9 +5513,6 @@ whether (.*) can be treated as anchored or not. */ /* Reflect pattern for debugging output */ - DPRINTF(("------------------------------------------------------------------\n")); - DPRINTF(("%s\n", pattern)); - /* The first thing to do is to make a pass over the pattern to compute the amount of store required to hold the compiled code. This does not have to be perfect as long as errors are overestimates. At the same time we can detect any @@ -4518,8 +5529,11 @@ pattern. We can't be so clever for #-comments. */ /* If we are inside a \Q...\E sequence, all chars are literal */ - if (inescq) + if (inescq) { + if ((options & PCRE_AUTO_CALLOUT) != 0) + length += 2 + 2 * LINK_SIZE; goto NORMAL_CHAR; + } /* Otherwise, first check for ignored whitespace and comments */ @@ -4538,23 +5552,29 @@ pattern. We can't be so clever for #-comments. */ item_count++; /* Is zero for the first non-comment item */ + /* Allow space for auto callout before every item except quantifiers. */ + + if ((options & PCRE_AUTO_CALLOUT) != 0 && + c != '*' && c != '+' && c != '?' && + (c != '{' || !is_counted_repeat(ptr + 1))) + length += 2 + 2 * LINK_SIZE; + switch (c) { - /* A backslashed item may be an escaped "normal" character or a - character type. For a "normal" character, put the pointers and - character back so that tests for whitespace etc. in the input - are done correctly. */ + /* A backslashed item may be an escaped data character or it may be a + character type. */ case '\\': - { - const uschar *save_ptr = ptr; - c = check_escape(&ptr, errorptr, bracount, options, FALSE); - if (*errorptr != NULL) - goto PCRE_ERROR_RETURN; - if (c >= 0) { - ptr = save_ptr; - c = '\\'; - goto NORMAL_CHAR; - } + c = check_escape(&ptr, &errorcode, bracount, options, FALSE); + if (errorcode != 0) + goto PCRE_ERROR_RETURN; + + lastitemlength = 1; /* Default length of last item for repeats */ + + if (c >= 0) { /* Data character */ + length += 2; /* For a one-byte character */ + + + continue; } /* If \Q, enter "literal" mode */ @@ -4564,7 +5584,33 @@ pattern. We can't be so clever for #-comments. */ continue; } - /* Other escapes need one byte, and are of length one for repeats */ + /* \X is supported only if Unicode property support is compiled */ + +#ifndef SUPPORT_UCP + if (-c == ESC_X) { + errorcode = ERR45; + goto PCRE_ERROR_RETURN; + } +#endif + + /* \P and \p are for Unicode properties, but only when the support has + been compiled. Each item needs 2 bytes. */ + + else if (-c == ESC_P || -c == ESC_p) { +#ifdef SUPPORT_UCP + BOOL negated; + length += 2; + lastitemlength = 2; + if (get_ucp(&ptr, &negated, &errorcode) < 0) + goto PCRE_ERROR_RETURN; + continue; +#else + errorcode = ERR45; + goto PCRE_ERROR_RETURN; +#endif + } + + /* Other escapes need one byte */ length++; @@ -4579,8 +5625,8 @@ pattern. We can't be so clever for #-comments. */ compile_block.top_backref = refnum; length += 2; /* For single back reference */ if (ptr[1] == '{' && is_counted_repeat(ptr + 2)) { - ptr = read_repeat_counts(ptr + 2, &min, &max, errorptr); - if (*errorptr != NULL) + ptr = read_repeat_counts(ptr + 2, &min, &max, &errorcode); + if (errorcode != 0) goto PCRE_ERROR_RETURN; if ((min == 0 && (max == 1 || max == -1)) || (min == 1 && max == -1)) length++; @@ -4596,6 +5642,7 @@ pattern. We can't be so clever for #-comments. */ case '.': case '$': length++; + lastitemlength = 1; continue; case '*': /* These repeats won't be after brackets; */ @@ -4610,8 +5657,8 @@ pattern. We can't be so clever for #-comments. */ case '{': if (!is_counted_repeat(ptr + 1)) goto NORMAL_CHAR; - ptr = read_repeat_counts(ptr + 1, &min, &max, errorptr); - if (*errorptr != NULL) + ptr = read_repeat_counts(ptr + 1, &min, &max, &errorcode); + if (errorcode != 0) goto PCRE_ERROR_RETURN; /* These special cases just insert one extra opcode */ @@ -4622,17 +5669,12 @@ pattern. We can't be so clever for #-comments. */ /* These cases might insert additional copies of a preceding character. */ else { - - /* Not UTF-8 mode: all characters are one byte */ - { - if (min != 1) { - length--; /* Uncount the original char or metachar */ - if (min > 0) - length += 4; - } - - length += (max > 0) ? 4 : 2; + if (min != 1) { + length -= lastitemlength; /* Uncount the original char or metachar */ + if (min > 0) + length += 3 + lastitemlength; } + length += lastitemlength + ((max > 0) ? 3 : 1); } if (ptr[1] == '?') @@ -4663,11 +5705,12 @@ pattern. We can't be so clever for #-comments. */ where we can. (In UTF-8 mode we can do this only for chars < 128.) */ case '[': - class_optcount = 0; - - - if (*(++ptr) == '^') + if (*(++ptr) == '^') { + class_optcount = 10; /* Greater than one */ ptr++; + } else + class_optcount = 0; + /* Written as a "do" so that an initial ']' is taken as data */ @@ -4677,7 +5720,7 @@ pattern. We can't be so clever for #-comments. */ if (inescq) { if (*ptr != '\\' || ptr[1] != 'E') - goto NON_SPECIAL_CHARACTER; + goto GET_ONE_CHARACTER; inescq = FALSE; ptr += 1; continue; @@ -4686,28 +5729,35 @@ pattern. We can't be so clever for #-comments. */ /* Outside \Q...\E, check for escapes */ if (*ptr == '\\') { - int ch = check_escape(&ptr, errorptr, bracount, options, TRUE); - if (*errorptr != NULL) + c = check_escape(&ptr, &errorcode, bracount, options, TRUE); + if (errorcode != 0) goto PCRE_ERROR_RETURN; - /* \b is backspace inside a class */ + /* \b is backspace inside a class; \X is literal */ - if (-ch == ESC_b) - ch = '\b'; + if (-c == ESC_b) + c = '\b'; + else if (-c == ESC_X) + c = 'X'; /* \Q enters quoting mode */ - if (-ch == ESC_Q) { + else if (-c == ESC_Q) { inescq = TRUE; continue; } /* Handle escapes that turn into characters */ - if (ch >= 0) { - class_optcount++; /* for possible optimization */ - } else + if (c >= 0) + goto NON_SPECIAL_CHARACTER; + + /* Escapes that are meta-things. The normal ones just affect the + bit map, but Unicode properties require an XCLASS extended item. */ + + else { class_optcount = 10; /* \d, \s etc; make sure > 1 */ + } } /* Check the syntax for POSIX stuff. The bits we actually handle are @@ -4718,19 +5768,70 @@ pattern. We can't be so clever for #-comments. */ class_optcount = 10; /* Make sure > 1 */ } - /* Anything else just increments the possible optimization count. If - there are wide characters, we are going to have to use an XCLASS. */ + /* Anything else increments the possible optimization count. We have to + detect ranges here so that we can compute the number of extra ranges for + caseless wide characters when UCP support is available. If there are wide + characters, we are going to have to use an XCLASS, even for single + characters. */ else { + int d; + + GET_ONE_CHARACTER: + + c = *ptr; + + /* Come here from handling \ above when it escapes to a char value */ + NON_SPECIAL_CHARACTER: class_optcount++; + d = -1; + if (ptr[1] == '-') { + uschar const *hyptr = ptr++; + if (ptr[1] == '\\') { + ptr++; + d = check_escape(&ptr, &errorcode, bracount, options, TRUE); + if (errorcode != 0) + goto PCRE_ERROR_RETURN; + if (-d == ESC_b) + d = '\b'; /* backspace */ + else if (-d == ESC_X) + d = 'X'; /* literal X in a class */ + } else if (ptr[1] != 0 && ptr[1] != ']') { + ptr++; + d = *ptr; + } + if (d < 0) + ptr = hyptr; /* go back to hyphen as data */ + } + + /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or > + 127 for caseless matching, we will need to use an XCLASS. */ + + if (d >= 0) { + class_optcount = 10; /* Ensure > 1 */ + if (d < c) { + errorcode = ERR8; + goto PCRE_ERROR_RETURN; + } + + + } + + /* We have a single character. There is nothing to be done unless we + are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must + allow for an XCL_SINGLE item, doubled for caselessness if there is UCP + support. */ + + else { + } } } while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */ if (*ptr == 0) { /* Missing terminating ']' */ - *errorptr = ERR6; + errorcode = ERR6; goto PCRE_ERROR_RETURN; } @@ -4747,8 +5848,8 @@ pattern. We can't be so clever for #-comments. */ we also need extra for wrapping the whole thing in a sub-pattern. */ if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr + 2)) { - ptr = read_repeat_counts(ptr + 2, &min, &max, errorptr); - if (*errorptr != NULL) + ptr = read_repeat_counts(ptr + 2, &min, &max, &errorcode); + if (errorcode != 0) goto PCRE_ERROR_RETURN; if ((min == 0 && (max == 1 || max == -1)) || (min == 1 && max == -1)) length++; @@ -4768,6 +5869,7 @@ pattern. We can't be so clever for #-comments. */ case '(': branch_newextra = 0; bracket_length = 1 + LINK_SIZE; + capturing = FALSE; /* Handle special forms of bracket, which all start (? */ @@ -4782,7 +5884,7 @@ pattern. We can't be so clever for #-comments. */ while (*ptr != 0 && *ptr != ')') ptr++; if (*ptr == 0) { - *errorptr = ERR18; + errorcode = ERR18; goto PCRE_ERROR_RETURN; } continue; @@ -4824,7 +5926,7 @@ pattern. We can't be so clever for #-comments. */ if (c != 'R') while ((digitab[*(++ptr)] & ctype_digit) != 0) ; if (*ptr != ')') { - *errorptr = ERR29; + errorcode = ERR29; goto PCRE_ERROR_RETURN; } length += 1 + LINK_SIZE; @@ -4848,35 +5950,41 @@ pattern. We can't be so clever for #-comments. */ ptr += 2; while ((digitab[*(++ptr)] & ctype_digit) != 0) ; if (*ptr != ')') { - *errorptr = ERR39; + errorcode = ERR39; goto PCRE_ERROR_RETURN; } - length += 2; + length += 2 + 2 * LINK_SIZE; continue; /* Named subpatterns are an extension copied from Python */ case 'P': ptr += 3; + + /* Handle the definition of a named subpattern */ + if (*ptr == '<') { const uschar *p; /* Don't amalgamate; some compilers */ p = ++ptr; /* grumble at autoincrement in declaration */ while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++; if (*ptr != '>') { - *errorptr = ERR42; + errorcode = ERR42; goto PCRE_ERROR_RETURN; } name_count++; if (ptr - p > max_name_size) max_name_size = (ptr - p); + capturing = TRUE; /* Named parentheses are always capturing */ break; } + /* Handle back references and recursive calls to named subpatterns */ + if (*ptr == '=' || *ptr == '>') { while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0) ; if (*ptr != ')') { - *errorptr = ERR42; + errorcode = ERR42; goto PCRE_ERROR_RETURN; } break; @@ -4884,7 +5992,7 @@ pattern. We can't be so clever for #-comments. */ /* Unknown character after (?P */ - *errorptr = ERR41; + errorcode = ERR41; goto PCRE_ERROR_RETURN; /* Lookbehinds are in Perl from version 5.005 */ @@ -4896,7 +6004,7 @@ pattern. We can't be so clever for #-comments. */ length += 1 + LINK_SIZE; /* For the first branch */ break; } - *errorptr = ERR24; + errorcode = ERR24; goto PCRE_ERROR_RETURN; /* Conditionals are in Perl from version 5.005. The bracket must either @@ -4913,7 +6021,7 @@ pattern. We can't be so clever for #-comments. */ while ((digitab[*ptr] & ctype_digit) != 0) ptr++; if (*ptr != ')') { - *errorptr = ERR26; + errorcode = ERR26; goto PCRE_ERROR_RETURN; } } else { /* An assertion must follow */ @@ -4922,7 +6030,7 @@ pattern. We can't be so clever for #-comments. */ if (ptr[2] != '?' || (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<')) { ptr += 2; /* To get right offset in message */ - *errorptr = ERR28; + errorcode = ERR28; goto PCRE_ERROR_RETURN; } } @@ -4976,6 +6084,14 @@ pattern. We can't be so clever for #-comments. */ nothing is done here and it is handled during the compiling process. + We allow for more than one options setting at the start. If such + settings do not change the existing options, nothing is compiled. + However, we must leave space just in case something is compiled. + This can happen for pathological sequences such as (?i)(?-i) + because the global options will end up with -i set. The space is + small and not significant. (Before I did this there was a reported + bug with (?i)(?-i) in a machine-generated pattern.) + [Historical note: Up to Perl 5.8, options settings at top level were always global settings, wherever they appeared in the pattern. That is, they were equivalent to an external setting. From 5.8 @@ -4987,6 +6103,7 @@ pattern. We can't be so clever for #-comments. */ options = (options | set) & (~unset); set = unset = 0; /* To save length */ item_count--; /* To allow for several */ + length += 2; } /* Fall through */ @@ -5016,7 +6133,7 @@ pattern. We can't be so clever for #-comments. */ /* Unrecognized option character */ default: - *errorptr = ERR12; + errorcode = ERR12; goto PCRE_ERROR_RETURN; } } @@ -5035,18 +6152,25 @@ pattern. We can't be so clever for #-comments. */ continue; } - /* If options were terminated by ':' control comes here. Fall through - to handle the group below. */ + /* If options were terminated by ':' control comes here. This is a + non-capturing group with an options change. There is nothing more that + needs to be done because "capturing" is already set FALSE by default; + we can just fall through. */ + } } - /* Extracting brackets must be counted so we can process escapes in a - Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to - need an additional 3 bytes of store per extracting bracket. However, if - PCRE_NO_AUTO)CAPTURE is set, unadorned brackets become non-capturing, so we - must leave the count alone (it will aways be zero). */ + /* Ordinary parentheses, not followed by '?', are capturing unless + PCRE_NO_AUTO_CAPTURE is set. */ + + else + capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0; + + /* Capturing brackets must be counted so we can process escapes in a + Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need + an additional 3 bytes of memory per capturing bracket. */ - else if ((options & PCRE_NO_AUTO_CAPTURE) == 0) { + if (capturing) { bracount++; if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3; @@ -5058,7 +6182,7 @@ pattern. We can't be so clever for #-comments. */ will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */ if (brastackptr >= sizeof(brastack) / sizeof(int)) { - *errorptr = ERR19; + errorcode = ERR19; goto PCRE_ERROR_RETURN; } @@ -5094,8 +6218,8 @@ pattern. We can't be so clever for #-comments. */ automatically; for the others we need an increment. */ if ((c = ptr[1]) == '{' && is_counted_repeat(ptr + 2)) { - ptr = read_repeat_counts(ptr + 2, &min, &max, errorptr); - if (*errorptr != NULL) + ptr = read_repeat_counts(ptr + 2, &min, &max, &errorcode); + if (errorcode != 0) goto PCRE_ERROR_RETURN; } else if (c == '*') { min = 0; @@ -5146,90 +6270,37 @@ pattern. We can't be so clever for #-comments. */ } continue; - /* Non-special character. For a run of such characters the length required - is the number of characters + 2, except that the maximum run length is - MAXLIT. We won't get a skipped space or a non-data escape or the start of a - # comment as the first character, so the length can't be zero. */ + /* Non-special character. It won't be space or # in extended mode, so it is + always a genuine character. If we are in a \Q...\E sequence, check for the + end; if not, we have a literal. */ - NORMAL_CHAR: default: - length += 2; - runlength = 0; - do { - - /* If in a \Q...\E sequence, check for end; otherwise it's a literal */ - if (inescq) { - if (c == '\\' && ptr[1] == 'E') { - inescq = FALSE; - ptr++; - } else - runlength++; - continue; - } - - /* Skip whitespace and comments for /x */ - - if ((options & PCRE_EXTENDED) != 0) { - if ((compile_block.ctypes[c] & ctype_space) != 0) - continue; - if (c == '#') { - /* The space before the ; is to avoid a warning on a silly compiler - on the Macintosh. */ - while ((c = *(++ptr)) != 0 && c != NEWLINE) ; - continue; - } - } - - /* Backslash may introduce a data char or a metacharacter; stop the - string before the latter. */ - - if (c == '\\') { - const uschar *saveptr = ptr; - c = check_escape(&ptr, errorptr, bracount, options, FALSE); - if (*errorptr != NULL) - goto PCRE_ERROR_RETURN; - if (c < 0) { - ptr = saveptr; - break; - } - - /* In UTF-8 mode, add on the number of additional bytes needed to - encode this character, and save the total length in case this is a - final char that is repeated. */ - - } - - /* Ordinary character or single-char escape */ + NORMAL_CHAR: - runlength++; + if (inescq && c == '\\' && ptr[1] == 'E') { + inescq = FALSE; + ptr++; + continue; } - /* This "while" is the end of the "do" above. */ - - while (runlength < MAXLIT && - (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0); - - /* If we hit a meta-character, back off to point to it */ + length += 2; /* For a one-byte character */ + lastitemlength = 1; /* Default length of last item for repeats */ - if (runlength < MAXLIT) - ptr--; - - /* If the last char in the string is a UTF-8 multibyte character, we must - set lastcharlength correctly. If it was specified as an escape, this will - already have been done above. However, we also have to support in-line - UTF-8 characters, so check backwards from where we are. */ + /* In UTF-8 mode, check for additional bytes. */ - length += runlength; continue; } } length += 2 + LINK_SIZE; /* For final KET and END */ + if ((options & PCRE_AUTO_CALLOUT) != 0) + length += 2 + 2 * LINK_SIZE; /* For final callout */ + if (length > MAX_PATTERN_SIZE) { - *errorptr = ERR20; - return NULL; + errorcode = ERR20; + goto PCRE_EARLY_ERROR_RETURN; } /* Compute the size of data block needed and get it, either from malloc or @@ -5239,31 +6310,40 @@ externally provided function. */ re = (real_pcre *) malloc(size); if (re == NULL) { - *errorptr = ERR21; - return NULL; + errorcode = ERR21; + goto PCRE_EARLY_ERROR_RETURN; } -/* Put in the magic number, and save the size, options, and table pointer */ +/* Put in the magic number, and save the sizes, options, and character table +pointer. NULL is used for the default character tables. The nullpad field is at +the end; it's there to help in the case when a regex compiled on a system with +4-byte pointers is run on another with 8-byte pointers. */ re->magic_number = MAGIC_NUMBER; re->size = size; re->options = options; - re->tables = tables; + re->dummy1 = 0; + re->name_table_offset = sizeof(real_pcre); re->name_entry_size = max_name_size + 3; re->name_count = name_count; + re->ref_count = 0; + re->tables = (tables == _pcre_default_tables) ? NULL : tables; + re->nullpad = NULL; /* The starting points of the name/number translation table and of the code are passed around in the compile data block. */ compile_block.names_found = 0; compile_block.name_entry_size = max_name_size + 3; - compile_block.name_table = (uschar *) re + sizeof(real_pcre); + compile_block.name_table = (uschar *) re + re->name_table_offset; codestart = compile_block.name_table + re->name_entry_size * re->name_count; compile_block.start_code = codestart; + compile_block.start_pattern = (const uschar *) pattern; compile_block.req_varyopt = 0; + compile_block.nopartial = FALSE; /* Set up a starting, non-extracting bracket, then compile the expression. On -error, *errorptr will be set non-NULL, so we don't need to look at the result +error, errorcode will be set non-zero, so we don't need to look at the result of the function here. */ ptr = (const uschar *) pattern; @@ -5271,15 +6351,18 @@ of the function here. */ *code = OP_BRA; bracount = 0; (void) compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, - errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, + &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block); re->top_bracket = bracount; re->top_backref = compile_block.top_backref; + if (compile_block.nopartial) + re->options |= PCRE_NOPARTIAL; + /* If not reached end of pattern on success, there's an excess bracket. */ - if (*errorptr == NULL && *ptr != 0) - *errorptr = ERR22; + if (errorcode == 0 && *ptr != 0) + errorcode = ERR22; /* Fill in the terminating state and check for disastrous overflow, but if debugging, leave the test till after things are printed out. */ @@ -5287,20 +6370,24 @@ if debugging, leave the test till after things are printed out. */ *code++ = OP_END; if (code - codestart > length) - *errorptr = ERR23; + errorcode = ERR23; /* Give an error if there's back reference to a non-existent capturing subpattern. */ if (re->top_backref > re->top_bracket) - *errorptr = ERR15; + errorcode = ERR15; /* Failed to compile, or error while post-processing */ - if (*errorptr != NULL) { + if (errorcode != 0) { free(re); PCRE_ERROR_RETURN: *erroroffset = ptr - (const uschar *) pattern; + PCRE_EARLY_ERROR_RETURN: + *errorptr = error_texts[errorcode]; + if (errorcodeptr != NULL) + *errorcodeptr = errorcode; return NULL; } @@ -5333,7 +6420,7 @@ start with ^. and also when all branches start with .* for non-DOTALL matches. /* For an anchored pattern, we use the "required byte" only if it follows a variable length item in the regex. Remove the caseless flag for non-caseable -chars. */ +bytes. */ if (reqbyte >= 0 && ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) { @@ -5344,11 +6431,59 @@ chars. */ re->options |= PCRE_REQCHSET; } -/* Print out the compiled data for debugging */ +/* Print out the compiled data if debugging is enabled. This is never the +case when building a production library. */ + + + return (pcre *) re; +} + +/* End of pcre_compile.c */ + +/* pcre_exec.c */ + + +/* This module contains pcre_exec(), the externally visible function that does +pattern matching using an NFA algorithm, trying to mimic Perl as closely as +possible. There are also some static supporting functions. */ + + + +/* Structure for building a chain of data that actually lives on the +stack, for holding the values of the subject pointer at the start of each +subpattern, so as to detect when an empty string has been matched by a +subpattern - to break infinite loops. When NO_RECURSE is set, these blocks +are on the heap, not on the stack. */ + +typedef struct eptrblock { + struct eptrblock *epb_prev; + const uschar *epb_saved_eptr; +} eptrblock; + +/* Flag bits for the match() function */ + +#define match_condassert 0x01 /* Called to check a condition assertion */ +#define match_isgroup 0x02 /* Set if start of bracketed group */ + +/* Non-error returns from the match() function. Error returns are externally +defined PCRE_ERROR_xxx codes, which are all negative. */ + +#define MATCH_MATCH 1 +#define MATCH_NOMATCH 0 + +/* Maximum number of ints of offset to save on the stack for recursive calls. +If the offset vector is bigger, malloc is used. This should be a multiple of 3, +because the offset vector is always a multiple of 3 long. */ + +#define REC_STACK_SAVE_MAX 30 + +/* Min and max values for the common repeats; for the maxima, 0 => infinity */ + +static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; +static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; + - return (pcre *) re; -} @@ -5398,7 +6533,6 @@ match_ref(int offset, register const uschar * eptr, int length, match_data * md, - /*************************************************************************** **************************************************************************** RECURSION IN THE match() FUNCTION @@ -5434,13 +6568,13 @@ always used to. /* These versions of the macros manage a private stack on the heap. Note that the rd argument of RMATCH isn't actually used. It's the md argument of -match(), which never actually changes. */ +match(), which never changes. */ #define REGISTER #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\ {\ - heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\ + heapframe *newframe = malloc(sizeof(heapframe));\ if (setjmp(frame->Xwhere) == 0)\ {\ newframe->Xeptr = ra;\ @@ -5451,12 +6585,10 @@ match(), which never actually changes. */ newframe->Xflags = rg;\ newframe->Xprevframe = frame;\ frame = newframe;\ - DPRINTF(("restarting from line %d\n", __LINE__));\ goto HEAP_RECURSE;\ }\ else\ {\ - DPRINTF(("longjumped back to line %d\n", __LINE__));\ frame = md->thisframe;\ rx = frame->Xresult;\ }\ @@ -5466,7 +6598,7 @@ match(), which never actually changes. */ {\ heapframe *newframe = frame;\ frame = newframe->Xprevframe;\ - (pcre_stack_free)(newframe);\ + free(newframe);\ if (frame != NULL)\ {\ frame->Xresult = ra;\ @@ -5496,7 +6628,6 @@ typedef struct heapframe { const uschar *Xcallpat; const uschar *Xcharptr; const uschar *Xdata; - const uschar *Xlastptr; const uschar *Xnext; const uschar *Xpp; const uschar *Xprev; @@ -5511,6 +6642,16 @@ typedef struct heapframe { unsigned long int Xoriginal_ims; +#ifdef SUPPORT_UCP + int Xprop_type; + int Xprop_fail_result; + int Xprop_category; + int Xprop_chartype; + int Xprop_othercase; + int Xprop_test_against; + int *Xprop_test_variable; +#endif + int Xctype; int Xfc; int Xfi; @@ -5587,6 +6728,7 @@ because they are used a lot in loops. */ register int rrc; /* Returns from recursive calls */ register int i; /* Used for loops not involving calls to RMATCH() */ register int c; /* Character values not kept over RMATCH() calls */ + register BOOL utf8; /* Local copy of UTF-8 flag for speed */ /* When recursion is not being used, all "local" variables that have to be preserved over calls to RMATCH() are part of a "frame" which is obtained from @@ -5594,7 +6736,7 @@ heap storage. Set up the top-level frame here; others are obtained from the heap whenever RMATCH() does a "recursion". See the macro definitions above. */ #ifdef NO_RECURSE - heapframe *frame = (pcre_stack_malloc) (sizeof(heapframe)); + heapframe *frame = malloc(sizeof(heapframe)); frame->Xprevframe = NULL; /* Marks the top level */ /* Copy in the original argument variables */ @@ -5622,9 +6764,7 @@ HEAP_RECURSE: /* Ditto for the local variables */ #define callpat frame->Xcallpat -#define charptr frame->Xcharptr #define data frame->Xdata -#define lastptr frame->Xlastptr #define next frame->Xnext #define pp frame->Xpp #define prev frame->Xprev @@ -5639,6 +6779,16 @@ HEAP_RECURSE: #define original_ims frame->Xoriginal_ims +#ifdef SUPPORT_UCP +#define prop_type frame->Xprop_type +#define prop_fail_result frame->Xprop_fail_result +#define prop_category frame->Xprop_category +#define prop_chartype frame->Xprop_chartype +#define prop_othercase frame->Xprop_othercase +#define prop_test_against frame->Xprop_test_against +#define prop_test_variable frame->Xprop_test_variable +#endif + #define ctype frame->Xctype #define fc frame->Xfc #define fi frame->Xfi @@ -5664,24 +6814,33 @@ i, and fc and c, can be the same variables. */ #define fi i #define fc c - const uschar *callpat; /* Many of these variables are used ony */ - const uschar *charptr; /* small blocks of the code. My normal */ - const uschar *data; /* style of coding would have declared */ - const uschar *lastptr; /* them within each of those blocks. */ - const uschar *next; /* However, in order to accommodate the */ - const uschar *pp; /* version of this code that uses an */ - const uschar *prev; /* external "stack" implemented on the */ - const uschar *saved_eptr; /* heap, it is easier to declare them */ - /* all here, so the declarations can */ - recursion_info new_recursive; /* be cut out in a block. The only */ - /* declarations within blocks below are */ - BOOL cur_is_word; /* for variables that do not have to */ - BOOL condition; /* be preserved over a recursive call */ - BOOL minimize; /* to RMATCH(). */ + + const uschar *callpat; /* them within each of those blocks. */ + const uschar *data; /* However, in order to accommodate the */ + const uschar *next; /* version of this code that uses an */ + const uschar *pp; /* external "stack" implemented on the */ + const uschar *prev; /* heap, it is easier to declare them */ + const uschar *saved_eptr; /* all here, so the declarations can */ + /* be cut out in a block. The only */ + recursion_info new_recursive; /* declarations within blocks below are */ + /* for variables that do not have to */ + BOOL cur_is_word; /* be preserved over a recursive call */ + BOOL condition; /* to RMATCH(). */ + BOOL minimize; BOOL prev_is_word; unsigned long int original_ims; +#ifdef SUPPORT_UCP + int prop_type; + int prop_fail_result; + int prop_category; + int prop_chartype; + int prop_othercase; + int prop_test_against; + int *prop_test_variable; +#endif + int ctype; int length; int max; @@ -5696,6 +6855,14 @@ i, and fc and c, can be the same variables. */ eptrblock newptrb; #endif +/* These statements are here to stop the compiler complaining about unitialized +variables. */ + +#ifdef SUPPORT_UCP + prop_fail_result = 0; + prop_test_against = 0; + prop_test_variable = NULL; +#endif /* OK, now we can get on with the real code of the function. Recursion is specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined, @@ -5708,6 +6875,7 @@ performance when true recursion is being used. */ RRETURN(PCRE_ERROR_MATCHLIMIT); original_ims = ims; /* Save for resetting on ')' */ + utf8 = md->utf8; /* Local copy of the flag */ /* At the start of a bracketed group, add the current subject pointer to the stack of such pointers, to be re-instated at the end of the group when we hit @@ -5726,6 +6894,12 @@ this stack. */ op = *ecode; minimize = FALSE; + /* For partial matching, remember if we ever hit the end of the subject after + matching at least one subject character. */ + + if (md->partial && eptr >= md->end_subject && eptr > md->start_match) + md->hitend = TRUE; + /* Opening capturing bracket. If there is space in the offset vector, save the current subject position in the working slot at the top of the vector. We mustn't change the current values of the data slot, because they may be set @@ -5750,14 +6924,13 @@ this stack. */ number = GET2(ecode, 2 + LINK_SIZE); offset = number << 1; + if (offset < md->offset_max) { save_offset1 = md->offset_vector[offset]; save_offset2 = md->offset_vector[offset + 1]; save_offset3 = md->offset_vector[md->offset_end - number]; save_capture_last = md->capture_last; - DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, - save_offset3)); md->offset_vector[md->offset_end - number] = eptr - md->start_subject; do { @@ -5770,7 +6943,6 @@ this stack. */ } while (*ecode == OP_ALT); - DPRINTF(("bracket %d failed\n", number)); md->offset_vector[offset] = save_offset1; md->offset_vector[offset + 1] = save_offset2; @@ -5789,7 +6961,6 @@ this stack. */ switch (op) { case OP_BRA: /* Non-capturing bracket: optimized */ - DPRINTF(("start bracket 0\n")); do { RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, match_isgroup); @@ -5798,7 +6969,6 @@ this stack. */ ecode += GET(ecode, 1); } while (*ecode == OP_ALT); - DPRINTF(("bracket 0 failed\n")); RRETURN(MATCH_NOMATCH); /* Conditional group: compilation checked that there are no more than @@ -5853,7 +7023,6 @@ this stack. */ case OP_END: if (md->recursive != NULL && md->recursive->group_num == 0) { recursion_info *rec = md->recursive; - DPRINTF(("Hit the end in a (?0) recursion\n")); md->recursive = rec->prevrec; memmove(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); @@ -5877,7 +7046,6 @@ this stack. */ case OP_OPT: ims = ecode[1]; ecode += 2; - DPRINTF(("ims set to %02lx\n", ims)); break; /* Assertion brackets. Check the alternative branches in turn - the @@ -5964,13 +7132,15 @@ this stack. */ case OP_CALLOUT: if (pcre_callout != NULL) { pcre_callout_block cb; - cb.version = 0; /* Version 0 of the callout block */ + cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; cb.subject = (const char *) md->start_subject; cb.subject_length = md->end_subject - md->start_subject; cb.start_match = md->start_match - md->start_subject; cb.current_position = eptr - md->start_subject; + cb.pattern_position = GET(ecode, 2); + cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top / 2; cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; @@ -5979,7 +7149,7 @@ this stack. */ if (rrc < 0) RRETURN(rrc); } - ecode += 2; + ecode += 2 + 2 * LINK_SIZE; break; /* Recursion either matches the current regex, or some subexpression. The @@ -6042,7 +7212,6 @@ this stack. */ /* OK, now we can do the recursion. For each top-level alternative we restore the offset and recursion data. */ - DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); do { RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, eptrb, match_isgroup); @@ -6061,7 +7230,6 @@ this stack. */ } while (*callpat == OP_ALT); - DPRINTF(("Recursion didn't match\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) free(new_recursive.offset_save); @@ -6125,7 +7293,6 @@ this stack. */ if (ecode[1 + LINK_SIZE] == OP_OPT) { ims = (ims & ~PCRE_IMS) | ecode[4]; - DPRINTF(("ims set to %02lx at group repeat\n", ims)); } if (*ecode == OP_KETRMIN) { @@ -6229,6 +7396,11 @@ this stack. */ number = GET2(prev, 2 + LINK_SIZE); offset = number << 1; +#ifdef DEBUG + printf("end bracket %d", number); + printf("\n"); +#endif + /* Test for a numbered group. This includes groups called as a result of recursion. Note that whole-pattern recursion is coded as a recurse into group 0, so it won't be picked up here. Instead, we catch it when @@ -6251,7 +7423,6 @@ this stack. */ if (md->recursive != NULL && md->recursive->group_num == number) { recursion_info *rec = md->recursive; - DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); md->recursive = rec->prevrec; md->start_match = rec->save_start; memcpy(md->offset_vector, rec->offset_save, @@ -6267,7 +7438,6 @@ this stack. */ the group. */ ims = original_ims; - DPRINTF(("ims reset to %02lx\n", ims)); /* For a non-repeating ket, just continue at this level. This also happens for a repeating ket if no characters were matched in the group. @@ -6480,6 +7650,64 @@ this stack. */ ecode++; break; +#ifdef SUPPORT_UCP + /* Check the next character by Unicode property. We will get here only + if the support is in the binary; otherwise a compile-time error occurs. */ + + case OP_PROP: + case OP_NOTPROP: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + { + int chartype, rqdtype; + int othercase; + int category = _pcre_ucp_findchar(c, &chartype, &othercase); + + rqdtype = *(++ecode); + ecode++; + + if (rqdtype >= 128) { + if ((rqdtype - 128 != category) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + } else { + if ((rqdtype != chartype) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + } + } + break; + + /* Match an extended Unicode sequence. We will get here only if the support + is in the binary; otherwise a compile-time error occurs. */ + + case OP_EXTUNI: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + { + int chartype; + int othercase; + int category = _pcre_ucp_findchar(c, &chartype, &othercase); + if (category == ucp_M) + RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) { + int len = 1; + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + category = _pcre_ucp_findchar(c, &chartype, &othercase); + if (category != ucp_M) + break; + eptr += len; + } + } + ecode++; + break; +#endif + + /* Match a back reference, possibly repeatedly. Look past the end of the item to see if there is repeat information following. The code is similar to that for character classes, but repeated for efficiency. Then obey @@ -6595,9 +7823,11 @@ this stack. */ /* Match a bit-mapped character class, possibly repeatedly. This op code is - used when all the characters in the class have values in the range 0-255. - The only difference between OP_CLASS and OP_NCLASS occurs when a data - character outside the range is encountered. + used when all the characters in the class have values in the range 0-255, + and either the matching is caseful, or the characters are in the range + 0-127 when UTF-8 processing is enabled. The only difference between + OP_CLASS and OP_NCLASS occurs when a data character outside the range is + encountered. First, look past the end of the item to see if there is repeat information following. Then obey similar code to character type repeats - written out @@ -6711,25 +7941,31 @@ this stack. */ in UTF-8 mode, because that's the only time it is compiled. */ - /* Match a run of characters */ + /* Match a single character, casefully */ + + case OP_CHAR: - case OP_CHARS: + /* Non-UTF-8 mode */ { - register int slen = ecode[1]; + if (md->end_subject - eptr < 1) + RRETURN(MATCH_NOMATCH); + if (ecode[1] != *eptr++) + RRETURN(MATCH_NOMATCH); ecode += 2; + } + break; + /* Match a single character, caselessly */ - if (slen > md->end_subject - eptr) + case OP_CHARNC: + + /* Non-UTF-8 mode */ + { + if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); - if ((ims & PCRE_CASELESS) != 0) { - while (slen-- > 0) - if (md->lcc[*ecode++] != md->lcc[*eptr++]) - RRETURN(MATCH_NOMATCH); - } else { - while (slen-- > 0) - if (*ecode++ != *eptr++) - RRETURN(MATCH_NOMATCH); - } + if (md->lcc[ecode[1]] != md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + ecode += 2; } break; @@ -6783,8 +8019,6 @@ this stack. */ matching character if failing, up to the maximum. Alternatively, if maximizing, find the maximum number of characters and work backwards. */ - DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, - max, eptr)); if ((ims & PCRE_CASELESS) != 0) { fc = md->lcc[fc]; @@ -6906,9 +8140,9 @@ this stack. */ if (max == 0) max = INT_MAX; - /* Common code for all repeated single-character (less than 255) matches. - We can give up quickly if there are fewer than the minimum number of - characters left in the subject. */ + /* Common code for all repeated single-byte matches. We can give up quickly + if there are fewer than the minimum number of bytes left in the + subject. */ REPEATNOTCHAR: if (min > md->end_subject - eptr) @@ -6923,9 +8157,6 @@ this stack. */ maximum. Alternatively, if maximizing, find the maximum number of characters and work backwards. */ - DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, - max, max, eptr)); - if ((ims & PCRE_CASELESS) != 0) { fc = md->lcc[fc]; @@ -7070,69 +8301,131 @@ this stack. */ REPEATTYPE: ctype = *ecode++; /* Code for the character type */ +#ifdef SUPPORT_UCP + if (ctype == OP_PROP || ctype == OP_NOTPROP) { + prop_fail_result = ctype == OP_NOTPROP; + prop_type = *ecode++; + if (prop_type >= 128) { + prop_test_against = prop_type - 128; + prop_test_variable = &prop_category; + } else { + prop_test_against = prop_type; + prop_test_variable = &prop_chartype; + } + } else + prop_type = -1; +#endif + /* First, ensure the minimum number of matches are present. Use inline code for maximizing the speed, and do the type test once at the start (i.e. keep it out of the loop). Also we can test that there are at least the minimum number of bytes before we start. This isn't as effective in UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that - is tidier. */ + is tidier. Also separate the UCP code, which can be the same for both UTF-8 + and single-bytes. */ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); if (min > 0) { +#ifdef SUPPORT_UCP + if (prop_type > 0) { + for (i = 1; i <= min; i++) { + GETCHARINC(c, eptr); + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if ((*prop_test_variable == prop_test_against) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + } - /* Code for the non-UTF-8 case for minimum matching */ + /* Match extended Unicode sequences. We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ - switch (ctype) { - case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) { - for (i = 1; i <= min; i++) - if (*eptr++ == NEWLINE) - RRETURN(MATCH_NOMATCH); - } else + else if (ctype == OP_EXTUNI) { + for (i = 1; i <= min; i++) { + GETCHARINCTEST(c, eptr); + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if (prop_category == ucp_M) + RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) { + int len = 1; + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if (prop_category != ucp_M) + break; + eptr += len; + } + } + } + + else +#endif /* SUPPORT_UCP */ + +/* Handle all other cases when the coding is UTF-8 */ + + + /* Code for the non-UTF-8 case for minimum matching of operators other + than OP_PROP and OP_NOTPROP. */ + + switch (ctype) { + case OP_ANY: + if ((ims & PCRE_DOTALL) == 0) { + for (i = 1; i <= min; i++) + if (*eptr++ == NEWLINE) + RRETURN(MATCH_NOMATCH); + } else + eptr += min; + break; + + case OP_ANYBYTE: eptr += min; - break; + break; - case OP_ANYBYTE: - eptr += min; - break; + case OP_NOT_DIGIT: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + break; - case OP_NOT_DIGIT: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); - break; + case OP_DIGIT: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + break; - case OP_DIGIT: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); - break; + case OP_NOT_WHITESPACE: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + break; - case OP_NOT_WHITESPACE: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); - break; + case OP_WHITESPACE: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + break; - case OP_WHITESPACE: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); - break; + case OP_NOT_WORDCHAR: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + break; - case OP_NOT_WORDCHAR: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); - break; + case OP_WORDCHAR: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + break; - case OP_WORDCHAR: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - break; - } + default: + RRETURN(PCRE_ERROR_INTERNAL); + } } /* If min = max, continue at the same level without recursing */ @@ -7141,10 +8434,61 @@ this stack. */ continue; /* If minimizing, we have to test the rest of the pattern before each - subsequent match. Again, separate the UTF-8 case for speed. */ + subsequent match. Again, separate the UTF-8 case for speed, and also + separate the UCP cases. */ if (minimize) { - /* Not UTF-8 mode */ +#ifdef SUPPORT_UCP + if (prop_type > 0) { + for (fi = min;; fi++) { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if ((*prop_test_variable == prop_test_against) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + } + + /* Match extended Unicode sequences. We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ + + else if (ctype == OP_EXTUNI) { + for (fi = min;; fi++) { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if (prop_category == ucp_M) + RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) { + int len = 1; + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if (prop_category != ucp_M) + break; + eptr += len; + } + } + } + + else +#endif /* SUPPORT_UCP */ + + /* Not UTF-8 mode */ { for (fi = min;; fi++) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -7191,6 +8535,9 @@ this stack. */ if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); } } } @@ -7199,13 +8546,94 @@ this stack. */ /* If maximizing it is worth using inline code for speed, doing the type test once at the start (i.e. keep it out of the loop). Again, keep the - UTF-8 stuff separate. */ + UTF-8 and UCP stuff separate. */ else { - pp = eptr; + pp = eptr; /* Remember where we started */ +#ifdef SUPPORT_UCP + if (prop_type > 0) { + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if ((*prop_test_variable == prop_test_against) == prop_fail_result) + break; + eptr += len; + } - /* Not UTF-8 mode */ + /* eptr is now past the end of the maximum run */ + + for (;;) { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } + + /* Match extended Unicode sequences. We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ + + else if (ctype == OP_EXTUNI) { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + GETCHARINCTEST(c, eptr); + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if (prop_category == ucp_M) + break; + while (eptr < md->end_subject) { + int len = 1; + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if (prop_category != ucp_M) + break; + eptr += len; + } + } + + /* eptr is now past the end of the maximum run */ + + for (;;) { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + for (;;) { /* Move back over one extended */ + int len = 1; + BACKCHAR(eptr); + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + prop_category = + _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + if (prop_category != ucp_M) + break; + eptr--; + } + } + } + + else +#endif /* SUPPORT_UCP */ + + + /* Not UTF-8 mode */ { switch (ctype) { case OP_ANY: @@ -7279,6 +8707,9 @@ this stack. */ eptr++; } break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); } /* eptr is now past the end of the maximum run */ @@ -7303,7 +8734,6 @@ this stack. */ in the code above or the OP_xxx definitions. */ default: - DPRINTF(("Unknown opcode %d\n", *ecode)); RRETURN(PCRE_ERROR_UNKNOWN_NODE); } @@ -7333,7 +8763,6 @@ Undefine all the macros that were defined above to handle this. */ #undef callpat #undef charptr #undef data -#undef lastptr #undef next #undef pp #undef prev @@ -7384,7 +8813,7 @@ portions of the string if it matches. Two elements in the vector are set for each substring: the offsets to the start and end of the substring. Arguments: - external_re points to the compiled expression + argument_re points to the compiled expression extra_data points to extra data or is NULL subject points to the subject string length length of subject string (may contain binary zeros) @@ -7399,8 +8828,8 @@ Returns: > 0 => success; value is the number of elements filled in < -1 => some kind of unexpected problem */ -EXPORT int -pcre_exec(const pcre * external_re, const pcre_extra * extra_data, +int +pcre_exec(const pcre * argument_re, const pcre_extra * extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount) { @@ -7412,15 +8841,22 @@ pcre_exec(const pcre * external_re, const pcre_extra * extra_data, BOOL using_temporary_offsets = FALSE; BOOL anchored; BOOL startline; + BOOL firstline; BOOL first_byte_caseless = FALSE; BOOL req_byte_caseless = FALSE; match_data match_block; + const uschar *tables; const uschar *start_bits = NULL; const uschar *start_match = (const uschar *) subject + start_offset; const uschar *end_subject; const uschar *req_byte_ptr = start_match - 1; + + pcre_study_data internal_study; const pcre_study_data *study; - const real_pcre *re = (const real_pcre *) external_re; + + real_pcre internal_re; + const real_pcre *external_re = (const real_pcre *) argument_re; + const real_pcre *re = external_re; /* Plausibility checks */ @@ -7428,6 +8864,8 @@ pcre_exec(const pcre * external_re, const pcre_extra * extra_data, return PCRE_ERROR_BADOPTION; if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; + if (offsetcount < 0) + return PCRE_ERROR_BADCOUNT; /* Fish out the optional data from the extra_data structure, first setting the default values. */ @@ -7436,6 +8874,10 @@ the default values. */ match_block.match_limit = MATCH_LIMIT; match_block.callout_data = NULL; +/* The table pointer is always in native byte order. */ + + tables = external_re->tables; + if (extra_data != NULL) { register unsigned int flags = extra_data->flags; if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) @@ -7444,19 +8886,42 @@ the default values. */ match_block.match_limit = extra_data->match_limit; if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) match_block.callout_data = extra_data->callout_data; + if ((flags & PCRE_EXTRA_TABLES) != 0) + tables = extra_data->tables; } -/* Now we have re supposedly pointing to the regex */ +/* If the exec call supplied NULL for tables, use the inbuilt ones. This +is a feature that makes it possible to save compiled regex and re-use them +in other programs later. */ + + if (tables == NULL) + tables = _pcre_default_tables; + +/* Check that the first field in the block is the magic number. If it is not, +test for a regex that was compiled on a host of opposite endianness. If this is +the case, flipped values are put in internal_re and internal_study if there was +study data too. */ + + if (re->magic_number != MAGIC_NUMBER) { + re = _pcre_try_flipped(re, &internal_re, study, &internal_study); + if (re == NULL) + return PCRE_ERROR_BADMAGIC; + if (study != NULL) + study = &internal_study; + } - if (re->magic_number != MAGIC_NUMBER) - return PCRE_ERROR_BADMAGIC; +/* Set up other data */ anchored = ((re->options | options) & PCRE_ANCHORED) != 0; startline = (re->options & PCRE_STARTLINE) != 0; + firstline = (re->options & PCRE_FIRSTLINE) != 0; + +/* The code starts after the real_pcre block and the capture name table. */ match_block.start_code = - (const uschar *) re + sizeof(real_pcre) + + (const uschar *) external_re + re->name_table_offset + re->name_count * re->name_entry_size; + match_block.start_subject = (const uschar *) subject; match_block.start_offset = start_offset; match_block.end_subject = match_block.start_subject + length; @@ -7468,11 +8933,22 @@ the default values. */ match_block.notbol = (options & PCRE_NOTBOL) != 0; match_block.noteol = (options & PCRE_NOTEOL) != 0; match_block.notempty = (options & PCRE_NOTEMPTY) != 0; + match_block.partial = (options & PCRE_PARTIAL) != 0; + match_block.hitend = FALSE; match_block.recursive = NULL; /* No recursion at top level */ - match_block.lcc = re->tables + lcc_offset; - match_block.ctypes = re->tables + ctypes_offset; + match_block.lcc = tables + lcc_offset; + match_block.ctypes = tables + ctypes_offset; + +/* Partial matching is supported only for a restricted set of regexes at the +moment. */ + + if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0) + return PCRE_ERROR_BADPARTIAL; + +/* Check a UTF-8 string if required. Unfortunately there's no way of passing +back the character offset. */ /* The ims options can vary during the matching as a result of the presence @@ -7482,7 +8958,7 @@ restoring at the exit of a group is easy. */ ims = re->options & (PCRE_CASELESS | PCRE_MULTILINE | PCRE_DOTALL); /* If the expression has got more back references than the offsets supplied can -hold, we get a temporary bit of working store to use during the matching. +hold, we get a temporary chunk of working store to use during the matching. Otherwise, we can use the vector supplied, rounding down its size to a multiple of 3. */ @@ -7494,7 +8970,6 @@ of 3. */ if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; using_temporary_offsets = TRUE; - DPRINTF(("Got memory to hold back references\n")); } else match_block.offset_vector = offsets; @@ -7546,22 +9021,38 @@ character" set. */ if ((re->options & PCRE_REQCHSET) != 0) { req_byte = re->req_byte & 255; req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; - req_byte2 = (re->tables + fcc_offset)[req_byte]; /* case flipped */ + req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ } /* Loop for handling unanchored repeated matching attempts; for anchored regexs the loop runs just once. */ do { - register int *iptr = match_block.offset_vector; - register int *iend = iptr + resetcount; + const uschar *save_end_subject = end_subject; /* Reset the maximum number of extractions we might see. */ - while (iptr < iend) - *iptr++ = -1; + if (match_block.offset_vector != NULL) { + register int *iptr = match_block.offset_vector; + register int *iend = iptr + resetcount; + while (iptr < iend) + *iptr++ = -1; + } + + /* Advance to a unique first char if possible. If firstline is TRUE, the + start of the match is constrained to the first line of a multiline string. + Implement this by temporarily adjusting end_subject so that we stop scanning + at a newline. If the match fails at the newline, later code breaks this loop. + */ + + if (firstline) { + const uschar *t = start_match; + while (t < save_end_subject && *t != '\n') + t++; + end_subject = t; + } - /* Advance to a unique first char if possible */ + /* Now test for a unique first byte */ if (first_byte >= 0) { if (first_byte_caseless) @@ -7586,7 +9077,7 @@ the loop runs just once. */ else if (start_bits != NULL) { while (start_match < end_subject) { - register int c = *start_match; + register unsigned int c = *start_match; if ((start_bits[c / 8] & (1 << (c & 7))) == 0) start_match++; else @@ -7594,6 +9085,16 @@ the loop runs just once. */ } } + /* Restore fudged end_subject */ + + end_subject = save_end_subject; + +#ifdef DEBUG /* Sigh. Some compilers never learn. */ + printf(">>>> Match against: "); + pchars(start_match, end_subject - start_match, TRUE, &match_block); + printf("\n"); +#endif + /* If req_byte is set, we know that that character must appear in the subject for the match to succeed. If the first character is set, req_byte must be later in the subject; otherwise the test starts at the match point. This @@ -7605,9 +9106,13 @@ the loop runs just once. */ HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary patterns. This showed up when somebody was matching /^C/ on a 32-megabyte string... so we - don't do this when the string is sufficiently long. */ + don't do this when the string is sufficiently long. - if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) { + ALSO: this processing is disabled when partial matching is requested. + */ + + if (req_byte >= 0 && + end_subject - start_match < REQ_BYTE_MAX && !match_block.partial) { register const uschar *p = start_match + ((first_byte >= 0) ? 1 : 0); /* We don't need to repeat the search if we haven't yet reached the @@ -7657,13 +9162,21 @@ the loop runs just once. */ rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL, match_isgroup); + /* When the result is no match, if the subject's first character was a + newline and the PCRE_FIRSTLINE option is set, break (which will return + PCRE_ERROR_NOMATCH). The option requests that a match occur before the first + newline in the subject. Otherwise, advance the pointer to the next character + and continue - but the continuation will actually happen only when the + pattern is not anchored. */ + if (rc == MATCH_NOMATCH) { + if (firstline && *start_match == NEWLINE) + break; start_match++; continue; } if (rc != MATCH_MATCH) { - DPRINTF((">>>> error: returning %d\n", rc)); return rc; } @@ -7674,12 +9187,10 @@ the loop runs just once. */ if (offsetcount >= 4) { memcpy(offsets + 2, match_block.offset_vector + 2, (offsetcount - 2) * sizeof(int)); - DPRINTF(("Copied offsets from temporary memory\n")); } if (match_block.end_offset_top > offsetcount) match_block.offset_overflow = TRUE; - DPRINTF(("Freeing temporary memory\n")); free(match_block.offset_vector); } @@ -7692,7 +9203,6 @@ the loop runs just once. */ offsets[1] = match_block.end_match_ptr - match_block.start_subject; } - DPRINTF((">>>> returning %d\n", rc)); return rc; } @@ -7701,13 +9211,14 @@ the loop runs just once. */ while (!anchored && start_match <= end_subject); if (using_temporary_offsets) { - DPRINTF(("Freeing temporary memory\n")); free(match_block.offset_vector); } - DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); - - return PCRE_ERROR_NOMATCH; + if (match_block.partial && match_block.hitend) { + return PCRE_ERROR_PARTIAL; + } else { + return PCRE_ERROR_NOMATCH; + } } -/* End of pcre.c */ +/* End of pcre_exec.c */