/* $Header: /home/agc/src/libutf-2.10/RCS/ure.c,v 1.23 1997/10/20 15:06:34 agc Exp agc $ */ /* * Copyright © 1996-1997 Alistair G. Crooks. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Alistair G. Crooks. * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #ifdef HAVE_SYS_TYPES_H #include #endif #include #include #ifdef HAVE_STDLIB_H #include #endif #ifdef HAVE_UNISTD_H #include #endif #ifdef HAVE_STRING_H #include #endif #ifdef HAVE_LIMITS_H #include #endif #include "utf.h" #include "ure.h" /*************************************************************************/ /* structural regular expression routines */ /* * Changed by agc for various uses to include URE_Subst, * and to make the functions use ANSI C type arguments. * Subsequent changes to: * - make regexps into SREs * - include \<\>, \s\S, \w\W, \p\P, \d\D recognition * - added '@' metacharacter for any char including \n * - added {min,max} recognition of strings * - do case insensitive searching on execution, not compilation * - added SRE_SetAlphabet, and use the alphabets * - calculate character ranges on execution, not compilation * - modified to use 16-bit unicode characters, rename SRE to URE. */ /* * urecomp and ureexec -- URE_Subst and ureerror are elsewhere * * Copyright (c) 1986 by University of Toronto. * Written by Henry Spencer. Not derived from licensed software. * * Permission is granted to anyone to use this software for any * purpose on any computer system, and to redistribute it freely, * subject to the following restrictions: * * 1. The author is not responsible for the consequences of use of * this software, no matter how awful, even if they arise * from defects in it. * * 2. The origin of this software must not be misrepresented, either * by explicit claim or by omission. * * 3. Altered versions must be plainly marked as such, and must not * be misrepresented as being the original software. * * Beware that some of this code is subtly aware of the way operator * precedence is structured in regular expressions. Serious changes in * regular-expression syntax might require a total rethink. */ /* * The "internal use only" fields in regexp.h are present to pass info from * compile to execute that permits the execute phase to run lots faster on * simple cases. They are: * * u_start char that must begin a match; '\0' if none obvious * u_anch is the match anchored (at beginning-of-line only)? * u_must string (pointer into u_prog) that match must include, or NULL * u_mlen length of u_must string * * u_start and u_anch permit very fast decisions on suitable * starting points for a match, cutting down the work a lot. u_must * permits fast rejection of lines that cannot possibly match. The * u_must tests are costly enough that urecomp() supplies a * u_must only if the r.e. contains something potentially expensive * (at present, the only such thing detected is * or + at the start of * the r.e., which can involve a lot of backup). u_mlen is supplied * because the test in ureexec() needs it and urecomp() is * computing it anyway. */ /* * Structure for URE "program". This is essentially a linear encoding * of a nondeterministic finite-state machine (aka syntax charts or * "railroad normal form" in parsing technology). Each node is an opcode * plus a "next" pointer, possibly plus an operand. "Next" pointers of * all nodes except BRANCH implement concatenation; a "next" pointer with * a BRANCH on both ends of it is connecting two alternatives. (Here we * have one of the subtle syntax dependencies: an individual BRANCH (as * opposed to a collection of them) is never concatenated with anything * because of operator precedence.) The operand of some types of node is * a literal string; for others, it is a node leading into a sub-FSM. In * particular, the operand of a BRANCH node is the first node of the branch. * (NB this is *not* a tree structure: the tail of the branch connects * to the thing following the set of BRANCHes.) The opcodes are: */ enum URE_Opcodes { END, /* no End of program. */ BOL, /* no Match "" at beginning of line. */ EOL, /* no Match "" at end of line. */ ANYNONL, /* no Match any one character (not \n). */ ANYOF, /* str Match any character in this string. */ ANYBUT, /* str Match any character not in this string. */ BRANCH, /* node Match this alternative, or the next... */ BACK, /* no Match "", "next" ptr points backward. */ EXACTLY, /* str Match this string. */ NOTHING, /* no Match empty string. */ STAR, /* node Match this (simple) thing 0 or more times. */ PLUS, /* node Match this (simple) thing 1 or more times. */ BEGWORD, /* node Match "" between nonword and word. */ ENDWORD, /* node Match "" between word and nonword. */ WHITESP, /* node Match any whitespace character */ NWHITESP, /* node Match any nonwhitespace character */ ALNUM, /* node Match any alphanumeric, include _ */ NALNUM, /* node inverse above, including BOL and EOL */ DIGIT, /* node Match any digit */ NDIGIT, /* node Match any non-digit */ PRINT, /* node Match any printable char (including whitesp) */ NPRINT, /* node Match any non-printable char */ MINMAX, /* node Match this thing (min, max) times */ ANY, /* no Match any character, even \n */ BORDER, /* node Match "" border between nonword and word. */ NBORDER, /* node Match "" non-border between nonword and word. */ CLASSRANGE, /* used internally for class range matching */ CLASSPOSIX, /* used internally for posix class matching */ CLASSRUNE, /* used internally for class rune matching */ OPEN = 0x2000, /* no Mark this point in input as start of #n. */ /* OPEN+1 is number 1, etc. */ CLOSE = 0x4000 /* no Analogous to OPEN. */ }; /* * Opcode notes: * * BRANCH The set of branches constituting a single choice are hooked * together with their "next" pointers, since precedence prevents * anything being concatenated to any individual branch. The * "next" pointer of the last BRANCH in a choice points to the * thing following the whole choice. This is also where the * final "next" pointer of each individual branch points; each * branch starts with the operand node of a BRANCH node. * * BACK Normal "next" pointers all implicitly point forward; BACK * exists to make loop structures possible. * * STAR,PLUS '?', and complex '*' and '+', are implemented as circular * BRANCH structures using BACK. Simple cases (one character * per match) are implemented with STAR and PLUS for speed * and to minimize recursive plunges. * * OPEN,CLOSE ...are numbered at compile time. */ /* * This used to be in "regmagic.h" - agc */ /* * The first rune of the ure_t internal "program" is actually this magic * number; the start node begins in the second byte. */ #define URE_MAGIC 0x2341 /* * A node is one char of opcode followed by two chars of "next" pointer. * "Next" pointers are stored as two 16-bit pieces, high order first. The * value is a positive offset from the opcode of the node containing it. * An operand, if any, simply follows the node. (Note that much of the * code generation knows about this implicit relationship.) * * Using four bytes for the "next" pointer is VAST overkill for most things, * but allows patterns to get big without disasters. */ #define OP(p) (*(p)) #define NEXT(p) (((*((p)+1) & 0xffff)<<16) + (*((p)+2) & 0xffff)) #define OPERAND(p) ((p) + 3) /* * Utility definitions. */ #define UCHARAT(p) (*(p)) #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{') #define META "^$.@[()|?+*\\{" /* * Flags to be passed up and down. */ #define HASWIDTH 01 /* Known never to match null string. */ #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ #define SPSTART 04 /* Starts with * or +. */ #define WORST 0 /* Worst case. */ /* This struct describes the internals of utf re's. Used in compiling */ typedef struct gubbins { unsigned int g_parc; /* # of parens */ long g_size; /* size of expr */ Rune *g_code; /* pointer to position in expr */ char *g_parse; /* position in input string */ } gubbins_t; /* Sorry, my little boy's going through an "anatomy" phase. */ /* used in matching phase */ typedef struct tummy { char *t_input; /* input string */ char *t_bol; /* pointer to beginning of line */ char *t_start; /* start of string */ int t_matchc; /* # of slots in t_matchv */ urematch_t *t_matchv; /* matching array */ ureoff_t t_off; /* offset from start of string */ } tummy_t; /* maximum number of distinct ranges that can be specified in one class */ #ifndef MAXRANGE #define MAXRANGE 1024 #endif /* * Forward declarations for urecomp()'s friends. */ static Rune *ure(int paren, int *flagp, int *ep, int sizing, gubbins_t *gp); /* - ureNext - dig the "next" pointer out of a node */ static Rune * ureNext(Rune *p, int sizing) { register int offset; if (sizing) { return NULL; } offset = NEXT(p); if (offset == 0) { return NULL; } if (OP(p) == BACK) { return p-offset; } else { return p+offset; } } /* - ureTail - set the next-pointer at the end of a node chain */ static void ureTail(Rune *p, Rune *val, int sizing, gubbins_t *gp) { register Rune *scan; register Rune *temp; register int offset; if (!sizing) { /* Find last node. */ scan = p; for (;;) { temp = ureNext(scan, sizing); if (temp == NULL) { break; } scan = temp; } if (OP(scan) == BACK) { offset = scan - val; } else { offset = val - scan; } *(scan+1) = (offset>>16) & 0xffff; *(scan+2) = offset & 0xffff; } } /* - ureOpTail - ureTail on operand of first argument; nop if operandless */ static void ureOpTail(Rune *p, Rune *val, int sizing, gubbins_t *gp) { /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p != NULL && !sizing && OP(p) == BRANCH) { ureTail(OPERAND(p), val, sizing, gp); } } /* - ureInsert - insert an operator in front of already-emitted operand * * Means relocating the operand. */ static void ureInsert(Rune op, Rune *opnd, int sizing, gubbins_t *gp) { register Rune *src; register Rune *dst; register Rune *place; if (sizing) { gp->g_size += 3; } else { src = gp->g_code; gp->g_code += 3; dst = gp->g_code; while (src > opnd) { *--dst = *--src; } place = opnd; /* Op node, where operand used to be. */ *place++ = op; *place++ = '\0'; *place++ = '\0'; } } /* - ureC - emit (if appropriate) a byte of code */ static void ureC(Rune b, int sizing, gubbins_t *gp) { if (sizing) { gp->g_size++; } else { *gp->g_code++ = b; } } /* - ureNode - emit a node */ static Rune * /* Location. */ ureNode(Rune op, int sizing, gubbins_t *gp) { register Rune *ret; register Rune *ptr; if (sizing) { gp->g_size += 3; return gp->g_code; } ptr = ret = gp->g_code; *ptr++ = op; *ptr++ = '\0'; /* Null "next" pointer. */ *ptr++ = '\0'; gp->g_code = ptr; return ret; } static Rune * compileRange(int sizing, gubbins_t *gp, int *errp) { Rune rangev[MAXRANGE][2]; Rune *ret; Rune r2; Rune r; int rangec; int i; i = chartorune(&r, gp->g_parse); if (r == '^') { /* Complement of range. */ ret = ureNode(ANYBUT, sizing, gp); gp->g_parse += i; i = chartorune(&r, gp->g_parse); } else { ret = ureNode(ANYOF, sizing, gp); } if (r == ']' || r == '-') { ureC(r, sizing, gp); gp->g_parse += i; i = chartorune(&r, gp->g_parse); } for (rangec = 0 ; r != 0 && r != ']' ; ) { /* XXX agc escape \\ ??? XXX */ (void) chartorune(&r2, gp->g_parse + i); if (r2 == '-') { rangev[rangec][0] = r; gp->g_parse += i; i = chartorune(&r, gp->g_parse); gp->g_parse += i; i = chartorune(&r, gp->g_parse); rangev[rangec++][1] = r; } else { ureC(r, sizing, gp); } gp->g_parse += i; i = chartorune(&r, gp->g_parse); } if (r != ']') { *errp = URE_ERR_UNMATCHED_BRACKET; return (Rune *) NULL; } gp->g_parse += i; ureC(0, sizing, gp); ureC(rangec, sizing, gp); for (i = 0 ; i < rangec ; i++) { ureC(rangev[i][0], sizing, gp); ureC(rangev[i][1], sizing, gp); } return ret; } /* if (strncmp(gp->g_parse, "alnum:]", 7) == 0) { } else if (strncmp(gp->g_parse, "alpha:]", 7) == 0) { } else if (strncmp(gp->g_parse, "blank:]", 7) == 0) { } else if (strncmp(gp->g_parse, "cntrl:]", 7) == 0) { } else if (strncmp(gp->g_parse, "digit:]", 7) == 0) { } else if (strncmp(gp->g_parse, "graph:]", 7) == 0) { } else if (strncmp(gp->g_parse, "lower:]", 7) == 0) { } else if (strncmp(gp->g_parse, "print:]", 7) == 0) { } else if (strncmp(gp->g_parse, "punct:]", 7) == 0) { } else if (strncmp(gp->g_parse, "space:]", 7) == 0) { } else if (strncmp(gp->g_parse, "upper:]", 7) == 0) { } else if (strncmp(gp->g_parse, "xdigit:]", 8) == 0) { } */ /* - ureAtom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. */ static Rune * ureAtom(int *flagp, int *errp, int sizing, gubbins_t *gp) { register Rune *ret; Rune r; int flags; int i; *flagp = WORST; /* Tentatively. */ i = chartorune(&r, gp->g_parse); gp->g_parse += i; switch (r) { case '^': ret = ureNode(BOL, sizing, gp); break; case '$': ret = ureNode(EOL, sizing, gp); break; case '.': ret = ureNode(ANYNONL, sizing, gp); *flagp |= HASWIDTH|SIMPLE; break; case '@': ret = ureNode(ANY, sizing, gp); *flagp |= HASWIDTH|SIMPLE; break; case '[': if ((ret = compileRange(sizing, gp, errp)) == (Rune *) NULL) { return (Rune *) NULL; } *flagp |= HASWIDTH|SIMPLE; break; case '(': if ((ret = ure(1, &flags, errp, sizing, gp)) == (Rune *) NULL) { return (Rune *) NULL; } *flagp |= flags&(HASWIDTH|SPSTART); break; case '\0': case '|': case ')': /* Supposed to be caught earlier. */ *errp = URE_ERR_INTERNAL_MESS; return (Rune *) NULL; case '?': case '+': case '*': case '{': *errp = URE_ERR_REPEAT_FOLLOWS_NOTHING; return (Rune *) NULL; case '\\': i = chartorune(&r, gp->g_parse); if (r == 0) { *errp = URE_ERR_TRAILING_BACKSLASH; return (Rune *) NULL; } switch(r) { case '<': ret = ureNode(BEGWORD, sizing, gp); break; case '>': ret = ureNode(ENDWORD, sizing, gp); break; case 'b': ret = ureNode(BORDER, sizing, gp); break; case 'B': ret = ureNode(NBORDER, sizing, gp); break; case 'd': ret = ureNode(DIGIT, sizing, gp); *flagp |= (HASWIDTH|SIMPLE); break; case 'D': ret = ureNode(NDIGIT, sizing, gp); *flagp |= (HASWIDTH|SIMPLE); break; case 'n' : ret = ureNode(EXACTLY, sizing, gp); ureC('\n', sizing, gp); ureC('\0', sizing, gp); *flagp |= (HASWIDTH|SIMPLE); break; case 'p': ret = ureNode(PRINT, sizing, gp); *flagp |= HASWIDTH|SIMPLE; break; case 'P': ret = ureNode(NPRINT, sizing, gp); *flagp |= HASWIDTH|SIMPLE; break; case 's': ret = ureNode(WHITESP, sizing, gp); *flagp |= HASWIDTH|SIMPLE; break; case 'S': ret = ureNode(NWHITESP, sizing, gp); *flagp |= HASWIDTH|SIMPLE; break; case 't' : ret = ureNode(EXACTLY, sizing, gp); ureC('\t', sizing, gp); ureC('\0', sizing, gp); *flagp |= (HASWIDTH|SIMPLE); break; case 'w': ret = ureNode(ALNUM, sizing, gp); *flagp |= HASWIDTH|SIMPLE; break; case 'W': ret = ureNode(NALNUM, sizing, gp); *flagp |= HASWIDTH|SIMPLE; break; default : if (r == 'u' && isxdigit(*(gp->g_parse + i)) && isxdigit(*(gp->g_parse + i + 1)) && isxdigit(*(gp->g_parse + i + 2)) && isxdigit(*(gp->g_parse + i + 3))) { /* check next 4 chars are hex digits, so rune */ r = AsciiToNumber(gp->g_parse + i, 4, 16); gp->g_parse += 4; } ret = ureNode(EXACTLY, sizing, gp); ureC(r, sizing, gp); ureC('\0', sizing, gp); *flagp |= HASWIDTH|SIMPLE; } gp->g_parse += i; break; default: { register int len; Rune ender; int rc; gp->g_parse -= i; /* look for a meta in the rest of gp->g_parse */ len = utfcspan(gp->g_parse, META, &rc); if (len <= 0) { *errp = URE_ERR_INTERNAL_DISASTER; return (Rune *) NULL; } (void) chartorune(&ender, gp->g_parse + len); if (rc > 1 && ISMULT(ender)) { /* Back off clear of ?+* operand. */ rc--; } *flagp |= HASWIDTH; if (rc == 1) { *flagp |= SIMPLE; } ret = ureNode(EXACTLY, sizing, gp); while (rc > 0) { i = chartorune(&r, gp->g_parse); ureC(r, sizing, gp); gp->g_parse += i; rc--; } ureC(0, sizing, gp); } break; } return ret; } /* - urePiece - something followed by possible [*+?{] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ static Rune * urePiece(int *flagp, int *errp, int sizing, gubbins_t *gp) { register Rune *next; register Rune *ret; Rune op; Rune r; Rune max; Rune min; int flags; int i; ret = ureAtom(&flags, errp, sizing, gp); if (ret == NULL) { return NULL; } i = chartorune(&op, gp->g_parse); if (!ISMULT(op)) { *flagp = flags; return ret; } if (!(flags&HASWIDTH) && op != '?') { *errp = URE_ERR_OPERAND_EMPTY; return (Rune *) NULL; } *flagp = (op != '+' && op != '{') ? (WORST|SPSTART) : (WORST|HASWIDTH); if (op == '*' && (flags&SIMPLE)) { ureInsert(STAR, ret, sizing, gp); } else if (op == '*') { /* Emit x* as (x&|), where & means "self". */ ureInsert(BRANCH, ret, sizing, gp); /* Either x */ ureOpTail(ret, ureNode(BACK, sizing, gp), sizing, gp); /* and loop */ ureOpTail(ret, ret, sizing, gp); /* back */ ureTail(ret, ureNode(BRANCH, sizing, gp), sizing, gp); /* or */ ureTail(ret, ureNode(NOTHING, sizing, gp), sizing, gp); /* null. */ } else if (op == '+' && (flags&SIMPLE)) { ureInsert(PLUS, ret, sizing, gp); } else if (op == '+') { /* Emit x+ as x(&|), where & means "self". */ next = ureNode(BRANCH, sizing, gp); /* Either */ ureTail(ret, next, sizing, gp); ureTail(ureNode(BACK, sizing, gp), ret, sizing, gp); /* loop back */ ureTail(next, ureNode(BRANCH, sizing, gp), sizing, gp); /* or */ ureTail(ret, ureNode(NOTHING, sizing, gp), sizing, gp); /* null. */ } else if (op == '{') { min = max = 0; gp->g_parse += i; for (;;) { i = chartorune(&r, gp->g_parse); if (r == 0 || !isdigit(r)) { break; } min = (min * 10) + (r - '0'); gp->g_parse += i; } gp->g_parse += i; for (;;) { i = chartorune(&r, gp->g_parse); if (r == 0 || !isdigit(r)) { break; } max = (max * 10) + (r - '0'); gp->g_parse += i; } ureInsert(max, ret, sizing, gp); next = OPERAND(ret); ureInsert(min, ret, sizing, gp); next = OPERAND(next); ureInsert(MINMAX, ret, sizing, gp); ureTail(ret, OPERAND(next), sizing, gp); /* MINMAX->next = x */ } else if (op == '?') { /* Emit x? as (x|) */ ureInsert(BRANCH, ret, sizing, gp); /* Either x */ ureTail(ret, ureNode(BRANCH, sizing, gp), sizing, gp); /* or */ next = ureNode(NOTHING, sizing, gp); /* null. */ ureTail(ret, next, sizing, gp); ureOpTail(ret, next, sizing, gp); } gp->g_parse += i; (void) chartorune(&r, gp->g_parse); if (ISMULT(r)) { *errp = URE_ERR_NESTED_REPEAT; return (Rune *) NULL; } return ret; } /* - ureBranch - one alternative of an | operator * * Implements the concatenation operator. */ static Rune * ureBranch(int *flagp, int *errp, int sizing, gubbins_t *gp) { register Rune *latest; register Rune *chain; register Rune *ret; Rune r; int flags; int i; *flagp = WORST; /* Tentatively. */ ret = ureNode(BRANCH, sizing, gp); chain = NULL; for (;;) { i = chartorune(&r, gp->g_parse); if (r == 0 || r == '|' || r == ')') { break; } latest = urePiece(&flags, errp, sizing, gp); if (latest == NULL) { return NULL; } *flagp |= flags&HASWIDTH; if (chain == NULL) { /* First piece. */ *flagp |= flags&SPSTART; } else { ureTail(chain, latest, sizing, gp); } chain = latest; } if (chain == NULL) { /* Loop ran zero times. */ (void) ureNode(NOTHING, sizing, gp); } return ret; } /* - ure - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ static Rune * ure(int paren, int *flagp, int *errp, int sizing, gubbins_t *gp) { register Rune *ender; register Rune *ret; register Rune *br; register int parno; Rune r; int flags; int i; *flagp = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { if (gp->g_parc >= URE_NSUBEXP) { *errp = URE_ERR_TOO_MANY_PARENS; return (Rune *) NULL; } parno = gp->g_parc++; ret = ureNode(OPEN+parno, sizing, gp); } else { ret = NULL; parno = 0; } /* Pick up the branches, linking them together. */ br = ureBranch(&flags, errp, sizing, gp); if (br == NULL) { return (Rune *) NULL; } if (ret != NULL) { ureTail(ret, br, sizing, gp); /* OPEN -> first. */ } else { ret = br; } if (!(flags&HASWIDTH)) { *flagp &= ~HASWIDTH; } *flagp |= flags&SPSTART; for (;;) { i = chartorune(&r, gp->g_parse); if (r != '|') { break; } gp->g_parse += i; br = ureBranch(&flags, errp, sizing, gp); if (br == NULL) { return (Rune *) NULL; } ureTail(ret, br, sizing, gp); /* BRANCH -> BRANCH. */ if (!(flags&HASWIDTH)) { *flagp &= ~HASWIDTH; } *flagp |= flags&SPSTART; } /* Make a closing node, and hook it on the end. */ ender = ureNode((paren) ? CLOSE+parno : END, sizing, gp); ureTail(ret, ender, sizing, gp); /* Hook the tails of the branches to the closing node. */ for (br = ret; br != NULL; br = ureNext(br, sizing)) { ureOpTail(br, ender, sizing, gp); } /* Check for proper termination. */ i = chartorune(&r, gp->g_parse); if (paren) { gp->g_parse += i; if (r != ')') { *errp = URE_ERR_UNMATCHED_PAREN; return (Rune *) NULL; } } else if (!paren && r != 0) { if (r == ')') { *errp = URE_ERR_UNMATCHED_PAREN; return (Rune *) NULL; } /* "Can't happen". */ *errp = URE_ERR_JUNK_ON_END; return (Rune *) NULL; } return ret; } /* - urecomp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a * place to put the code. So we cheat: we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the * thing really will compile successfully, and we never have to move the * code and thus invalidate pointers into it. (Note that it has to be in * one piece because free() must be able to free it all.) * * Beware that the optimization-preparation code in here knows about some * of the structure of the compiled URE. */ int urecomp(ure_t *up, char *exp, int cflags) { register Rune *longest; register Rune *scan; register int len; gubbins_t g; Rune dummy; int flags; int err; if (exp == (char *) NULL) { return URE_ERR_NULL_ARG; } /* First pass: determine size, legality. */ g.g_parc = 1; g.g_size = 0L; /* arbirary value (non-null) for g_code here */ g.g_code = &dummy; g.g_parse = exp; ureC(URE_MAGIC, 1, &g); if (ure(0, &flags, &err, 1, &g) == NULL) { return err; } /* Small enough for pointer-storage convention? */ /* agc - actually, this limit is arbitrary - * just enough to trap errors */ if (g.g_size >= 0xffffff) { return URE_ERR_TOO_BIG; } /* Allocate space. */ up->u_prog = (Rune *)malloc((unsigned)(g.g_size * sizeof(Rune))); if (up->u_prog == (Rune *) NULL) { return URE_ERR_OUT_OF_SPACE; } /* Second pass: emit code. */ g.g_parc = 1; g.g_code = up->u_prog; g.g_parse = exp; ureC(URE_MAGIC, 0, &g); if (ure(0, &flags, &err, 0, &g) == NULL) { (void) free(up); return err; } up->u_subc = g.g_parc; /* Actual count of sub-expressions */ up->u_flags = cflags; /* Dig out information for optimizations. */ up->u_start = '\0'; /* Worst-case defaults. */ up->u_anch = 0; up->u_must = NULL; up->u_mlen = 0; scan = up->u_prog+1; /* First BRANCH */ if (OP(ureNext(scan, 0)) == END) { /* Only one top-level choice */ scan = OPERAND(scan); /* Starting-point info. */ if (OP(scan) == EXACTLY) { up->u_start = *OPERAND(scan); } else if ((OP(scan) == BEGWORD || OP(scan) == BORDER) && OP(ureNext(scan, 0)) == EXACTLY) { up->u_start = *OPERAND(ureNext(scan, 0)); } else if (OP(scan) == BOL) { up->u_anch++; } /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * u_must. Resolve ties in favor of later strings, since * the u_start check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. */ if (flags&SPSTART) { longest = NULL; len = 0; for (; scan != NULL; scan = ureNext(scan, 0)) { if (OP(scan) == EXACTLY && UNICODE_strlen(OPERAND(scan)) >= len) { longest = OPERAND(scan); len = UNICODE_strlen(OPERAND(scan)); } } up->u_must = longest; up->u_mlen = len; } } return URE_SUCCESS; } /* * ureexec and friends */ static int MatchClass(Rune *cl, Rune ch) { Rune *cp; int rangec; int i; if (UNICODE_strchr(cp = cl, ch) != (Rune *) NULL) { return 1; } cp += UNICODE_strlen(cp) + 1; for (rangec = *cp++, i = 0 ; i < rangec ; i++, cp += 2) { if (UNICODE_InRange(*cp, *(cp + 1), ch)) { return 1; } } return 0; } /* - ureRepeat - repeatedly match something simple, report how many */ static int ureRepeat(Rune *p, int *errp, tummy_t *tp) { register Rune *opnd; register char *scan; register int count = 0; Rune r; int len; scan = tp->t_input; opnd = OPERAND(p); switch (OP(p)) { case ANYNONL: /* make sure '.' doesn't match a \n character */ for (len = chartorune(&r, scan) ; r != 0 && r != '\n' ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case ANY: /* make sure '@' matches any character */ for (len = chartorune(&r, scan) ; r != 0 ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case EXACTLY: for (len = chartorune(&r, scan) ; *opnd == r ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case ANYOF: for (len = chartorune(&r, scan) ; r != 0 && MatchClass(opnd, r) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case ANYBUT: for (len = chartorune(&r, scan) ; r != 0 && !MatchClass(opnd, r) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case WHITESP : for (len = chartorune(&r, scan) ; r != 0 && UNICODE_isspace(r) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case NWHITESP : for (len = chartorune(&r, scan) ; r != 0 && !UNICODE_isspace(r) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case DIGIT : for (len = chartorune(&r, scan) ; r != 0 && UNICODE_isdigit(r) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case NDIGIT : for (len = chartorune(&r, scan) ; r != 0 && !UNICODE_isdigit(r) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case ALNUM : for (len = chartorune(&r, scan) ; r != 0 && UNICODE_isalnum(r) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case NALNUM : for (len = chartorune(&r, scan) ; r != 0 && !UNICODE_isalnum(r) ; ) { /* XXX - while (*scan != 0 && !UNICODE_IsIdent(*scan)) */ count++; scan += len; len = chartorune(&r, scan); } break; case PRINT : for (len = chartorune(&r, scan) ; r != 0 && (UNICODE_isprint(r) || UNICODE_isspace(r)) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; case NPRINT : for (len = chartorune(&r, scan) ; r != 0 && !(UNICODE_isprint(r) || UNICODE_isspace(r)) ; ) { count++; scan += len; len = chartorune(&r, scan); } break; default: /* Oh dear. Called inappropriately. */ *errp = URE_ERR_EXECUTION_BUG; count = 0; /* Best compromise. */ break; } tp->t_input = scan; return count; } /* return 1 if t_input is at the beginning of a word */ static int IsBegWord(tummy_t *tp) { Rune r; (void) chartorune(&r, tp->t_input); if (UNICODE_IsIdent(r)) { if (tp->t_input == tp->t_bol) { return 1; } (void) priorrune(&r, tp->t_input); if (!UNICODE_IsIdent(r)) { return 1; } } return 0; } /* return 1 if t_input is at the end of a word */ static int IsEndWord(tummy_t *tp) { Rune r; (void) chartorune(&r, tp->t_input); if (!UNICODE_IsIdent(r)) { (void) priorrune(&r, tp->t_input); if (tp->t_input != tp->t_bol && UNICODE_IsIdent(r)) { return 1; } } return 0; } /* return 1 if t_input is at the beginning of a line */ static int IsBegLine(tummy_t *tp) { Rune r; if (tp->t_input == tp->t_bol) { return 1; } (void) priorrune(&r, tp->t_input); return (r == '\n'); } /* compare, case-insensitively, a rune array to a utf string */ static int runeutfncasecmp(Rune *rp, char *up, int len) { Rune r; int diff; int rc; int i; for (diff = rc = 0 ; rc < len ; rc++) { i = chartorune(&r, up); up += i; if ((diff = UNICODE_tolower(r) - UNICODE_tolower(rp[rc])) != 0) { break; } } return diff; } /* - ureMatch - main matching routine * * Conceptually the strategy is simple: check to see whether the current * node matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. */ static int /* 0 failure, 1 success */ ureMatch(Rune *prog, int flags, int *errp, tummy_t *tp) { register Rune *scan; /* Current node. */ Rune *next; /* Next node. */ Rune r; int i; for (scan = prog; scan != (Rune *) NULL ; ) { next = ureNext(scan, 0); switch (OP(scan)) { case BOL: if (!IsBegLine(tp)) { return 0; } break; case EOL: i = chartorune(&r, tp->t_input); if (r != '\0' && r != '\n') { return 0; } break; case BEGWORD: /* Match if current char is the beginning of a word */ if (!IsBegWord(tp)) { return 0; } break; case ENDWORD: if (!IsEndWord(tp)) { return 0; } break; case BORDER: if (!IsBegWord(tp) && !IsEndWord(tp)) { return 0; } break; case NBORDER: if (IsBegWord(tp) || IsEndWord(tp)) { return 0; } break; case WHITESP: /* match single whitespace */ i = chartorune(&r, tp->t_input); if (r != 0 && !isspace(r)) { return 0; } tp->t_input += i; break; case NWHITESP: /* don't match eol, or space or tab */ i = chartorune(&r, tp->t_input); if (r == 0 || isspace(r)) { return 0; } tp->t_input += i; break; case ALNUM: /* includes _ */ i = chartorune(&r, tp->t_input); if (r == 0 || !UNICODE_IsIdent(r)) { return 0; } tp->t_input += i; break; case NALNUM: i = chartorune(&r, tp->t_input); if (r == 0 || UNICODE_IsIdent(r)) { return 0; } tp->t_input += i; break; case DIGIT: i = chartorune(&r, tp->t_input); if (r == 0 || !UNICODE_isdigit(r)) { return 0; } tp->t_input += i; break; case NDIGIT: i = chartorune(&r, tp->t_input); if (r == 0 || UNICODE_isdigit(r)) { return 0; } tp->t_input += i; break; case PRINT: i = chartorune(&r, tp->t_input); if (r == 0 || !(isprint(r) || isspace(r))) { return 0; } tp->t_input += i; break; case NPRINT: i = chartorune(&r, tp->t_input); if (r == 0 || isprint(r) || isspace(r)) { return 0; } tp->t_input += i; break; case ANYNONL: i = chartorune(&r, tp->t_input); if (r == '\0' || r == '\n') { return 0; } tp->t_input += i; break; case ANY: i = chartorune(&r, tp->t_input); if (r == '\0') { return 0; } tp->t_input += i; break; case EXACTLY: { register Rune *opnd; register int len; opnd = OPERAND(scan); /* Inline the first character, for speed. */ if (flags & URE_ICASE) { i = chartorune(&r, tp->t_input); if (UNICODE_tolower(*opnd) != UNICODE_tolower(r)) { return 0; } if ((len = UNICODE_strlen(opnd)) > 1 && runeutfncasecmp(opnd, tp->t_input, len) != 0) { return 0; } } else { i = chartorune(&r, tp->t_input); if (*opnd != r) { return 0; } if ((len = UNICODE_strlen(opnd)) > 1 && runeutfncmp(opnd, tp->t_input, len) != 0) { return 0; } } while (len-- > 0) { i = chartorune(&r, tp->t_input); tp->t_input += i; } } break; case ANYOF: i = chartorune(&r, tp->t_input); if (flags & URE_ICASE) { if (!MatchClass(OPERAND(scan), UNICODE_tolower(r)) && !MatchClass(OPERAND(scan), UNICODE_toupper(r))) { return 0; } } else { if (!MatchClass(OPERAND(scan), r)) { return 0; } } tp->t_input += i; break; case ANYBUT: i = chartorune(&r, tp->t_input); if (flags & URE_ICASE) { if (MatchClass(OPERAND(scan), UNICODE_tolower(r)) && MatchClass(OPERAND(scan), UNICODE_toupper(r))) { return 0; } } else { if (MatchClass(OPERAND(scan), r)) { return 0; } } tp->t_input += i; break; case NOTHING: break; case BACK: break; case BRANCH: { register char *save; int c; if (OP(next) != BRANCH) { /* No choice. */ next = OPERAND(scan); /* Avoid recursion. */ } else { do { save = tp->t_input; if ((c = ureMatch(OPERAND(scan), flags, errp, tp)) == 1 || c == -1) { return c; } tp->t_input = save; scan = ureNext(scan, 0); } while (scan != NULL && OP(scan) == BRANCH); return 0; /* NOTREACHED */ } } break; case STAR: case PLUS: { register Rune nextch; register char *save; register char *cp; register int min; register int num; int j; int c; /* * Lookahead to avoid useless match attempts * when we know what character comes next. */ nextch = '\0'; if (OP(next) == EXACTLY) { nextch = *OPERAND(next); } min = (OP(scan) == STAR) ? 0 : 1; save = tp->t_input; num = ureRepeat(OPERAND(scan), errp, tp); if (num < 0) { return -1; } while (num >= min) { /* If it could work, try it. */ i = chartorune(&r, tp->t_input); if (nextch == '\0' || r == nextch) { if ((c = ureMatch(next, flags, errp, tp)) == 1 || c == -1) { return c; } } /* Couldn't or didn't -- back up. */ for (cp = save, j = 0, --num ; j < num ; j++, cp += i) { i = chartorune(&r, cp); } tp->t_input = cp; } return 0; } break; case MINMAX: { register char *save; register int no; Rune min; Rune max; int c; next = OPERAND(scan); min = OP(next); next = OPERAND(next); max = OP(next); next = OPERAND(next); save = tp->t_input; for (no = 0 ; no < min ; no++) { c = ureMatch(next, flags, errp, tp); if (c == 0 || c == -1) { tp->t_input = save; return c; } } for ( ; no < max ; no++) { c = ureMatch(next, flags, errp, tp); if (c == -1) { return -1; } if (c == 0) { break; } } return 1; } break; case END: return 1; /* Success! */ default: if (OP(scan) & OPEN) { register char *save; register int no; int c; no = OP(scan) - OPEN; save = tp->t_input; if ((c = ureMatch(next, flags, errp, tp)) == 1) { /* * Don't set startp if some later * invocation of the same parentheses * already has. */ if (tp->t_matchv[no].rm_so == -1) { tp->t_matchv[no].rm_so = tp->t_off + (save - tp->t_start); } return 1; } return c; } if (OP(scan) & CLOSE) { register char *save; register int no; int c; no = OP(scan) - CLOSE; save = tp->t_input; if ((c = ureMatch(next, flags, errp, tp)) == 1) { /* * Don't set endp if some later * invocation of the same parentheses * already has. */ if (tp->t_matchv[no].rm_eo == -1) { tp->t_matchv[no].rm_eo = tp->t_off + (save - tp->t_start); } return 1; } return c; } *errp = URE_ERR_MEMORY_CORRUPTION; return -1; } scan = next; } /* * We get here only if there's trouble -- normally "case END" is * the terminating point. */ *errp = URE_ERR_CORRUPTED_POINTERS; return -1; } /* - ureTry - try match at specific point */ static int /* 0 failure, 1 success */ ureTry(ure_t *prog, tummy_t *tp, int flags, int *errp) { register int i; for (i = 0 ; i < tp->t_matchc ; i++) { tp->t_matchv[i].rm_so = tp->t_matchv[i].rm_eo = (ureoff_t) -1; } if (ureMatch(prog->u_prog + 1, flags, errp, tp)) { if (tp->t_matchc > 0) { tp->t_matchv[0].rm_so = (ureoff_t) tp->t_off; tp->t_matchv[0].rm_eo = (ureoff_t) tp->t_off + (tp->t_input - tp->t_start); } return 1; } return 0; } /* - ureexec - match a URE against a string */ int ureexec(ure_t *up, char *string, int matchc, urematch_t *matchv, int eflags, char *collseq) { register char *s; tummy_t t; Rune r; char str[20]; int err; int cc; int i; /* Be paranoid... */ if (up == (ure_t *) NULL || string == (char *) NULL) { return URE_ERR_NULL_PARAM; } /* Check validity of program. */ if (UCHARAT(up->u_prog) != URE_MAGIC) { return URE_ERR_BAD_MAGIC; } /* sort out our language */ urecollseq(collseq); /* * if we were compiled ignoring case-sensitivity, add * case-insensitivity at run-time */ if ((up->u_flags & URE_FLAG_BITS) & URE_ICASE) { eflags |= URE_ICASE; } /* funny range to search */ if ((eflags & URE_FLAG_BITS) & URE_STARTEND) { string += matchv[0].rm_so; /* ends at matchv[0].rm_eo; */ } /* If there is a "must appear" string, look for it. */ if (up->u_must != NULL) { if ((eflags & URE_FLAG_BITS) & URE_ICASE) { s = string; r = UNICODE_toupper(up->u_must[0]); cc = runetochar(str, &r); r = UNICODE_tolower(up->u_must[0]); cc += runetochar(&str[cc], &r); r = 0; (void) runetochar(&str[cc], &r); while ((s = utffindrune(s, str)) != NULL) { if (runeutfncasecmp(up->u_must, s, up->u_mlen) == 0) { break; /* Found it. */ } i = chartorune(&r, s); s += i; } if (s == NULL) /* Not present. */ return URE_NOMATCH; } else { s = string; while ((s = utfrune(s, up->u_must[0])) != NULL) { if (runeutfncmp(up->u_must, s, up->u_mlen) == 0) { break; /* Found it. */ } i = chartorune(&r, s); s += i; } if (s == NULL) /* Not present. */ return URE_NOMATCH; } } /* Mark beginning of line for ^ . */ t.t_bol = ((eflags & URE_FLAG_BITS) & URE_NOTBOL) ? (char *) NULL : string; t.t_matchc = matchc; t.t_matchv = matchv; /* Simplest case: try anchored match for each beginning of line */ if (up->u_anch) { t.t_off = 0; t.t_start = t.t_input = s = string; if (*s == 0) { /* don't match ^ with 0 byte */ return URE_NOMATCH; } if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) { return URE_SUCCESS; } while ((s = utfrune(s, '\n')) != (char *) NULL) { s += chartorune(&r, s); if (*s == 0) { /* don't match ^ with 0 byte */ return URE_NOMATCH; } t.t_bol = s; t.t_off = s - string; t.t_input = t.t_start = s; if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) { return URE_SUCCESS; } } return URE_NOMATCH; } /* Messy cases: unanchored match. */ s = string; if (up->u_start != '\0') { if ((eflags & URE_FLAG_BITS) & URE_ICASE) { /* We know what char it must start with. */ r = UNICODE_toupper(up->u_start); cc = runetochar(str, &r); r = UNICODE_tolower(up->u_start); cc += runetochar(&str[cc], &r); r = 0; (void) runetochar(&str[cc], &r); while ((s = utffindrune(s, str)) != NULL) { t.t_off = s - string; t.t_input = t.t_start = s; if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) { return URE_SUCCESS; } i = chartorune(&r, s); s += i; if (r == '\n') { t.t_bol = s; } } } else { /* We know what char it must start with. */ while ((s = utfrune(s, up->u_start)) != NULL) { t.t_off = s - string; t.t_input = t.t_start = s; if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) { return URE_SUCCESS; } i = chartorune(&r, s); s += i; if (r == '\n') { t.t_bol = s; } } } } else { /* We don't -- general case. */ do { t.t_off = s - string; t.t_input = t.t_start = s; if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) { return URE_SUCCESS; } i = chartorune(&r, s); s += i; if (r == '\n') { t.t_bol = s; } } while (r != 0); } /* Failure. */ return URE_NOMATCH; } /* Textual error messages */ static char *ureErrMsgs[] = { /* URE_SUCCESS */ "No error", /* URE_NOMATCH */ "No matches found", /* URE_ERR_UNMATCHED_BRACKET */ "Unmatched []", /* URE_ERR_INTERNAL_MESS */ "internal urp", /* URE_ERR_REPEAT_FOLLOWS_NOTHING */ "?+*{ follows nothing", /* URE_ERR_TRAILING_BACKSLASH */ "trailing \\", /* URE_ERR_INTERNAL_DISASTER */ "internal disaster", /* URE_ERR_OPERAND_EMPTY */ "*+{ operand could be empty", /* URE_ERR_NESTED_REPEAT */ "nested *?+{", /* URE_ERR_TOO_MANY_PARENS */ "too many ()", /* URE_ERR_UNMATCHED_PAREN */ "unmatched ()", /* URE_ERR_JUNK_ON_END */ "junk on end", /* URE_ERR_NULL_ARG */ "NULL argument", /* URE_ERR_TOO_BIG */ "URE too big", /* URE_ERR_EXECUTION_BUG */ "internal execution problems", /* URE_ERR_MEMORY_CORRUPTION */ "memory corruption", /* URE_ERR_CORRUPTED_POINTERS */ "corrupted pointers", /* URE_ERR_NULL_PARAM */ "NULL parameter", /* URE_ERR_BAD_MAGIC */ "corrupted program", /* URE_ERR_CORRUPTED_OPCODE */ "corrupted opcode", /* URE_ERR_SUBST_NULL_ARG */ "NULL parm to URE_Subst", /* URE_ERR_SUBST_BAD_MAGIC */ "damaged URE in URE_Subst", /* URE_ERR_SUBST_DAMAGED_MATCH */ "damaged match string", /* URE_ERR_OUT_OF_SPACE */ "out of space", /* URE_ERR_UNKNOWN */ "unknown error" }; /* - ureerror - the interface to errors */ int ureerror(int errnum, ure_t *up, char *buf, int size) { char *s; int len; if (errnum < 0 || errnum > URE_ERR_OUT_OF_SPACE) { errnum = URE_ERR_UNKNOWN; } s = ureErrMsgs[errnum]; len = strlen(s) + 1; if (size > 0) { if (size > len) (void) strcpy(buf, s); else { (void) strncpy(buf, s, size-1); buf[size-1] = '\0'; } } return len; } /* Free the space we allocated */ void urefree(ure_t *up) { (void) free(up->u_prog); }