Bug 332173 - Problems with regexp parsing of '~' in nsIZipReader.findEntries (and other nsWildCard uses). Patch by Nelson Bolyard <nelson@bolyard.me>, r=jwalden, a=ss

git-svn-id: svn://10.0.0.236/trunk@257851 18797224-902f-48f8-a5cc-f745e15eee43
2009-07-29 21:12:47 +00:00 · 2009-07-29 21:12:47 +00:00 · 15334bed7b
commit 15334bed7b
parent ea17f6ca1a
3 changed files with 323 additions and 215 deletions
--- a/mozilla/modules/libjar/nsIZipReader.idl
+++ b/mozilla/modules/libjar/nsIZipReader.idl
@ -18,7 +18,7 @@
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 1998
+ * Portions created by the Initial Developer are Copyright (C) 1998-2009
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
@ -27,6 +27,7 @@
 *   Samir Gehani <sgehani@netscape.com>
 *   Mitch Stoltz <mstoltz@netscape.com>
 *   Jeff Walden <jwalden+code@mit.edu>
+ *   Nelson Bolyard <nelson@bolyard.me>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
@ -143,37 +144,35 @@ interface nsIZipReader : nsISupports
     * Returns a string enumerator containing the matching entry names.
     *
     * @param aPattern
-     *   A regular expression used to find matching entries in the zip file.
+     *   A globbing pattern used to find matching names in the zip file.
     *   Set this parameter to null to get all entries; otherwise, use the
     *   following syntax:
     *
     *   o * matches anything
     *   o ? matches one character
     *   o $ matches the end of the string
-     *   o [abc] matches one occurrence of a, b, or c. The only character that
-     *           must be escaped inside the brackets is ].  ^ and - must never
-     *           appear in the first and second positions within the brackets, 
-     *           respectively.  (In the former case, the behavior specified for
-     *           '[^az]' will happen.)
-     *   o [a-z] matches any character between a and z.  The characters a and z
-     *           must either both be letters or both be numbers, with the
-     *           character represented by 'a' having a lower ASCII value than
-     *           the character represented by 'z'.
-     *   o [^az] matches any character except a or z.  If ] is to appear inside
-     *           the brackets as a character to not match, it must be escaped.
-     *   o pat~pat2 returns matches to the pattern 'pat' which do not also match
-     *              the pattern 'pat2'.  This may be used to perform filtering
-     *              upon the results of one pattern to remove all matches which
-     *              also match another pattern.  For example, because '*'
-     *              matches any string and '*z*' matches any string containing a
-     *              'z', '*~*z*' will match all strings except those containing
-     *              a 'z'.  Note that a pattern may not use '~' multiple times,
-     *              so a string such as '*~*z*~*y*' is not a valid pattern.
+     *   o [abc] matches one occurrence of a, b, or c. 
+     *   o [^az] matches any character except a or z.  Between brackets,
+     *           the only characters that must be escaped are \ and ].  
+     *   o [a-z] matches any character between a and z, inclusive.  
+     *           The a and z characters must be alphanumeric ASCII characters.
+     *           If one is upper case and one is lower case, then the ASCII
+     *           non-alphanumeric characters between Z and a will be in range.
+     *   o [^a-z] matches any character except those between a and z, inclusive.
+     *            These forms cannot be combined, e.g. [a-gp-z] does not work.
+     *   o yes~no returns matches to the pattern 'yes' that do not also match
+     *            the pattern 'no'.  This may be used to filter the results
+     *            of one pattern to remove all matches of a second pattern.
+     *            Only the outer-most pattern may use this, and at most once.
+     *            For example: *~abc will match any string except abc .
     *   o (foo|bar) will match either the pattern foo or the pattern bar.
-     *               Neither of the patterns foo or bar may use the 'pat~pat2'
-     *               syntax described immediately above.
-     *   o \ will escape a special character.  Escaping is required for all
-     *       special characters unless otherwise specified.
+     *            At least one pipe and two inner patterns are required. 
+     *            More are allowed. These inner patterns may NOT use the 
+     *            'yes~no' syntax described immediately above, and may not 
+     *            contain patterns of this same (foo|bar) form.
+     *   o \ will escape a special character.  To treat special characters as
+     *            ordinary matching characters, escaping is required for all
+     *            special characters, unless otherwise specified above.
     *   o All other characters match case-sensitively.
     *
     *   An aPattern not conforming to this syntax has undefined behavior.
--- a/mozilla/modules/libjar/nsWildCard.cpp
+++ b/mozilla/modules/libjar/nsWildCard.cpp
@ -1,4 +1,3 @@
-/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
@ -16,10 +15,13 @@
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 1998
+ * Portions created by the Initial Developer are Copyright (C) 1998-2009
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
+ *      Rob McCool  (original author)
+ *      Ken Key <key+mozilla@ksquared.net>
+ *      Nelson Bolyard <nelson@bolyard.me>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
@ -36,7 +38,6 @@
 * ***** END LICENSE BLOCK ***** */

 /* *
- * 
 *
 * nsWildCard.cpp: shell-like wildcard match routines
 *
@ -44,7 +45,7 @@
 * a description of the syntax supported by the routines in this file.
 *
 * Rob McCool
- * 
+ *
 */

 #include "nsWildCard.h"
@ -54,82 +55,85 @@
 /* ----------------------------- _valid_subexp ------------------------------ */


-static int 
-_valid_subexp(char *expr, char stop) 
+static int
+_valid_subexp(const char *expr, char stop1, char stop2)
 {
-    register int x,y,t;
-    int nsc,np,tld;
+    register int x;
+    int nsc = 0;     /* Number of special characters */
+    int np;          /* Number of pipe characters in union */
+    int tld = 0;     /* Number of tilde characters */

-    x=0;nsc=0;tld=0;
-
-    while(expr[x] && (expr[x] != stop)) {
+    for (x = 0; expr[x] && (expr[x] != stop1) && (expr[x] != stop2); ++x) {
        switch(expr[x]) {
-          case '~':
-            if(tld) return INVALID_SXP;
-            else ++tld;
-          case '*':
-          case '?':
-          case '^':
-          case '$':
+        case '~':
+            if(tld)                 /* at most one exclusion */
+                return INVALID_SXP;
+            if (stop1)              /* no exclusions within unions */
+                return INVALID_SXP;
+            if (!expr[x+1])          /* exclusion cannot be last character */
+                return INVALID_SXP;
+            if (!x)                 /* exclusion cannot be first character */
+                return INVALID_SXP;
+            ++tld;
+            /* fall through */
+        case '*':
+        case '?':
+        case '$':
            ++nsc;
            break;
-          case '[':
+        case '[':
            ++nsc;
            if((!expr[++x]) || (expr[x] == ']'))
                return INVALID_SXP;
-            for(;expr[x] && (expr[x] != ']');++x)
-                if(expr[x] == '\\')
-                    if(!expr[++x])
-                        return INVALID_SXP;
+            for(; expr[x] && (expr[x] != ']'); ++x) {
+                if(expr[x] == '\\' && !expr[++x])
+                    return INVALID_SXP;
+            }
            if(!expr[x])
                return INVALID_SXP;
            break;
-          case '(':
-            ++nsc;np = 0;
-            while(1) {
-                if(expr[++x] == ')')
-                    return INVALID_SXP;
-                for(y=x;(expr[y]) && (expr[y] != '|') && (expr[y] != ')');++y)
-                    if(expr[y] == '\\')
-                        if(!expr[++y])
-                            return INVALID_SXP;
-                if(!expr[y])
-                    return INVALID_SXP;
-                if(expr[y] == '|')
-                    ++np;
-                t = _valid_subexp(&expr[x],expr[y]);
-                if(t == INVALID_SXP)
+        case '(':
+            ++nsc;
+            if (stop1)                  /* no nested unions */
+                return INVALID_SXP;
+            np = -1;
+            do {
+                int t = _valid_subexp(&expr[++x], ')', '|');
+                if(t == 0 || t == INVALID_SXP)
                    return INVALID_SXP;
                x+=t;
-                if(expr[x] == ')') {
-                    if(!np)
-                        return INVALID_SXP;
-                    break;
-                }
-            }
+                if(!expr[x])
+                    return INVALID_SXP;
+                ++np;
+            } while (expr[x] == '|' );
+            if(np < 1)  /* must be at least one pipe */
+                return INVALID_SXP;
            break;
-          case ')':
-          case ']':
+        case ')':
+        case ']':
+        case '|':
            return INVALID_SXP;
-          case '\\':
+        case '\\':
+            ++nsc;
            if(!expr[++x])
                return INVALID_SXP;
-          default:
+            break;
+        default:
            break;
        }
-        ++x;
    }
-    if((!stop) && (!nsc))
+    if((!stop1) && (!nsc)) /* must be at least one special character */
        return NON_SXP;
-    return ((expr[x] == stop) ? x : INVALID_SXP);
+    return ((expr[x] == stop1 || expr[x] == stop2) ? x : INVALID_SXP);
 }

-int 
-NS_WildCardValid(char *expr) 
+
+int
+NS_WildCardValid(const char *expr)
 {
    int x;

-    x = _valid_subexp(expr, '\0');
+    x = _valid_subexp(expr, '\0', '\0');
    return (x < 0 ? x : VALID_SXP);
 }

@ -141,160 +145,263 @@ NS_WildCardValid(char *expr)
 #define NOMATCH 1
 #define ABORTED -1

-static int _shexp_match(char *str, char *expr, PRBool case_insensitive);
+static int
+_shexp_match(const char *str, const char *expr, PRBool case_insensitive,
+             unsigned int level);

-static int 
-_handle_union(char *str, char *expr, PRBool case_insensitive) 
+/* Count characters until we reach a NUL character or either of the
+ * two delimiter characters, stop1 or stop2.  If we encounter a bracketed
+ * expression, look only for NUL or ']' inside it.  Do not look for stop1
+ * or stop2 inside it. Return ABORTED if bracketed expression is unterminated.
+ * Handle all escaping.
+ * Return index in input string of first stop found, or ABORTED if not found.
+ * If "dest" is non-NULL, copy counted characters to it and NUL terminate.
+ */
+static int
+_scan_and_copy(const char *expr, char stop1, char stop2, char *dest)
 {
-    char *e2 = (char *) PR_Malloc(sizeof(char)*strlen(expr));
-    register int t,p2,p1 = 1;
-    int cp;
+    register int sx;     /* source index */
+    register char cc;

-    while(1) {
-        for(cp=1;expr[cp] != ')';cp++)
-            if(expr[cp] == '\\')
-                ++cp;
-        for(p2 = 0;(expr[p1] != '|') && (p1 != cp);p1++,p2++) {
-            if(expr[p1] == '\\')
-                e2[p2++] = expr[p1++];
-            e2[p2] = expr[p1];
+    for (sx = 0; (cc = expr[sx]) && cc != stop1 && cc != stop2; sx++) {
+        if (cc == '\\') {
+            if (!expr[++sx])
+                return ABORTED; /* should be impossible */
        }
-        for (t=cp+1; ((e2[p2] = expr[t]) != 0); ++t,++p2) {}
-        if(_shexp_match(str,e2, case_insensitive) == MATCH) {
-            PR_Free(e2);
-            return MATCH;
+        else if (cc == '[') {
+            while ((cc = expr[++sx]) && cc != ']') {
+                if(cc == '\\' && !expr[++sx])
+                    return ABORTED;
+            }
+            if (!cc)
+                return ABORTED; /* should be impossible */
        }
-        if(p1 == cp) {
-            PR_Free(e2);
-            return NOMATCH;
-        }
-        else ++p1;
    }
+    if (dest && sx) {
+        /* Copy all but the closing delimiter. */
+        memcpy(dest, expr, sx);
+        dest[sx] = 0;
+    }
+    return cc ? sx : ABORTED; /* index of closing delimiter */
 }

-
-static int 
-_shexp_match(char *str, char *expr, PRBool case_insensitive) 
+/* On input, expr[0] is the opening parenthesis of a union.
+ * See if any of the alternatives in the union matches as a pattern.
+ * The strategy is to take each of the alternatives, in turn, and append
+ * the rest of the expression (after the closing ')' that marks the end of
+ * this union) to that alternative, and then see if the resultant expression
+ * matches the input string.  Repeat this until some alternative matches,
+ * or we have an abort.
+ */
+static int
+_handle_union(const char *str, const char *expr, PRBool case_insensitive,
+              unsigned int level)
 {
-    register int x,y;
+    register int sx;     /* source index */
+    int cp;              /* source index of closing parenthesis */
+    int count;
+    int ret   = NOMATCH;
+    char *e2;
+
+    /* Find the closing parenthesis that ends this union in the expression */
+    cp = _scan_and_copy(expr, ')', '\0', NULL);
+    if (cp == ABORTED || cp < 4) /* must be at least "(a|b" before ')' */
+        return ABORTED;
+    ++cp;                /* now index of char after closing parenthesis */
+    e2 = (char *) PR_Malloc(1 + strlen(expr));
+    if (!e2)
+        return ABORTED;
+    for (sx = 1; ret == NOMATCH && expr[sx] && expr[sx] != ')'; ++sx) {
+        /* Here, expr[sx] is one character past the preceding '(' or '|'. */
+        /* Copy everything up to the next delimiter to e2 */
+        count = _scan_and_copy(expr + sx, ')', '|', e2);
+        if (count == ABORTED || !count) {
+            ret = ABORTED;
+            break;
+        }
+        sx += count;
+        /* Append everything after closing parenthesis to e2. This is safe. */
+        strcpy(e2+count, expr+cp);
+        ret = _shexp_match(str, e2, case_insensitive, level + 1);
+    }
+    PR_Free(e2);
+    if (sx < 2)
+        ret = ABORTED;
+    return ret;
+}
+
+/* returns 1 if val is in range from start..end, case insensitive. */
+static int
+_is_char_in_range(int start, int end, int val)
+{
+    char map[256];
+    memset(map, 0, sizeof map);
+    while (start <= end)
+        map[tolower(start++)] = 1;
+    return map[tolower(val)];
+}
+
+static int
+_shexp_match(const char *str, const char *expr, PRBool case_insensitive,
+             unsigned int level)
+{
+    register int x;   /* input string index */
+    register int y;   /* expression index */
    int ret,neg;

-    ret = 0;
-    for(x=0,y=0;expr[y];++y,++x) {
-        if((!str[x]) && (expr[y] != '(') && (expr[y] != '$') && (expr[y] != '*'))
-            ret = ABORTED;
-        else {
-            switch(expr[y]) {
-              case '$':
-                if( (str[x]) )
-                    ret = NOMATCH;
-                else
-                    --x;             /* we don't want loop to increment x */
-                break;
-              case '*':
-                while(expr[++y] == '*'){}
-                if(!expr[y])
+    if (level > 20)      /* Don't let the stack get too deep. */
+        return ABORTED;
+    for(x = 0, y = 0; expr[y]; ++y, ++x) {
+        if((!str[x]) && (expr[y] != '$') && (expr[y] != '*')) {
+            return NOMATCH;
+        }
+        switch(expr[y]) {
+        case '$':
+            if(str[x])
+                return NOMATCH;
+            --x;                 /* we don't want loop to increment x */
+            break;
+        case '*':
+            while(expr[++y] == '*'){}
+            if(!expr[y])
+                return MATCH;
+            while(str[x]) {
+                ret = _shexp_match(&str[x++], &expr[y], case_insensitive,
+                                   level + 1);
+                switch(ret) {
+                case NOMATCH:
+                    continue;
+                case ABORTED:
+                    return ABORTED;
+                default:
                    return MATCH;
-                while(str[x]) {
-                    switch(_shexp_match(&str[x++],&expr[y], case_insensitive)) {
-                    case NOMATCH:
-                        continue;
-                    case ABORTED:
-                        ret = ABORTED;
-                        break;
-                    default:
-                        return MATCH;
-                    }
-                    break;
                }
-                if((expr[y] == '$') && (expr[y+1] == '\0') && (!str[x]))
-                    return MATCH;
-                else
-                    ret = ABORTED;
-                break;
-              case '[':
-              	neg = ((expr[++y] == '^') && (expr[y+1] != ']'));
-                if (neg)
-                    ++y;
-                
-                if ((isalnum(expr[y])) && (expr[y+1] == '-') && 
-                   (isalnum(expr[y+2])) && (expr[y+3] == ']'))
-                    {
-                        int start = expr[y], end = expr[y+2];
-                        
-                        /* Droolproofing for pinheads not included */
-                        if(neg ^ ((str[x] < start) || (str[x] > end))) {
-                            ret = NOMATCH;
-                            break;
-                        }
-                        y+=3;
+            }
+            if((expr[y] == '$') && (expr[y+1] == '\0') && (!str[x]))
+                return MATCH;
+            else
+                return NOMATCH;
+        case '[': {
+            int start, end = 0, i;
+            neg = ((expr[++y] == '^') && (expr[y+1] != ']'));
+            if (neg)
+                ++y;
+            i = y;
+            start = (unsigned char)(expr[i++]);
+            if (start == '\\')
+                start = (unsigned char)(expr[i++]);
+            if (isalnum(start) && expr[i++] == '-') {
+                end = (unsigned char)(expr[i++]);
+                if (end == '\\')
+                    end = (unsigned char)(expr[i++]);
+            }
+            if (isalnum(end) && expr[i] == ']') {
+                /* This is a range form: a-b */
+                int val   = (unsigned char)(str[x]);
+                if (end < start) { /* swap them */
+                    int tmp = end;
+                    end = start;
+                    start = tmp;
+                }
+                if (case_insensitive && isalpha(val)) {
+                    val = _is_char_in_range(start, end, val);
+                    if (neg == val)
+                        return NOMATCH;
+                }
+                else if (neg != ((val < start) || (val > end))) {
+                    return NOMATCH;
+                }
+                y = i;
+            }
+            else {
+                /* Not range form */
+                int matched = 0;
+                for (; expr[y] != ']'; y++) {
+                    if (expr[y] == '\\')
+                        ++y;
+                    if(case_insensitive) {
+                        matched |= (toupper(str[x]) == toupper(expr[y]));
                    }
-                else {
-                    int matched;
-                    
-                    for (matched=0;expr[y] != ']';y++) {
-                        /* match an escaped ']' character */
-                        if('\\' == expr[y] && ']' == expr[y+1]) {
-                            if(']' == str[x])
-                                matched |= 1;
-                            y++; /* move an extra char to compensate for '\\' */
-                            continue;
-                        }
+                    else {
                        matched |= (str[x] == expr[y]);
                    }
-                    if (neg ^ (!matched))
-                        ret = NOMATCH;
                }
-                break;
-              case '(':
-                return _handle_union(&str[x],&expr[y], case_insensitive);
-                break;
-              case '?':
-                break;
-              case '\\':
-                ++y;
-              default:
-				if(case_insensitive)
-				  {
-                    if(toupper(str[x]) != toupper(expr[y]))
-                        ret = NOMATCH;
-				  }
-				else
-				  {
-                    if(str[x] != expr[y])
-                        ret = NOMATCH;
-				  }
-                break;
+                if (neg == matched)
+                    return NOMATCH;
            }
        }
-        if(ret)
+        break;
+        case '(':
+            if (!expr[y+1])
+                return ABORTED;
+            return _handle_union(&str[x], &expr[y], case_insensitive, level);
+        case '?':
            break;
-    }
-    return (ret ? ret : (str[x] ? NOMATCH : MATCH));
-}
-
-int 
-NS_WildCardMatch(char *str, char *xp, PRBool case_insensitive) {
-    register int x;
-    char *expr = PL_strdup(xp);
-
-	if(!expr)
-		return 1;
-
-    for(x=strlen(expr)-1;x;--x) {
-        if((expr[x] == '~') && (expr[x-1] != '\\')) {
-            expr[x] = '\0';
-            if(_shexp_match(str,&expr[++x], case_insensitive) == MATCH)
-                goto punt;
+        case ')':
+        case ']':
+        case '|':
+            return ABORTED;
+        case '\\':
+            ++y;
+            /* fall through */
+        default:
+            if(case_insensitive) {
+                if(toupper(str[x]) != toupper(expr[y]))
+                    return NOMATCH;
+            }
+            else {
+                if(str[x] != expr[y])
+                    return NOMATCH;
+            }
            break;
        }
    }
-    if(_shexp_match(str,expr, case_insensitive) == MATCH) {
-        PR_Free(expr);
-        return 0;
-    }
-
-  punt:
-    PR_Free(expr);
-    return 1;
+    return (str[x] ? NOMATCH : MATCH);
+}
+
+static int
+ns_WildCardMatch(const char *str, const char *xp, PRBool case_insensitive)
+{
+    char *expr = 0;
+    int x, ret = MATCH;
+
+    if (!strchr(xp, '~'))
+        return _shexp_match(str, xp, case_insensitive, 0);
+
+    expr = PL_strdup(xp);
+    if(!expr)
+        return NOMATCH;
+
+    x = _scan_and_copy(expr, '~', '\0', NULL);
+    if (x != ABORTED && expr[x] == '~') {
+        expr[x++] = '\0';
+        ret = _shexp_match(str, &expr[x], case_insensitive, 0);
+        switch (ret) {
+        case NOMATCH: ret = MATCH;   break;
+        case MATCH:   ret = NOMATCH; break;
+        default:                     break;
+        }
+    }
+    if (ret == MATCH)
+        ret = _shexp_match(str, expr, case_insensitive, 0);
+
+    PR_Free(expr);
+    return ret;
+}
+
+
+int
+NS_WildCardMatch(const char *str, const char *expr, PRBool case_insensitive)
+{
+    int is_valid = NS_WildCardValid(expr);
+    switch(is_valid) {
+        case INVALID_SXP:
+            return -1;
+        case NON_SXP:
+            if (case_insensitive)
+                return (PL_strcasecmp(expr,str) ? NOMATCH : MATCH);
+            return (strcmp(expr,str) ? NOMATCH : MATCH);
+        default:
+            return ns_WildCardMatch(str, expr, case_insensitive);
+    }
 }
--- a/mozilla/modules/libjar/nsWildCard.h
+++ b/mozilla/modules/libjar/nsWildCard.h
@ -16,10 +16,11 @@
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 1998
+ * Portions created by the Initial Developer are Copyright (C) 1998-2009
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
+ *      Nelson Bolyard <nelson@bolyard.me>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
@ -71,7 +72,7 @@
 #define INVALID_SXP -2
 #define VALID_SXP 1

-extern int NS_WildCardValid(char *expr);
+extern int NS_WildCardValid(const char *expr);


 /* return values for the search routines */
@ -87,6 +88,7 @@ extern int NS_WildCardValid(char *expr);
 * Returns 0 on match and 1 on non-match.
 */

-extern int NS_WildCardMatch(char *str, char *expr, PRBool case_insensitive);
+extern int 
+NS_WildCardMatch(const char *str, const char *expr, PRBool case_insensitive);

 #endif /* nsWildCard_h__ */