diff --git a/mozilla/js2/src/bytecodecontainer.cpp b/mozilla/js2/src/bytecodecontainer.cpp index 6c100dc6adf..2545691eecb 100644 --- a/mozilla/js2/src/bytecodecontainer.cpp +++ b/mozilla/js2/src/bytecodecontainer.cpp @@ -49,11 +49,12 @@ #include #include -#include "regexp.h" +#include "strings.h" #include "reader.h" #include "parser.h" #include "js2engine.h" #include "bytecodecontainer.h" +#include "regexp.h" #include "js2metadata.h" diff --git a/mozilla/js2/src/epimetheus.cpp b/mozilla/js2/src/epimetheus.cpp index 0284ef18163..8ac8a3e2d0d 100644 --- a/mozilla/js2/src/epimetheus.cpp +++ b/mozilla/js2/src/epimetheus.cpp @@ -31,6 +31,7 @@ #include #include "world.h" +#include "strings.h" #include "utilities.h" #include "js2value.h" @@ -41,8 +42,8 @@ #include "reader.h" #include "parser.h" -#include "regexp.h" #include "js2engine.h" +#include "regexp.h" #include "bytecodecontainer.h" #include "js2metadata.h" diff --git a/mozilla/js2/src/exception.h b/mozilla/js2/src/exception.h index 7aef1714e5f..db4dd5b3e28 100644 --- a/mozilla/js2/src/exception.h +++ b/mozilla/js2/src/exception.h @@ -34,6 +34,8 @@ #ifndef exception_h___ #define exception_h___ +#include "systemtypes.h" +#include "js2value.h" #include "strings.h" namespace JavaScript @@ -72,6 +74,7 @@ namespace JavaScript size_t charNum; // Character offset within the line that caused the error size_t pos; // Offset within the input of the error String sourceLine; // The text of the source line + js2val value; // The value for a user exception Exception (Kind kind, const char *message): kind(kind), message(widenCString(message)), lineNum(0), charNum(0) {} @@ -89,6 +92,8 @@ namespace JavaScript kind(kind), message(message), sourceFile(sourceFile), lineNum(lineNum), charNum(charNum), pos(pos), sourceLine(sourceLineBegin, sourceLineEnd) {} + Exception(js2val v) : kind(userException), lineNum(0), charNum(0), value(v) {} + bool hasKind(Kind k) const {return kind == k;} const char *kindString() const; String fullMessage() 
const; diff --git a/mozilla/js2/src/js2array.cpp b/mozilla/js2/src/js2array.cpp index bc2f5427a31..27131a3ff13 100644 --- a/mozilla/js2/src/js2array.cpp +++ b/mozilla/js2/src/js2array.cpp @@ -172,7 +172,7 @@ static js2val Array_toSource(JS2Metadata *meta, const js2val thisValue, js2val * return meta->engine->allocString("[]"); else { js2val result; - String *s = new String(); + String *s = new String(widenCString("[")); for (uint32 i = 0; i < length; i++) { if (meta->arrayClass->readPublic(meta, &thatValue, meta->arrayClass, meta->engine->numberToString(i), RunPhase, &result) && !JS2VAL_IS_UNDEFINED(result)) @@ -769,8 +769,9 @@ void initArrayObject(JS2Metadata *meta) { NULL } }; + meta->initBuiltinClass(meta->arrayClass, NULL, Array_Constructor, Array_Constructor); meta->arrayClass->prototype = OBJECT_TO_JS2VAL(new ArrayInstance(meta, OBJECT_TO_JS2VAL(meta->objectClass->prototype), meta->arrayClass)); - meta->initBuiltinClass(meta->arrayClass, &prototypeFunctions[0], NULL, Array_Constructor, Array_Constructor); + meta->initBuiltinClassPrototype(meta->arrayClass, &prototypeFunctions[0]); } } diff --git a/mozilla/js2/src/js2boolean.cpp b/mozilla/js2/src/js2boolean.cpp index 66b05501c5e..d5557cd5c27 100644 --- a/mozilla/js2/src/js2boolean.cpp +++ b/mozilla/js2/src/js2boolean.cpp @@ -105,8 +105,9 @@ namespace MetaData { { NULL } }; + meta->initBuiltinClass(meta->booleanClass, NULL, Boolean_Constructor, Boolean_Call); meta->booleanClass->prototype = OBJECT_TO_JS2VAL(new BooleanInstance(meta, OBJECT_TO_JS2VAL(meta->objectClass->prototype), meta->booleanClass)); - meta->initBuiltinClass(meta->booleanClass, &prototypeFunctions[0], NULL, Boolean_Constructor, Boolean_Call); + meta->initBuiltinClassPrototype(meta->booleanClass, &prototypeFunctions[0]); } } diff --git a/mozilla/js2/src/js2date.cpp b/mozilla/js2/src/js2date.cpp index ef7dea2c49c..3ddb6a79007 100644 --- a/mozilla/js2/src/js2date.cpp +++ b/mozilla/js2/src/js2date.cpp @@ -1479,8 +1479,9 @@ void 
initDateObject(JS2Metadata *meta) LocalTZA = -(PRMJ_LocalGMTDifference() * msPerSecond); + meta->initBuiltinClass(meta->dateClass, &staticFunctions[0], Date_Constructor, Date_Call); meta->dateClass->prototype = OBJECT_TO_JS2VAL(new DateInstance(meta, OBJECT_TO_JS2VAL(meta->objectClass->prototype), meta->dateClass)); - meta->initBuiltinClass(meta->dateClass, &prototypeFunctions[0], &staticFunctions[0], Date_Constructor, Date_Call); + meta->initBuiltinClassPrototype(meta->dateClass, &prototypeFunctions[0]); } diff --git a/mozilla/js2/src/js2engine.cpp b/mozilla/js2/src/js2engine.cpp index 1fb7623bf6b..c79663561c6 100644 --- a/mozilla/js2/src/js2engine.cpp +++ b/mozilla/js2/src/js2engine.cpp @@ -150,8 +150,6 @@ namespace MetaData { else break; } while (true); - if (jsx.hasKind(Exception::userException)) // snatch the exception before the stack gets clobbered - x = pop(); activationStackTop = prev; // need the one before the target function to // be at the top, because of postincrement localFrame = activationStackTop->localFrame; @@ -159,12 +157,11 @@ namespace MetaData { bCon = activationStackTop->bCon; meta->env = activationStackTop->env; } - else { - if (jsx.hasKind(Exception::userException)) - x = pop(); - } // make sure there's a JS object for the catch clause to work with - if (!jsx.hasKind(Exception::userException)) { + if (jsx.hasKind(Exception::userException)) { + x = jsx.value; + } + else { js2val argv[1]; argv[0] = allocString(new String(jsx.fullMessage())); switch (jsx.kind) { diff --git a/mozilla/js2/src/js2error.cpp b/mozilla/js2/src/js2error.cpp index 5ce89ae0578..05bdc1f0f80 100644 --- a/mozilla/js2/src/js2error.cpp +++ b/mozilla/js2/src/js2error.cpp @@ -124,13 +124,10 @@ js2val Error_toString(JS2Metadata *meta, const js2val thisValue, js2val *argv, u static void initErrorClass(JS2Metadata *meta, JS2Class *c, Constructor *constructor) { - c->construct = constructor; + meta->initBuiltinClass(c, NULL, constructor, constructor); c->prototype = 
OBJECT_TO_JS2VAL(new SimpleInstance(meta, meta->errorClass->prototype, meta->errorClass)); - meta->createDynamicProperty(JS2VAL_TO_OBJECT(c->prototype), &meta->world.identifiers["name"], meta->engine->allocString(c->getName()), ReadAccess, true, true); meta->createDynamicProperty(JS2VAL_TO_OBJECT(c->prototype), &meta->world.identifiers["message"], meta->engine->allocString("Message"), ReadAccess, true, true); - - meta->initBuiltinClass(c, NULL, NULL, constructor, constructor); } void initErrorObject(JS2Metadata *meta) @@ -145,10 +142,11 @@ void initErrorObject(JS2Metadata *meta) NamespaceList publicNamespaceList; publicNamespaceList.push_back(meta->publicNamespace); + meta->initBuiltinClass(meta->errorClass, NULL, Error_Constructor, Error_Constructor); meta->errorClass->prototype = OBJECT_TO_JS2VAL(new SimpleInstance(meta, meta->objectClass->prototype, meta->errorClass)); meta->createDynamicProperty(JS2VAL_TO_OBJECT(meta->errorClass->prototype), &meta->world.identifiers["name"], meta->engine->allocString("Error"), ReadAccess, true, true); meta->createDynamicProperty(JS2VAL_TO_OBJECT(meta->errorClass->prototype), &meta->world.identifiers["message"], meta->engine->allocString("Message"), ReadAccess, true, true); - meta->initBuiltinClass(meta->errorClass, errorProtos, NULL, Error_Constructor, Error_Constructor); + meta->initBuiltinClassPrototype(meta->errorClass, errorProtos); initErrorClass(meta, meta->evalErrorClass, EvalError_Constructor); diff --git a/mozilla/js2/src/js2eval.cpp b/mozilla/js2/src/js2eval.cpp index f9473a58532..c581c8f6bc5 100644 --- a/mozilla/js2/src/js2eval.cpp +++ b/mozilla/js2/src/js2eval.cpp @@ -568,6 +568,31 @@ namespace MetaData { float64 d = convertValueToDouble(x); return JS2Engine::float64toInt32(d); } + + // x is not an int + uint32 JS2Metadata::convertValueToUInteger(js2val x) + { + uint32 i; + if (JS2VAL_IS_LONG(x)) { + JSLL_L2UI(i, *JS2VAL_TO_LONG(x)); + return i; + } + if (JS2VAL_IS_ULONG(x)) { + JSLL_UL2UI(i, *JS2VAL_TO_ULONG(x)); + 
return i; + } + if (JS2VAL_IS_FLOAT(x)) { + float64 f = *JS2VAL_TO_FLOAT(x); + return JS2Engine::float64toUInt32(f); + } + if (JS2VAL_IS_DOUBLE(x)) { + float64 d = *JS2VAL_TO_DOUBLE(x); + return JS2Engine::float64toUInt32(d); + } + float64 d = convertValueToDouble(x); + return JS2Engine::float64toUInt32(d); + } + bool defaultReadProperty(JS2Metadata *meta, js2val *base, JS2Class *limit, Multiname *multiname, LookupKind *lookupKind, Phase phase, js2val *rval) { diff --git a/mozilla/js2/src/js2function.cpp b/mozilla/js2/src/js2function.cpp index d044be8b715..7281e155c06 100644 --- a/mozilla/js2/src/js2function.cpp +++ b/mozilla/js2/src/js2function.cpp @@ -150,8 +150,9 @@ namespace MetaData { fnInst->fWrap = new FunctionWrapper(true, new ParameterFrame(JS2VAL_INACCESSIBLE, true), meta->env); fnInst->fWrap->bCon->emitOp(eReturnVoid, 0); + meta->initBuiltinClass(meta->functionClass, NULL, Function_Constructor, Function_Constructor); meta->functionClass->prototype = OBJECT_TO_JS2VAL(fnInst); - meta->initBuiltinClass(meta->functionClass, &prototypeFunctions[0], NULL, Function_Constructor, Function_Constructor); + meta->initBuiltinClassPrototype(meta->functionClass, &prototypeFunctions[0]); } } diff --git a/mozilla/js2/src/js2metadata.cpp b/mozilla/js2/src/js2metadata.cpp index 0ad0a9ca8b3..4548bad6267 100644 --- a/mozilla/js2/src/js2metadata.cpp +++ b/mozilla/js2/src/js2metadata.cpp @@ -50,8 +50,8 @@ #include "numerics.h" #include "reader.h" #include "parser.h" -#include "regexp.h" #include "js2engine.h" +#include "regexp.h" #include "bytecodecontainer.h" #include "js2metadata.h" @@ -1292,7 +1292,7 @@ namespace MetaData { bCon->emitOp(eHandler, p->pos); CatchClause *c = t->catches; // the exception object will be the only thing on the stack - ASSERT(bCon->mStackTop == 0); +// ASSERT(bCon->mStackTop == 0); bCon->mStackTop = 1; if (bCon->mStackMax < 1) bCon->mStackMax = 1; BytecodeContainer::LabelID nextCatch = NotALabel; @@ -2291,8 +2291,12 @@ doUnary: { RegExpExprNode *v 
= checked_cast(p); js2val args[2]; - args[0] = engine->allocString(v->re); - args[1] = engine->allocString(&v->flags); + const String *reStr = engine->allocStringPtr(&v->re); + DEFINE_ROOTKEEPER(rk1, reStr); + const String *flagStr = engine->allocStringPtr(&v->flags); + DEFINE_ROOTKEEPER(rk2, flagStr); + args[0] = STRING_TO_JS2VAL(reStr); + args[1] = STRING_TO_JS2VAL(flagStr); // XXX error handling during this parse? The RegExp_Constructor is // going to call errorPos() on the current bCon. js2val reValue = RegExp_Constructor(this, JS2VAL_NULL, args, 2); @@ -2552,20 +2556,19 @@ doUnary: if (b->op2->getKind() == ExprNode::identifier) { IdentifierExprNode *i = checked_cast(b->op2); -#if 0 + if (*exprType) { - MemberDescriptor m2; Multiname multiname(&i->name); - if (findLocalMember(*exprType, &multiname, ReadAccess, CompilePhase, &m2)) { - if (m2.ns) { - QualifiedName qname(m2.ns, multiname.name); - InstanceMember *m = findInstanceMember(*exprType, &qname, ReadAccess); - if (m->kind == InstanceMember::InstanceVariableKind) - returnRef = new SlotReference(checked_cast(m)->slotIndex); + InstanceMember *mBase = findBaseInstanceMember(*exprType, &multiname, ReadAccess); + if (mBase) { + InstanceMember *m = getDerivedInstanceMember(*exprType, mBase, ReadAccess); + if (m->memberKind == Member::InstanceVariableMember) { + InstanceVariable *mv = checked_cast(m); + returnRef = new (*referenceArena) SlotReference(mv->slotIndex); } } } -#endif + if (returnRef == NULL) { returnRef = new (*referenceArena) DotReference(&i->name); referenceArena->registerDestructor(returnRef); @@ -3473,9 +3476,6 @@ rescan: return JS2VAL_UNDEFINED; } -#define JS7_ISHEX(c) ((c) < 128 && isxdigit(c)) -#define JS7_UNHEX(c) (uint32)(isdigit(c) ? 
(c) - '0' : 10 + tolower(c) - 'a') - /* See ECMA-262 15.1.2.5 */ static js2val GlobalObject_unescape(JS2Metadata *meta, const js2val /* thisValue */, js2val argv[], uint32 argc) { @@ -4424,44 +4424,24 @@ XXX see EvalAttributeExpression, where identifiers are being handled for now... reportError(kind, message, pos, str.c_str()); } - - void JS2Metadata::initBuiltinClass(JS2Class *builtinClass, FunctionData *protoFunctions, FunctionData *staticFunctions, NativeCode *construct, NativeCode *call) + // Called after initBuiltinClass and after the prototype object is constructed + void JS2Metadata::initBuiltinClassPrototype(JS2Class *builtinClass, FunctionData *protoFunctions) { - FunctionData *pf; - - builtinClass->construct = construct; - builtinClass->call = call; - - // Adding "prototype" & "length", etc as static members of the class - not dynamic properties; XXX env->addFrame(builtinClass); { Variable *v = new Variable(builtinClass, OBJECT_TO_JS2VAL(builtinClass->prototype), true); defineLocalMember(env, engine->prototype_StringAtom, NULL, Attribute::NoOverride, false, ReadWriteAccess, v, 0, false); - v = new Variable(builtinClass, INT_TO_JS2VAL(1), true); - defineLocalMember(env, engine->length_StringAtom, NULL, Attribute::NoOverride, false, ReadWriteAccess, v, 0, false); - - pf = staticFunctions; - if (pf) { - while (pf->name) { - FunctionInstance *callInst = new FunctionInstance(this, functionClass->prototype, functionClass); - callInst->fWrap = new FunctionWrapper(true, new ParameterFrame(JS2VAL_INACCESSIBLE, true), pf->code, env); - v = new Variable(functionClass, OBJECT_TO_JS2VAL(callInst), true); - defineLocalMember(env, &world.identifiers[pf->name], NULL, Attribute::NoOverride, false, ReadWriteAccess, v, 0, false); - createDynamicProperty(callInst, engine->length_StringAtom, INT_TO_JS2VAL(pf->length), ReadAccess, true, false); - pf++; - } - } - } + } env->removeTopFrame(); - + // Add "constructor" as a dynamic property of the prototype FunctionInstance *fInst 
= new FunctionInstance(this, functionClass->prototype, functionClass); createDynamicProperty(fInst, engine->length_StringAtom, INT_TO_JS2VAL(1), ReadAccess, true, false); - fInst->fWrap = new FunctionWrapper(true, new ParameterFrame(JS2VAL_INACCESSIBLE, true), construct, env); + fInst->fWrap = new FunctionWrapper(true, new ParameterFrame(JS2VAL_INACCESSIBLE, true), builtinClass->construct, env); ASSERT(JS2VAL_IS_OBJECT(builtinClass->prototype)); createDynamicProperty(JS2VAL_TO_OBJECT(builtinClass->prototype), &world.identifiers["constructor"], OBJECT_TO_JS2VAL(fInst), ReadWriteAccess, false, false); - pf = protoFunctions; + FunctionData *pf = protoFunctions; if (pf) { while (pf->name) { /* @@ -4479,6 +4459,35 @@ XXX see EvalAttributeExpression, where identifiers are being handled for now... } } } + + void JS2Metadata::initBuiltinClass(JS2Class *builtinClass, FunctionData *staticFunctions, NativeCode *construct, NativeCode *call) + { + FunctionData *pf; + + builtinClass->construct = construct; + builtinClass->call = call; + + // Adding "prototype" & "length", etc as static members of the class - not dynamic properties; XXX + env->addFrame(builtinClass); + { + Variable *v = new Variable(builtinClass, INT_TO_JS2VAL(1), true); + defineLocalMember(env, engine->length_StringAtom, NULL, Attribute::NoOverride, false, ReadWriteAccess, v, 0, false); + + pf = staticFunctions; + if (pf) { + while (pf->name) { + FunctionInstance *callInst = new FunctionInstance(this, functionClass->prototype, functionClass); + callInst->fWrap = new FunctionWrapper(true, new ParameterFrame(JS2VAL_INACCESSIBLE, true), pf->code, env); + v = new Variable(functionClass, OBJECT_TO_JS2VAL(callInst), true); + defineLocalMember(env, &world.identifiers[pf->name], NULL, Attribute::NoOverride, false, ReadWriteAccess, v, 0, false); + createDynamicProperty(callInst, engine->length_StringAtom, INT_TO_JS2VAL(pf->length), ReadAccess, true, false); + pf++; + } + } + } + env->removeTopFrame(); + + } diff --git 
a/mozilla/js2/src/js2metadata.h b/mozilla/js2/src/js2metadata.h index f0f38f25c2a..2450bcc5978 100644 --- a/mozilla/js2/src/js2metadata.h +++ b/mozilla/js2/src/js2metadata.h @@ -931,7 +931,7 @@ public: js2val getIgnoreCase(JS2Metadata *meta); js2val getSource(JS2Metadata *meta); - REState *mRegExp; + JSRegExp *mRegExp; virtual ~RegExpInstance() { } }; @@ -1394,7 +1394,8 @@ public: bool deleteInstanceMember(JS2Class *c, QualifiedName *qname, bool *result); void addGlobalObjectFunction(char *name, NativeCode *code, uint32 length); - void initBuiltinClass(JS2Class *builtinClass, FunctionData *protoFunctions, FunctionData *staticFunctions, NativeCode *construct, NativeCode *call); + void initBuiltinClass(JS2Class *builtinClass, FunctionData *staticFunctions, NativeCode *construct, NativeCode *call); + void initBuiltinClassPrototype(JS2Class *builtinClass, FunctionData *protoFunctions); void reportError(Exception::Kind kind, const char *message, size_t pos, const char *arg = NULL); void reportError(Exception::Kind kind, const char *message, size_t pos, const String &name); @@ -1406,6 +1407,7 @@ public: float64 convertStringToDouble(const String *str); bool convertValueToBoolean(js2val x); int32 convertValueToInteger(js2val x); + uint32 convertValueToUInteger(js2val x); js2val convertValueToGeneralNumber(js2val x); js2val convertValueToObject(js2val x); @@ -1415,6 +1417,7 @@ public: js2val toGeneralNumber(js2val x) { if (JS2VAL_IS_NUMBER(x)) return x; else return convertValueToGeneralNumber(x); } bool toBoolean(js2val x) { if (JS2VAL_IS_BOOLEAN(x)) return JS2VAL_TO_BOOLEAN(x); else return convertValueToBoolean(x); } int32 toInteger(js2val x) { if (JS2VAL_IS_INT(x)) return JS2VAL_TO_INT(x); else return convertValueToInteger(x); } + uint32 toUInteger(js2val x) { if (JS2VAL_IS_INT(x)) return JS2VAL_TO_INT(x); else return convertValueToUInteger(x); } js2val toObject(js2val x) { if (JS2VAL_IS_OBJECT(x)) return x; else return convertValueToObject(x); } // x is a General Number 
int64 truncateToInteger(js2val x) { if (JS2VAL_IS_INT(x)) return JS2VAL_TO_INT(x); else return JS2Engine::float64toInt64(toFloat64(x)); } diff --git a/mozilla/js2/src/js2number.cpp b/mozilla/js2/src/js2number.cpp index e5c5c8314ff..ac228928e52 100644 --- a/mozilla/js2/src/js2number.cpp +++ b/mozilla/js2/src/js2number.cpp @@ -130,8 +130,9 @@ namespace MetaData { { NULL } }; + meta->initBuiltinClass(meta->numberClass, NULL, Number_Constructor, Number_Call); meta->numberClass->prototype = OBJECT_TO_JS2VAL(new NumberInstance(meta, meta->objectClass->prototype, meta->numberClass)); - meta->initBuiltinClass(meta->numberClass, &prototypeFunctions[0], NULL, Number_Constructor, Number_Call); + meta->initBuiltinClassPrototype(meta->numberClass, &prototypeFunctions[0]); } diff --git a/mozilla/js2/src/js2op_flowcontrol.cpp b/mozilla/js2/src/js2op_flowcontrol.cpp index a477f8d85ff..0f7f69b95a2 100644 --- a/mozilla/js2/src/js2op_flowcontrol.cpp +++ b/mozilla/js2/src/js2op_flowcontrol.cpp @@ -167,8 +167,7 @@ case eThrow: { - // leave exception object on stack top - throw Exception(Exception::userException, ""); + throw Exception(pop()); } break; diff --git a/mozilla/js2/src/js2regexp.cpp b/mozilla/js2/src/js2regexp.cpp index 0fd9afc4fc1..196d2c8d49a 100644 --- a/mozilla/js2/src/js2regexp.cpp +++ b/mozilla/js2/src/js2regexp.cpp @@ -125,7 +125,9 @@ namespace MetaData { js2val RegExp_toString(JS2Metadata *meta, const js2val thisValue, js2val *argv, uint32 argc) { - if (meta->objectType(thisValue) != meta->regexpClass) + if (!JS2VAL_IS_OBJECT(thisValue) + || (JS2VAL_TO_OBJECT(thisValue)->kind != SimpleInstanceKind) + || (checked_cast(JS2VAL_TO_OBJECT(thisValue))->type != meta->regexpClass)) meta->reportError(Exception::typeError, "RegExp.toString can only be applied to RegExp objects", meta->engine->errorPos()); RegExpInstance *thisInst = checked_cast(JS2VAL_TO_OBJECT(thisValue)); js2val srcval = thisInst->getSource(meta); @@ -149,23 +151,29 @@ namespace MetaData { js2val 
RegExp_exec(JS2Metadata *meta, const js2val thisValue, js2val *argv, uint32 argc) { - if (meta->objectType(thisValue) != meta->regexpClass) + if (!JS2VAL_IS_OBJECT(thisValue) + || (JS2VAL_TO_OBJECT(thisValue)->kind != SimpleInstanceKind) + || (checked_cast(JS2VAL_TO_OBJECT(thisValue))->type != meta->regexpClass)) meta->reportError(Exception::typeError, "RegExp.exec can only be applied to RegExp objects", meta->engine->errorPos()); RegExpInstance *thisInst = checked_cast(JS2VAL_TO_OBJECT(thisValue)); js2val result = JS2VAL_NULL; if (argc > 0) { - int32 index = 0; + uint32 index = 0; const String *str = meta->toString(argv[0]); js2val globalMultiline = thisInst->getMultiline(meta); if (meta->toBoolean(thisInst->getGlobal(meta))) { - js2val lastIndex = thisInst->getLastIndex(meta); - index = meta->toInteger(lastIndex); + js2val lastIndexVal = thisInst->getLastIndex(meta); + float64 lastIndex = meta->toFloat64(lastIndexVal); + if ((lastIndex < 0) || (lastIndex > str->length())) { + thisInst->setLastIndex(meta, meta->engine->allocNumber(0.0)); + return result; + } + index = meta->engine->float64toUInt32(lastIndex); } - - REMatchState *match = REExecute(thisInst->mRegExp, str->begin(), index, toInt32(str->length()), meta->toBoolean(globalMultiline)); + REMatchResult *match = REExecute(meta, thisInst->mRegExp, str->begin(), index, toUInt32(str->length()), meta->toBoolean(globalMultiline)); if (match) { ArrayInstance *A = new ArrayInstance(meta, meta->arrayClass->prototype, meta->arrayClass); DEFINE_ROOTKEEPER(rk, A); @@ -184,21 +192,13 @@ namespace MetaData { meta->createDynamicProperty(A, meta->engine->allocStringPtr("index"), meta->engine->allocNumber((float64)(match->startIndex)), ReadWriteAccess, false, true); meta->createDynamicProperty(A, meta->engine->allocStringPtr("input"), meta->engine->allocString(str), ReadWriteAccess, false, true); - -/* - // XXX SpiderMonkey also adds 'index' and 'input' properties to the result - JSValue::instance(result)->setProperty(cx, 
cx->Index_StringAtom, CURRENT_ATTR, JSValue::newNumber((float64)(match->startIndex))); - JSValue::instance(result)->setProperty(cx, cx->Input_StringAtom, CURRENT_ATTR, JSValue::newString(str)); - - // XXX Set up the SpiderMonkey 'RegExp statics' - RegExp_Type->setProperty(cx, cx->LastMatch_StringAtom, CURRENT_ATTR, JSValue::newString(matchStr)); - RegExp_Type->setProperty(cx, cx->LastParen_StringAtom, CURRENT_ATTR, JSValue::newString(parenStr)); - String *contextStr = new String(str->substr(0, (uint32)match->startIndex)); - RegExp_Type->setProperty(cx, cx->LeftContext_StringAtom, CURRENT_ATTR, JSValue::newString(contextStr)); - contextStr = new String(str->substr((uint32)match->endIndex, (uint32)str->length() - match->endIndex)); - RegExp_Type->setProperty(cx, cx->RightContext_StringAtom, CURRENT_ATTR, JSValue::newString(contextStr)); -*/ + /* Refresh the statics stored on the RegExp class: lastMatch receives matchStr, leftContext the input before match->startIndex, rightContext the input from match->endIndex on. */ meta->stringClass->writePublic(meta, OBJECT_TO_JS2VAL(meta->regexpClass), meta->stringClass, meta->engine->allocStringPtr("lastMatch"), true, matchStr); + js2val leftContextVal = meta->engine->allocString(str->substr(0, (uint32)match->startIndex)); + meta->stringClass->writePublic(meta, OBJECT_TO_JS2VAL(meta->regexpClass), meta->stringClass, meta->engine->allocStringPtr("leftContext"), true, leftContextVal); + js2val rightContextVal = meta->engine->allocString(str->substr((uint32)match->endIndex, (uint32)str->length() - match->endIndex)); + meta->stringClass->writePublic(meta, OBJECT_TO_JS2VAL(meta->regexpClass), meta->stringClass, meta->engine->allocStringPtr("rightContext"), true, rightContextVal); + if (meta->toBoolean(thisInst->getGlobal(meta))) { index = match->endIndex; thisInst->setLastIndex(meta, meta->engine->allocNumber((float64)index)); @@ -210,19 +210,33 @@ namespace MetaData { return result; } + js2val RegExp_Call(JS2Metadata *meta, const js2val thisValue, js2val *argv, uint32 argc) + { + if ((argc > 0) + && (JS2VAL_IS_OBJECT(argv[0]) + && (JS2VAL_TO_OBJECT(argv[0])->kind == SimpleInstanceKind) + && 
(checked_cast(JS2VAL_TO_OBJECT(argv[0]))->type == meta->regexpClass)) + && ((argc == 1) || JS2VAL_IS_UNDEFINED(argv[1]))) + return argv[0]; + else + return RegExp_Constructor(meta, thisValue, argv, argc); + } + js2val RegExp_Constructor(JS2Metadata *meta, const js2val /* thisValue */, js2val *argv, uint32 argc) { - // XXX Change constructors to take js2val pointer for the result (which would be an already - // rooted pointer). RegExpInstance *thisInst = new RegExpInstance(meta, meta->regexpClass->prototype, meta->regexpClass); DEFINE_ROOTKEEPER(rk, thisInst); js2val thatValue = OBJECT_TO_JS2VAL(thisInst); - REuint32 flags = 0; + uint32 flags = 0; const String *regexpStr = meta->engine->Empty_StringAtom; + DEFINE_ROOTKEEPER(rk1, regexpStr); const String *flagStr = meta->engine->Empty_StringAtom; + DEFINE_ROOTKEEPER(rk2, flagStr); if (argc > 0) { - if (meta->objectType(argv[0]) == meta->regexpClass) { + if (JS2VAL_IS_OBJECT(argv[0]) + && (JS2VAL_TO_OBJECT(argv[0])->kind == SimpleInstanceKind) + && (checked_cast(JS2VAL_TO_OBJECT(argv[0]))->type == meta->regexpClass)) { if ((argc == 1) || JS2VAL_IS_UNDEFINED(argv[1])) { RegExpInstance *otherInst = checked_cast(JS2VAL_TO_OBJECT(argv[0])); js2val src = otherInst->getSource(meta); @@ -237,19 +251,19 @@ namespace MetaData { regexpStr = meta->toString(argv[0]); if ((argc > 1) && !JS2VAL_IS_UNDEFINED(argv[1])) { flagStr = meta->toString(argv[1]); - if (parseFlags(flagStr->begin(), (int32)flagStr->length(), &flags) != RE_NO_ERROR) + if (!parseFlags(meta, flagStr->begin(), (int32)flagStr->length(), &flags)) meta->reportError(Exception::syntaxError, "Failed to parse RegExp : '{0}'", meta->engine->errorPos(), *regexpStr + "/" + *flagStr); // XXX error message? 
} } - REState *pState = REParse(regexpStr->begin(), (int32)regexpStr->length(), flags, RE_VERSION_1); - if (pState) { - thisInst->mRegExp = pState; + JSRegExp *re = RECompile(meta, regexpStr->begin(), (int32)regexpStr->length(), flags); + if (re) { + thisInst->mRegExp = re; // XXX ECMA spec says these are DONTENUM thisInst->setSource(meta, STRING_TO_JS2VAL(regexpStr)); - thisInst->setGlobal(meta, BOOLEAN_TO_JS2VAL((pState->flags & RE_GLOBAL) == RE_GLOBAL)); - thisInst->setIgnoreCase(meta, BOOLEAN_TO_JS2VAL((pState->flags & RE_IGNORECASE) == RE_IGNORECASE)); + thisInst->setGlobal(meta, BOOLEAN_TO_JS2VAL((re->flags & JSREG_GLOB) == JSREG_GLOB)); + thisInst->setIgnoreCase(meta, BOOLEAN_TO_JS2VAL((re->flags & JSREG_FOLD) == JSREG_FOLD)); thisInst->setLastIndex(meta, INT_TO_JS2VAL(0)); - thisInst->setMultiline(meta, BOOLEAN_TO_JS2VAL((pState->flags & RE_MULTILINE) == RE_MULTILINE)); + thisInst->setMultiline(meta, BOOLEAN_TO_JS2VAL((re->flags & JSREG_MULTILINE) == JSREG_MULTILINE)); } else meta->reportError(Exception::syntaxError, "Failed to parse RegExp : '{0}'", meta->engine->errorPos(), "/" + *regexpStr + "/" + *flagStr); // XXX what about the RE parser error message? 
@@ -258,7 +272,6 @@ namespace MetaData { void initRegExpObject(JS2Metadata *meta) { - meta->regexpClass->prototype = OBJECT_TO_JS2VAL(new SimpleInstance(meta, OBJECT_TO_JS2VAL(meta->objectClass->prototype), meta->dateClass)); FunctionData prototypeFunctions[] = { @@ -267,9 +280,17 @@ namespace MetaData { { NULL } }; - meta->initBuiltinClass(meta->regexpClass, &prototypeFunctions[0], NULL, RegExp_Constructor, RegExp_Constructor); - - + meta->initBuiltinClass(meta->regexpClass, NULL, RegExp_Constructor, RegExp_Call); + meta->env->addFrame(meta->regexpClass); + { + Variable *v = new Variable(meta->stringClass, meta->engine->allocString(""), false); + meta->defineLocalMember(meta->env, &meta->world.identifiers["lastMatch"], NULL, Attribute::NoOverride, false, ReadWriteAccess, v, 0, false); + v = new Variable(meta->stringClass, meta->engine->allocString(""), false); + meta->defineLocalMember(meta->env, &meta->world.identifiers["leftContext"], NULL, Attribute::NoOverride, false, ReadWriteAccess, v, 0, false); + v = new Variable(meta->stringClass, meta->engine->allocString(""), false); + meta->defineLocalMember(meta->env, &meta->world.identifiers["rightContext"], NULL, Attribute::NoOverride, false, ReadWriteAccess, v, 0, false); + } + meta->env->removeTopFrame(); NamespaceList publicNamespaceList; publicNamespaceList.push_back(meta->publicNamespace); @@ -287,7 +308,6 @@ namespace MetaData { { "lastIndex", meta->numberClass }, }; - for (uint32 i = 0; i < INSTANCE_VAR_COUNT; i++) { Multiname *mn = new Multiname(meta->engine->allocStringPtr(RegExpInstanceVars[i].name), &publicNamespaceList); @@ -295,6 +315,16 @@ namespace MetaData { meta->defineInstanceMember(meta->regexpClass, &meta->cxt, mn->name, *mn->nsList, Attribute::NoOverride, false, m, 0); } + RegExpInstance *reProto = new RegExpInstance(meta, OBJECT_TO_JS2VAL(meta->objectClass->prototype), meta->regexpClass); + DEFINE_ROOTKEEPER(rk, reProto); + meta->regexpClass->prototype = OBJECT_TO_JS2VAL(reProto); + 
meta->initBuiltinClassPrototype(meta->regexpClass, &prototypeFunctions[0]); + + reProto->setSource(meta, meta->engine->allocString("")); + reProto->setGlobal(meta, JS2VAL_FALSE); + reProto->setIgnoreCase(meta, JS2VAL_FALSE); + reProto->setLastIndex(meta, INT_TO_JS2VAL(0)); + reProto->setMultiline(meta, JS2VAL_FALSE); } diff --git a/mozilla/js2/src/js2string.cpp b/mozilla/js2/src/js2string.cpp index 9b6b752e2eb..58085713fdc 100644 --- a/mozilla/js2/src/js2string.cpp +++ b/mozilla/js2/src/js2string.cpp @@ -131,9 +131,9 @@ static js2val String_search(JS2Metadata *meta, const js2val thisValue, js2val *a regexp = JS2VAL_NULL; regexp = RegExp_Constructor(meta, regexp, argv, 1); } - REState *pState = (checked_cast(JS2VAL_TO_OBJECT(regexp)))->mRegExp; + JSRegExp *re = (checked_cast(JS2VAL_TO_OBJECT(regexp)))->mRegExp; - REMatchState *match = REExecute(pState, str->begin(), 0, (int32)str->length(), false); + REMatchResult *match = REExecute(meta, re, str->begin(), 0, (int32)str->length(), false); if (match) return meta->engine->allocNumber((float64)(match->startIndex)); else @@ -167,8 +167,8 @@ static js2val String_match(JS2Metadata *meta, const js2val thisValue, js2val *ar RegExpInstance *thisInst = checked_cast(JS2VAL_TO_OBJECT(regexp)); DEFINE_ROOTKEEPER(rk1, thisInst); - REState *pState = thisInst->mRegExp; - if ((pState->flags & RE_GLOBAL) == 0) { + JSRegExp *re = thisInst->mRegExp; + if ((re->flags & JSREG_GLOB) == 0) { return RegExp_exec(meta, regexp, &S, 1); } else { @@ -177,7 +177,7 @@ static js2val String_match(JS2Metadata *meta, const js2val thisValue, js2val *ar int32 index = 0; int32 lastIndex = 0; while (true) { - REMatchState *match = REExecute(pState, JS2VAL_TO_STRING(S)->begin(), lastIndex, toInt32(JS2VAL_TO_STRING(S)->length()), false); + REMatchResult *match = REExecute(meta, re, JS2VAL_TO_STRING(S)->begin(), lastIndex, toInt32(JS2VAL_TO_STRING(S)->length()), false); if (match == NULL) break; if (lastIndex == match->endIndex) @@ -185,27 +185,27 @@ static 
js2val String_match(JS2Metadata *meta, const js2val thisValue, js2val *ar else lastIndex = match->endIndex; js2val matchStr = meta->engine->allocString(JS2VAL_TO_STRING(S)->substr(toUInt32(match->startIndex), toUInt32(match->endIndex) - match->startIndex)); - index++; meta->arrayClass->writePublic(meta, OBJECT_TO_JS2VAL(A), meta->arrayClass, meta->engine->numberToString(index), true, matchStr); + index++; } thisInst->setLastIndex(meta, meta->engine->allocNumber((float64)lastIndex)); return OBJECT_TO_JS2VAL(A); } } -static const String interpretDollar(JS2Metadata *meta, const String *replaceStr, uint32 dollarPos, const String *searchStr, REMatchState *match, uint32 &skip) +static const String interpretDollar(JS2Metadata *meta, const String *replaceStr, uint32 dollarPos, const String *searchStr, REMatchResult *match, uint32 &skip) { skip = 2; const char16 *dollarValue = replaceStr->begin() + dollarPos + 1; switch (*dollarValue) { case '$': - return *meta->engine->Dollar_StringAtom; + return *meta->engine->Dollar_StringAtom; case '&': - return searchStr->substr((uint32)match->startIndex, (uint32)match->endIndex - match->startIndex); + return searchStr->substr((uint32)match->startIndex, (uint32)match->endIndex - match->startIndex); case '`': - return searchStr->substr(0, (uint32)match->startIndex); + return searchStr->substr(0, (uint32)match->startIndex); case '\'': - return searchStr->substr((uint32)match->endIndex, (uint32)searchStr->length() - match->endIndex); + return searchStr->substr((uint32)match->endIndex, (uint32)searchStr->length() - match->endIndex); case '0': case '1': case '2': @@ -216,24 +216,23 @@ static const String interpretDollar(JS2Metadata *meta, const String *replaceStr, case '7': case '8': case '9': - { - int32 num = (int32)(*dollarValue - '0'); - if (num <= match->parenCount) { - if ((dollarPos < (replaceStr->length() - 2)) - && (dollarValue[1] >= '0') && (dollarValue[1] <= '9')) { - int32 tmp = (num * 10) + (dollarValue[1] - '0'); - if (tmp <= 
match->parenCount) { - num = tmp; - skip = 3; - } - } - return searchStr->substr((uint32)(match->parens[num - 1].index), (uint32)(match->parens[num - 1].length)); - } - } - // fall thru + { + int32 num = (int32)(*dollarValue - '0'); + if (num <= match->parenCount) { + if ((dollarPos < (replaceStr->length() - 2)) && (dollarValue[1] >= '0') && (dollarValue[1] <= '9')) { + int32 tmp = (num * 10) + (dollarValue[1] - '0'); + if (tmp <= match->parenCount) { + num = tmp; + skip = 3; + } + } + return searchStr->substr((uint32)(match->parens[num - 1].index), (uint32)(match->parens[num - 1].length)); + } + } + // fall thru default: - skip = 1; - return *meta->engine->Dollar_StringAtom; + skip = 1; + return *meta->engine->Dollar_StringAtom; } } @@ -281,51 +280,51 @@ static js2val String_replace(JS2Metadata *meta, const js2val thisValue, js2val * if (meta->objectType(searchValue) != meta->regexpClass) { RegExpInstance *reInst = checked_cast(JS2VAL_TO_OBJECT(searchValue)); - REState *pState = reInst->mRegExp; - REMatchState *match; + JSRegExp *re = reInst->mRegExp; + REMatchResult *match; String newString; int32 lastIndex = 0; while (true) { - match = REExecute(pState, S->begin(), lastIndex, toInt32(S->length()), false); + match = REExecute(meta, re, S->begin(), lastIndex, toInt32(S->length()), false); if (match) { - String insertString; - uint32 start = 0; - while (true) { - // look for '$' in the replacement string and interpret it as necessary - uint32 dollarPos = replaceStr->find('$', start); - if ((dollarPos != String::npos) && (dollarPos < (replaceStr->length() - 1))) { - uint32 skip; - insertString += replaceStr->substr(start, dollarPos - start); - insertString += interpretDollar(meta, replaceStr, dollarPos, S, match, skip); - start = dollarPos + skip; - } - else { - // otherwise, absorb the entire replacement string - insertString += replaceStr->substr(start, replaceStr->length() - start); - break; + String insertString; + uint32 start = 0; + while (true) { + // look 
for '$' in the replacement string and interpret it as necessary + uint32 dollarPos = replaceStr->find('$', start); + if ((dollarPos != String::npos) && (dollarPos < (replaceStr->length() - 1))) { + uint32 skip; + insertString += replaceStr->substr(start, dollarPos - start); + insertString += interpretDollar(meta, replaceStr, dollarPos, S, match, skip); + start = dollarPos + skip; + } + else { + // otherwise, absorb the entire replacement string + insertString += replaceStr->substr(start, replaceStr->length() - start); + break; + } } + // grab everything preceding the match + newString += S->substr(toUInt32(lastIndex), toUInt32(match->startIndex) - lastIndex); + // and then add the replacement string + newString += insertString; } - // grab everything preceding the match - newString += S->substr(toUInt32(lastIndex), toUInt32(match->startIndex) - lastIndex); - // and then add the replacement string - newString += insertString; + else + break; + lastIndex = match->endIndex; // use lastIndex to grab remainder after break + if ((re->flags & JSREG_GLOB) == 0) + break; } - else - break; - lastIndex = match->endIndex; // use lastIndex to grab remainder after break - if ((pState->flags & RE_GLOBAL) == 0) - break; - } - newString += S->substr(toUInt32(lastIndex), toUInt32(S->length()) - lastIndex); - if ((pState->flags & RE_GLOBAL) == 0) - reInst->setLastIndex(meta, meta->engine->allocNumber((float64)lastIndex)); + newString += S->substr(toUInt32(lastIndex), toUInt32(S->length()) - lastIndex); + if ((re->flags & JSREG_GLOB) == 0) + reInst->setLastIndex(meta, meta->engine->allocNumber((float64)lastIndex)); return meta->engine->allocString(newString); } else { const String *searchStr = meta->toString(searchValue); DEFINE_ROOTKEEPER(rk3, searchStr); - REMatchState match; + REMatchResult match; uint32 pos = S->find(*searchStr, 0); if (pos == String::npos) return STRING_TO_JS2VAL(S); @@ -381,12 +380,12 @@ static void strSplitMatch(const String *S, uint32 q, const String *R, 
MatchResul result.failure = false; } -static void regexpSplitMatch(JS2Metadata *meta, const String *S, uint32 q, REState *RE, MatchResult &result) +static void regexpSplitMatch(JS2Metadata *meta, const String *S, uint32 q, JSRegExp *RE, MatchResult &result) { result.failure = true; result.captures = NULL; - REMatchState *match = REMatch(RE, S->begin() + q, (int32)(S->length() - q)); + REMatchResult *match = REMatch(meta, RE, S->begin() + q, (int32)(S->length() - q)); if (match) { result.endIndex = match->startIndex + q; @@ -428,7 +427,7 @@ static js2val String_split(JS2Metadata *meta, const js2val thisValue, js2val *ar uint32 s = S->size(); uint32 p = 0; - REState *RE = NULL; + JSRegExp *RE = NULL; const String *R = NULL; if (meta->objectType(separatorV) == meta->regexpClass) RE = (checked_cast(JS2VAL_TO_OBJECT(separatorV)))->mRegExp; @@ -817,8 +816,9 @@ void initStringObject(JS2Metadata *meta) meta->stringClass->prototype = OBJECT_TO_JS2VAL(strInst); strInst->mValue = meta->engine->allocStringPtr(""); + meta->initBuiltinClass(meta->stringClass, &staticFunctions[0], String_Constructor, String_Call); meta->createDynamicProperty(strInst, meta->engine->length_StringAtom, meta->engine->allocNumber(strInst->mValue->length()), ReadAccess, true, false); - meta->initBuiltinClass(meta->stringClass, &prototypeFunctions[0], &staticFunctions[0], String_Constructor, String_Call); + meta->initBuiltinClassPrototype(meta->stringClass, &prototypeFunctions[0]); } diff --git a/mozilla/js2/src/jslong.h b/mozilla/js2/src/jslong.h index b3206b39c16..a6362010d39 100644 --- a/mozilla/js2/src/jslong.h +++ b/mozilla/js2/src/jslong.h @@ -161,6 +161,7 @@ extern int64 JSLL_Zero(); ***********************************************************************/ #define JSLL_L2I(i, l) ((i) = (int32)(l)) #define JSLL_UL2I(i, ul) ((i) = (int32)(ul)) +#define JSLL_UL2UI(ui, ul) ((ui) = (uint32)(ul)) #define JSLL_L2UI(ui, l) ((ui) = (uint32)(l)) #define JSLL_L2F(f, l) ((f) = (float64)(l)) #define JSLL_L2D(d, 
l) ((d) = (float64)(l)) diff --git a/mozilla/js2/src/regexp/regexp.c b/mozilla/js2/src/regexp/regexp.c index f38e0da5384..b6f33dcd99b 100644 --- a/mozilla/js2/src/regexp/regexp.c +++ b/mozilla/js2/src/regexp/regexp.c @@ -31,352 +31,514 @@ * file under either the NPL or the GPL. */ -#ifdef STANDALONE -#include -#endif - -#include -#include - -#ifndef ASSERT -#include -#define ASSERT assert -#endif -#include "regexp.h" +#define RE_IS_LETTER(c) ( ((c >= 'A') && (c <= 'Z')) || \ + ((c >= 'a') && (c <= 'z')) ) +#define RE_IS_LINE_TERM(c) ( (c == '\n') || (c == '\r') || \ + (c == LINE_SEPARATOR) || (c == PARAGRAPH_SEPARATOR)) - -typedef unsigned char REbool; -enum { RE_FALSE, RE_TRUE }; - - - -typedef enum REOp { - REOP_EMPTY, /* an empty alternative */ - - REOP_ALT, /* a tree of alternates */ - REOP_ENDALT, /* flags end of alternate, signals jump to next */ - - REOP_BOL, /* start of line or input string '^' */ - REOP_EOL, /* end of line or input string '$' */ - - REOP_DOT, - REOP_CLASS, /* '[...]' */ - - REOP_PAREN, /* capturing */ - REOP_CLOSEPAREN, /* continuation for end of paren */ - REOP_BACKREF, - - REOP_WBND, /* word boundary '\b' */ - REOP_UNWBND, /* not a word boundary '\B' */ - - REOP_ASSERT, /* '(?= ... )' */ - REOP_ASSERTTEST, /* end of child for above */ - REOP_ASSERTNOT, /* '(?! ... 
)' */ - REOP_ASSERTNOTTEST, /* end of child for above */ - - REOP_FLAT, /* literal characters (data.length count) */ - /* tree node of FLAT gets transformed into : */ - REOP_FLATNi, /* 'n' literals characters, ignore case */ - REOP_FLATN, /* 'n' literals characters, case sensitive */ - REOP_FLAT1i, /* 1 literal characters, ignore case */ - REOP_FLAT1, /* 1 literal characters, case sensitive */ - - REOP_DEC, /* decimal digit '\d' */ - REOP_UNDEC, /* not a decimal digit '\D' */ - - REOP_WS, /* whitespace or line terminator '\s' */ - REOP_UNWS, /* not whitespace '\S' */ - - REOP_LETDIG, /* letter or digit or '_' '\w' */ - REOP_UNLETDIG, /* not letter or digit or '_' '\W' */ - - REOP_QUANT, /* '+', '*', '?' as tree node or '{..}' */ - REOP_STAR, /* Bytecode versions, to save space ... */ - REOP_OPT, - REOP_PLUS, - REOP_MINIMALSTAR, - REOP_MINIMALOPT, - REOP_MINIMALPLUS, - REOP_MINIMALQUANT, - - REOP_REPEAT, /* intermediate op for processing quant */ - REOP_MINIMALREPEAT, /* and ditto for non greedy case */ - - REOP_ENDCHILD, /* end of child for quantifier */ - - REOP_END /* the holy grail */ - -} REOp; - -#define RE_ISDEC(c) ( (c >= '0') && (c <= '9') ) -#define RE_UNDEC(c) (c - '0') - -#define RE_ISLETTER(c) ( ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')) ) -#define RE_ISLETDIG(c) ( RE_ISLETTER(c) || RE_ISDEC(c) ) - -#define RE_ISLINETERM(c) ( (c == '\n') || (c == '\r') ) - - -typedef struct REContinuationData { - REOp op; /* not necessarily the same as *pc */ - REuint8 *pc; -} REContinuationData; - -typedef struct RENode { - REOp kind; - RENode *next; /* links consecutive terms */ - void *child; - REuint32 parenIndex; /* for QUANT, PAREN, BACKREF */ - union { - void *child2; /* for ALT */ - struct { - REint32 min; - REint32 max; /* -1 for infinity */ - REbool greedy; - REint32 parenCount; /* #parens in quantified term */ - } quantifier; - struct { - REchar ch; /* for FLAT1 */ - REuint32 length; /* for FLATN */ - } flat; - struct { - REint32 classIndex; /* 
index into classList in REState */ - const REchar *end; /* last character of source */ - REuint32 length; /* calculated bitmap length */ - } chclass; - } data; -} RENode; +#define CLASS_CACHE_SIZE (4) +typedef struct CompilerState { + Pool *reNodePool; + const jschar *cpbegin; + const jschar *cpend; + const jschar *cp; + uintN flags; + uint16 parenCount; + uint16 classCount; /* number of [] encountered */ + size_t progLength; /* estimated bytecode length */ + uintN treeDepth; /* maximum depth of parse tree */ + RENode *result; + struct { + const jschar *start; /* small cache of class strings */ + uint16 length; /* since they're often the same */ + uint16 index; + } classCache[CLASS_CACHE_SIZE]; +} CompilerState; typedef struct REProgState { - REint32 min; - REint32 max; - REint32 parenCount; - REint32 parenIndex; - REint32 index; - REContinuationData continuation; + jsbytecode *continue_pc; /* current continuation data */ + REOp continue_op; + int16 index; /* progress in text */ + uintN parenSoFar; /* highest indexed paren started */ + union { + struct { + uint16 min; /* current quantifier limits */ + uint16 max; + } quantifier; + struct { + size_t top; /* backtrack stack state */ + size_t sz; + } assertion; + } u; } REProgState; -#define INITIAL_STATESTACK (2000) -REProgState *stateStack; -REuint32 stateStackTop; -REuint32 maxStateStack; - -typedef struct REGlobalData { - REState *regexp; /* the RE in execution */ - REint32 length; /* length of input string */ - const REchar *input; /* the input string */ - RE_Error error; /* runtime error code (out_of_memory only?) 
*/ - REint32 lastParen; /* highest paren set so far */ - REbool globalMultiline; /* as specified for current execution */ -} REGlobalData; - typedef struct REBackTrackData { - REContinuationData continuation; /* where to backtrack to */ - REMatchState *state; /* the state of the match */ - REint32 lastParen; - REProgState *precedingState; - REint32 precedingStateTop; + size_t sz; /* size of previous stack entry */ + jsbytecode *backtrack_pc; /* where to backtrack to */ + REOp backtrack_op; + const jschar *cp; /* index in text of match at backtrack */ + intN parenIndex; /* start index of saved paren contents */ + uint16 parenCount; /* # of saved paren contents */ + uint16 precedingStateTop; /* number of parent states */ + /* saved parent states follow */ + /* saved paren contents follow */ } REBackTrackData; -#define INITIAL_BACKTRACK (20) -REint32 maxBackTrack; -REBackTrackData *backTrackStack = NULL; -REint32 backTrackStackTop; +#define INITIAL_STATESTACK (100) +#define INITIAL_BACKTRACK (8000) -/* - Allocate space for a state and copy x into it. -*/ -static REMatchState *copyState(REMatchState *x) +typedef struct REGlobalData { + JSRegExpStatics *regExpStatics; + JSRegExp *regexp; /* the RE in execution */ + JSBool ok; /* runtime error (out_of_memory only?) */ + size_t start; /* offset to start at */ + ptrdiff_t skipped; /* chars skipped anchoring this r.e. 
*/ + const jschar *cpbegin, *cpend; /* text base address and limit */ + + REProgState *stateStack; /* stack of state of current parents */ + uint16 stateStackTop; + uint16 maxStateStack; + + REBackTrackData *backTrackStack;/* stack of matched-so-far positions */ + REBackTrackData *backTrackSP; + size_t maxBackTrack; + size_t cursz; /* size of current stack entry */ + +} REGlobalData; + +bool JS_ISWORD(jschar ch) { - REuint32 sz = sizeof(REMatchState) + (x->parenCount * sizeof(RECapture)); - REMatchState *result = (REMatchState *)malloc(sz); - memcpy(result, x, sz); - return result; + CharInfo chi(ch); + return ch == '_' || isAlphanumeric(chi); +} + +bool JS_ISSPACE(jschar ch) +{ + CharInfo chi(ch); + return isSpace(chi); +} + +bool JS_ISDIGIT(jschar ch) +{ + CharInfo chi(ch); + return isDecimalDigit(chi); } /* - Copy state. -*/ -static void recoverState(REMatchState *into, REMatchState *from) + * 1. If IgnoreCase is false, return ch. + * 2. Let u be ch converted to upper case as if by calling + * String.prototype.toUpperCase on the one-character string ch. + * 3. If u does not consist of a single character, return ch. + * 4. Let cu be u's character. + * 5. If ch's code point value is greater than or equal to decimal 128 and cu's + * code point value is less than decimal 128, then return ch. + * 6. Return cu. + */ +static jschar +canonicalize(jschar ch) { - memcpy(into, from, sizeof(REMatchState) - + (from->parenCount * sizeof(RECapture))); + jschar cu = toUpper(ch); + if ((ch >= 128) && (cu < 128)) return ch; + return cu; +} + +/* Construct and initialize an RENode, returning NULL for out-of-memory */ +static RENode * +NewRENode(CompilerState *state, REOp op) +{ + RENode *ren; + ren = new (*state->reNodePool) RENode(); + + if (!ren) { + JS_ReportOutOfMemory(); + return NULL; + } + ren->op = op; + ren->next = NULL; + ren->kid = NULL; + return ren; } /* - Bottleneck for any errors. 
-*/ -static void reportRegExpError(RE_Error *errP, RE_Error err) + * Validates and converts hex ascii value. + */ +static JSBool +isASCIIHexDigit(jschar c, uintN *digit) { - *errP = err; -} - -/* forward declarations for parser routines */ -REbool parseDisjunction(REState *parseState); -REbool parseAlternative(REState *parseState); -REbool parseTerm(REState *parseState); - - -static REbool isASCIIHexDigit(REchar c, REuint32 *digit) -{ - REuint32 cv = c; + uintN cv = c; if (cv < '0') - return RE_FALSE; + return JS_FALSE; if (cv <= '9') { *digit = cv - '0'; - return RE_TRUE; + return JS_TRUE; } cv |= 0x20; if (cv >= 'a' && cv <= 'f') { *digit = cv - 'a' + 10; - return RE_TRUE; + return JS_TRUE; } - return RE_FALSE; + return JS_FALSE; } -/* - Allocate & initialize a new node. -*/ -static RENode *newRENode(REState *pState, REOp kind) +typedef struct { + REOp op; + const jschar *errPos; + uint16 parenIndex; +} REOpData; + + +/* + * Process the op against the two top operands, reducing them to a single + * operand in the penultimate slot. Update progLength and treeDepth. + */ +static JSBool +processOp(CompilerState *state, REOpData *opData, RENode **operandStack, intN operandSP) { - RENode *result = (RENode *)malloc(sizeof(RENode)); - if (result == NULL) { - reportRegExpError(&pState->error, RE_OUT_OF_MEMORY); - return NULL; + RENode *result; + + switch (opData->op) { + case REOP_ALT: + result = NewRENode(state, REOP_ALT); + if (!result) + return JS_FALSE; + result->kid = operandStack[operandSP - 2]; + result->u.kid2 = operandStack[operandSP - 1]; + operandStack[operandSP - 2] = result; + /* + * look at both alternates to see if there's a FLAT or a CLASS at + * the start of each. 
If so, use a prerequisite match + */ + ++state->treeDepth; + if ((((RENode *)(result->kid))->op == REOP_FLAT) + && (((RENode *)(result->u.kid2))->op == REOP_FLAT) + && ((state->flags & JSREG_FOLD) == 0) ) { + result->op = REOP_ALTPREREQ; + result->u.altprereq.ch1 + = ((RENode *)(result->kid))->u.flat.chr; + result->u.altprereq.ch2 + = ((RENode *)(result->u.kid2))->u.flat.chr; + /* ALTPREREQ, , uch1, uch2, , ..., + JUMP, ... ENDALT */ + state->progLength += 13; + } + else + if ((((RENode *)(result->kid))->op == REOP_CLASS) + && (((RENode *)(result->kid))->u.ucclass.index < 256) + && (((RENode *)(result->u.kid2))->op == REOP_FLAT) + && ((state->flags & JSREG_FOLD) == 0) ) { + result->op = REOP_ALTPREREQ2; + result->u.altprereq.ch1 + = ((RENode *)(result->u.kid2))->u.flat.chr; + result->u.altprereq.ch2 + = ((RENode *)(result->kid))->u.ucclass.index; + /* ALTPREREQ2, , uch1, uch2, , ..., + JUMP, ... ENDALT */ + state->progLength += 13; + } + else + if ((((RENode *)(result->kid))->op == REOP_FLAT) + && (((RENode *)(result->u.kid2))->op == REOP_CLASS) + && (((RENode *)(result->u.kid2))->u.ucclass.index < 256) + && ((state->flags & JSREG_FOLD) == 0) ) { + result->op = REOP_ALTPREREQ2; + result->u.altprereq.ch1 + = ((RENode *)(result->kid))->u.flat.chr; + result->u.altprereq.ch2 + = ((RENode *)(result->u.kid2))->u.ucclass.index; + /* ALTPREREQ2, , uch1, uch2, , ..., + JUMP, ... ENDALT */ + state->progLength += 13; + } + else + /* ALT, , ..., JUMP, ... ENDALT */ + state->progLength += 7; + break; + case REOP_CONCAT: + result = operandStack[operandSP - 2]; + while (result->next) + result = result->next; + result->next = operandStack[operandSP - 1]; + break; + case REOP_ASSERT: + case REOP_ASSERT_NOT: + case REOP_LPARENNON: + case REOP_LPAREN: + /* These should have been processed by a close paren. 
*/ + js_ReportCompileErrorNumber(JSMSG_MISSING_PAREN, opData->errPos); + return JS_FALSE; } - result->kind = kind; - result->data.flat.length = 1; - result->next = NULL; - result->child = NULL; + return JS_TRUE; +} + +/* + * Parser forward declarations. + */ +static JSBool parseTerm(CompilerState *state); +static JSBool parseQuantifier(CompilerState *state); + +/* + * Top-down regular expression grammar, based closely on Perl4. + * + * regexp: altern A regular expression is one or more + * altern '|' regexp alternatives separated by vertical bar. + */ + +#define INITIAL_STACK_SIZE (128) +static JSBool +parseRegExp(CompilerState *state) +{ + const jschar *errPos; + uint16 parenIndex; + RENode *operand; + REOpData *operatorStack; + RENode **operandStack; + REOp op; + intN i; + JSBool result = JS_FALSE; + + intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE; + intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE; + + /* Watch out for empty regexp */ + if (state->cp == state->cpend) { + state->result = NewRENode(state, REOP_EMPTY); + return (state->result != NULL); + } + + operatorStack = (REOpData *)JS_malloc(state->context, + sizeof(REOpData) * operatorStackSize); + if (!operatorStack) + return JS_FALSE; + + operandStack = (RENode **)JS_malloc(state->context, + sizeof(RENode *) * operandStackSize); + if (!operandStack) + goto out; + + + while (JS_TRUE) { + if (state->cp != state->cpend) { + switch (*state->cp) { + /* balance '(' */ + case '(': /* balance ')' */ + errPos = state->cp; + ++state->cp; + if ((state->cp < state->cpend) && (*state->cp == '?') + && ( (state->cp[1] == '=') + || (state->cp[1] == '!') + || (state->cp[1] == ':') )) { + ++state->cp; + if (state->cp == state->cpend) { + js_ReportCompileErrorNumber(JSMSG_MISSING_PAREN, + errPos); + goto out; + } + switch (*state->cp++) { + case '=': + op = REOP_ASSERT; + /* ASSERT, , ... ASSERTTEST */ + state->progLength += 4; + break; + case '!': + op = REOP_ASSERT_NOT; + /* ASSERTNOT, , ... 
ASSERTNOTTEST */ + state->progLength += 4; + break; + case ':': + op = REOP_LPARENNON; + break; + } + parenIndex = state->parenCount; + } + else { + op = REOP_LPAREN; + /* LPAREN, , ... RPAREN, */ + state->progLength += 6; + parenIndex = state->parenCount++; + if (state->parenCount == 0) { + js_ReportCompileErrorNumber(JSMSG_TOO_MANY_PARENS, + errPos); + goto out; + } + } + goto pushOperator; + case '|': + case ')': + /* Expected an operand before these, so make an empty one */ + operand = NewRENode(state, REOP_EMPTY); + if (!operand) + goto out; + goto pushOperand; + default: + if (!parseTerm(state)) + goto out; + operand = state->result; +pushOperand: + if (operandSP == operandStackSize) { + operandStackSize += operandStackSize; + operandStack = + (RENode **)realloc(operandStack, + sizeof(RENode *) * operandStackSize); + if (!operandStack) + goto out; + } + operandStack[operandSP++] = operand; + break; + } + } + /* At the end; process remaining operators */ +restartOperator: + if (state->cp == state->cpend) { + while (operatorSP) { + --operatorSP; + if (!processOp(state, &operatorStack[operatorSP], + operandStack, operandSP)) + goto out; + --operandSP; + } + JS_ASSERT(operandSP == 1); + state->result = operandStack[0]; + result = JS_TRUE; + goto out; + } + switch (*state->cp) { + case '|': + /* Process any stacked 'concat' operators */ + ++state->cp; + while (operatorSP + && (operatorStack[operatorSP - 1].op == REOP_CONCAT)) { + --operatorSP; + if (!processOp(state, &operatorStack[operatorSP], + operandStack, operandSP)) + goto out; + --operandSP; + } + op = REOP_ALT; + goto pushOperator; + + case ')': + /* If there's not a stacked open parentheses,we + * accept the close as a flat. 
+ */ + for (i = operatorSP - 1; i >= 0; i--) + if ((operatorStack[i].op == REOP_ASSERT) + || (operatorStack[i].op == REOP_ASSERT_NOT) + || (operatorStack[i].op == REOP_LPARENNON) + || (operatorStack[i].op == REOP_LPAREN)) + break; + if (i == -1) { + if (!parseTerm(state)) + goto out; + operand = state->result; + goto pushOperand; + } + ++state->cp; + /* process everything on the stack until the open */ + while (JS_TRUE) { + JS_ASSERT(operatorSP); + --operatorSP; + switch (operatorStack[operatorSP].op) { + case REOP_ASSERT: + case REOP_ASSERT_NOT: + case REOP_LPAREN: + operand = NewRENode(state, operatorStack[operatorSP].op); + if (!operand) + goto out; + operand->u.parenIndex + = operatorStack[operatorSP].parenIndex; + JS_ASSERT(operandSP); + operand->kid = operandStack[operandSP - 1]; + operandStack[operandSP - 1] = operand; + ++state->treeDepth; + /* fall thru... */ + case REOP_LPARENNON: + state->result = operandStack[operandSP - 1]; + if (!parseQuantifier(state)) + goto out; + operandStack[operandSP - 1] = state->result; + goto restartOperator; + default: + if (!processOp(state, &operatorStack[operatorSP], + operandStack, operandSP)) + goto out; + --operandSP; + break; + } + } + break; + default: + /* Anything else is the start of the next term */ + op = REOP_CONCAT; +pushOperator: + if (operatorSP == operatorStackSize) { + operatorStackSize += operatorStackSize; + operatorStack = + (REOpData *)realloc(operatorStack, + sizeof(REOpData) * operatorStackSize); + if (!operatorStack) + goto out; + } + operatorStack[operatorSP].op = op; + operatorStack[operatorSP].errPos = errPos; + operatorStack[operatorSP++].parenIndex = parenIndex; + break; + } + } +out: + if (operatorStack) + JS_free(state->context, operatorStack); + if (operandStack) + JS_free(state->context, operandStack); return result; } -REbool parseDisjunction(REState *parseState) -{ - if (!parseAlternative(parseState)) return RE_FALSE; - - if ((parseState->src != parseState->srcEnd) - && (*parseState->src 
== '|')) { - RENode *altResult; - ++parseState->src; - altResult = newRENode(parseState, REOP_ALT); - if (!altResult) return RE_FALSE; - altResult->child = parseState->result; - if (!parseDisjunction(parseState)) return RE_FALSE; - altResult->data.child2 = parseState->result; - parseState->result = altResult; - parseState->codeLength += 9; /* alt, , ..., goto, */ - } - return RE_TRUE; -} - /* - * Return a single REOP_Empty node for an empty Alternative, - * a single REOP_xxx node for a single term and a next field - * linked list for a list of terms for more than one term. - * Consecutive FLAT1 nodes get combined into a single FLATN + * Extract and return a decimal value at state->cp, the + * initial character 'c' has already been read. */ -REbool parseAlternative(REState *parseState) +static intN +getDecimalValue(jschar c, CompilerState *state) { - RENode *headTerm = NULL; - RENode *tailTerm = NULL; - while (RE_TRUE) { - if ((parseState->src == parseState->srcEnd) - || (*parseState->src == ')') - || (*parseState->src == '|')) { - if (!headTerm) { - parseState->result = newRENode(parseState, REOP_EMPTY); - if (!parseState->result) return RE_FALSE; - } - else - parseState->result = headTerm; - return RE_TRUE; - } - if (!parseTerm(parseState)) return RE_FALSE; - if (headTerm == NULL) - headTerm = parseState->result; - else { - if (tailTerm == NULL) { - if ((headTerm->kind == REOP_FLAT) - && (parseState->result->kind == headTerm->kind) - && (parseState->result->child - == (REchar *)(headTerm->child) - + headTerm->data.flat.length) ) { - headTerm->data.flat.length - += parseState->result->data.flat.length; - free(parseState->result); - } - else { - headTerm->next = parseState->result; - tailTerm = parseState->result; - while (tailTerm->next) tailTerm = tailTerm->next; - } - } - else { - if ((tailTerm->kind == REOP_FLAT) - && (parseState->result->kind == tailTerm->kind) - && (parseState->result->child - == (REchar *)(tailTerm->child) - + tailTerm->data.flat.length) ) { 
- tailTerm->data.flat.length - += parseState->result->data.flat.length; - free(parseState->result); - } - else { - tailTerm->next = parseState->result; - tailTerm = tailTerm->next; - while (tailTerm->next) tailTerm = tailTerm->next; - } - } - } - } -} - -static REint32 getDecimalValue(REchar c, REState *parseState) -{ - REint32 value = RE_UNDEC(c); - while (parseState->src < parseState->srcEnd) { - c = *parseState->src; - if (RE_ISDEC(c)) { - value = (10 * value) + RE_UNDEC(c); - ++parseState->src; - } - else + intN value = JS7_UNDEC(c); + while (state->cp < state->cpend) { + c = *state->cp; + if (!JS7_ISDEC(c)) break; + value = (10 * value) + JS7_UNDEC(c); + ++state->cp; } return value; } -/* calculate the total size of the bitmap required for a class expression */ - -static REbool calculateBitmapSize(REState *pState, RENode *target) - +/* + * Calculate the total size of the bitmap required for a class expression. + */ +static JSBool +calculateBitmapSize(CompilerState *state, RENode *target, const jschar *src, + const jschar *end) { + jschar rangeStart, c; + uintN n, digit, nDigits, i; + uintN max = 0; + JSBool inRange = JS_FALSE; - REchar rangeStart = 0; - const REchar *src = (const REchar *)(target->child); - const REchar *end = target->data.chclass.end; - - REchar c; - REuint32 nDigits; - REuint32 i; - REuint32 max = 0; - REbool inRange = RE_FALSE; - - target->data.chclass.length = 0; + target->u.ucclass.bmsize = 0; + target->u.ucclass.sense = JS_TRUE; if (src == end) - return RE_TRUE; + return JS_TRUE; - if (*src == '^') + if (*src == '^') { ++src; + target->u.ucclass.sense = JS_FALSE; + } while (src != end) { - REuint32 localMax = 0; + uintN localMax = 0; switch (*src) { case '\\': ++src; @@ -401,8 +563,8 @@ static REbool calculateBitmapSize(REState *pState, RENode *target) localMax = 0xB; break; case 'c': - if (((src + 1) < end) && RE_ISLETTER(src[1])) - localMax = (REchar)(*src++ & 0x1F); + if (((src + 1) < end) && RE_IS_LETTER(src[1])) + localMax = 
(jschar)(*src++ & 0x1F); else localMax = '\\'; break; @@ -412,28 +574,26 @@ static REbool calculateBitmapSize(REState *pState, RENode *target) case 'u': nDigits = 4; lexHex: - { - REuint32 n = 0; - for (i = 0; (i < nDigits) && (src < end); i++) { - REuint32 digit; - c = *src++; - if (!isASCIIHexDigit(c, &digit)) { - /* back off to accepting the original - *'\' as a literal - */ - src -= (i + 1); - n = '\\'; - break; - } - n = (n << 4) | digit; + n = 0; + for (i = 0; (i < nDigits) && (src < end); i++) { + c = *src++; + if (!isASCIIHexDigit(c, &digit)) { + /* + * Back off to accepting the original + *'\' as a literal. + */ + src -= (i + 1); + n = '\\'; + break; } - localMax = n; + n = (n << 4) | digit; } + localMax = n; break; case 'd': if (inRange) { - reportRegExpError(&pState->error, RE_WRONG_RANGE); - return RE_FALSE; + JS_ReportErrorNumber(JSMSG_BAD_CLASS_RANGE); + return JS_FALSE; } localMax = '9'; break; @@ -443,11 +603,43 @@ lexHex: case 'w': case 'W': if (inRange) { - reportRegExpError(&pState->error, RE_WRONG_RANGE); - return RE_FALSE; + JS_ReportErrorNumber(JSMSG_BAD_CLASS_RANGE); + return JS_FALSE; } - target->data.chclass.length = 65536; - return RE_TRUE; + target->u.ucclass.bmsize = 65535; + return JS_TRUE; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + /* + * This is a non-ECMA extension - decimal escapes (in this + * case, octal!) are supposed to be an error inside class + * ranges, but supported here for backwards compatibility. 
+ * + */ + n = JS7_UNDEC(c); + c = *src; + if ('0' <= c && c <= '7') { + src++; + n = 8 * n + JS7_UNDEC(c); + c = *src; + if ('0' <= c && c <= '7') { + src++; + i = 8 * n + JS7_UNDEC(c); + if (i <= 0377) + n = i; + else + src--; + } + } + localMax = n; + break; + default: localMax = c; break; @@ -459,779 +651,937 @@ lexHex: } if (inRange) { if (rangeStart > localMax) { - reportRegExpError(&pState->error, RE_WRONG_RANGE); - return RE_FALSE; + JS_ReportErrorNumber(JSMSG_BAD_CLASS_RANGE); + return JS_FALSE; } - inRange = RE_FALSE; + inRange = JS_FALSE; } else { if (src < (end - 1)) { if (*src == '-') { ++src; - inRange = RE_TRUE; - rangeStart = (REchar)localMax; + inRange = JS_TRUE; + rangeStart = (jschar)localMax; continue; } } } - if (pState->flags & RE_IGNORECASE) { - c = canonicalize((REchar)localMax); + if (state->flags & JSREG_FOLD) { + c = canonicalize((jschar)localMax); if (c > localMax) localMax = c; } if (localMax > max) max = localMax; } - target->data.chclass.length = max + 1; - return RE_TRUE; + target->u.ucclass.bmsize = max; + return JS_TRUE; } - -REbool parseTerm(REState *parseState) +/* + * item: assertion An item is either an assertion or + * quantatom a quantified atom. + * + * assertion: '^' Assertions match beginning of string + * (or line if the class static property + * RegExp.multiline is true). + * '$' End of string (or line if the class + * static property RegExp.multiline is + * true). + * '\b' Word boundary (between \w and \W). + * '\B' Word non-boundary. + * + * quantatom: atom An unquantified atom. + * quantatom '{' n ',' m '}' + * Atom must occur between n and m times. + * quantatom '{' n ',' '}' Atom must occur at least n times. + * quantatom '{' n '}' Atom must occur exactly n times. + * quantatom '*' Zero or more times (same as {0,}). + * quantatom '+' One or more times (same as {1,}). + * quantatom '?' Zero or one time (same as {0,1}). + * + * any of which can be optionally followed by '?' 
for ungreedy + * + * atom: '(' regexp ')' A parenthesized regexp (what matched + * can be addressed using a backreference, + * see '\' n below). + * '.' Matches any char except '\n'. + * '[' classlist ']' A character class. + * '[' '^' classlist ']' A negated character class. + * '\f' Form Feed. + * '\n' Newline (Line Feed). + * '\r' Carriage Return. + * '\t' Horizontal Tab. + * '\v' Vertical Tab. + * '\d' A digit (same as [0-9]). + * '\D' A non-digit. + * '\w' A word character, [0-9a-z_A-Z]. + * '\W' A non-word character. + * '\s' A whitespace character, [ \b\f\n\r\t\v]. + * '\S' A non-whitespace character. + * '\' n A backreference to the nth (n decimal + * and positive) parenthesized expression. + * '\' octal An octal escape sequence (octal must be + * two or three digits long, unless it is + * 0 for the null character). + * '\x' hex A hex escape (hex must be two digits). + * '\u' unicode A unicode escape (must be four digits). + * '\c' ctrl A control character, ctrl is a letter. + * '\' literalatomchar Any character except one of the above + * that follow '\' in an atom. + * otheratomchar Any character not first among the other + * atom right-hand sides. 
+ */ +static JSBool +parseTerm(CompilerState *state) { - REchar c = *parseState->src++; - REuint32 nDigits; - REuint32 parenBaseCount = parseState->parenCount; - REuint32 num, tmp; - RENode *term; - REchar *numStart; + jschar c = *state->cp++; + uintN nDigits; + uintN parenBaseCount = state->parenCount; + uintN num, tmp, n, i; + const jschar *termStart; + JSBool foundCachedCopy; switch (c) { /* assertions and atoms */ case '^': - parseState->result = newRENode(parseState, REOP_BOL); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; + state->result = NewRENode(state, REOP_BOL); + if (!state->result) + return JS_FALSE; + state->progLength++; + return JS_TRUE; case '$': - parseState->result = newRENode(parseState, REOP_EOL); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; + state->result = NewRENode(state, REOP_EOL); + if (!state->result) + return JS_FALSE; + state->progLength++; + return JS_TRUE; case '\\': - if (parseState->src < parseState->srcEnd) { - c = *parseState->src++; - switch (c) { - /* assertion escapes */ - case 'b' : - parseState->result = newRENode(parseState, REOP_WBND); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; - case 'B': - parseState->result = newRENode(parseState, REOP_UNWBND); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; - /* Decimal escape */ - case '0': - if (parseState->version == RE_VERSION_1) { - /* octal escape */ -doOctal: - num = 0; - while (parseState->src < parseState->srcEnd) { - c = *parseState->src; - if ((c >= '0') && (c <= '7')) { - parseState->src++; - tmp = 8 * num + (REuint32)RE_UNDEC(c); - if (tmp > 0377) - break; - num = tmp; - } - else - break; - } - parseState->result = newRENode(parseState, REOP_FLAT); - parseState->codeLength += 3; - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = (REchar)(num); - } - else { - parseState->result = newRENode(parseState, 
REOP_FLAT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = 0; - parseState->codeLength += 3; - } - break; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - numStart = parseState->src - 1; - num = (REuint32)getDecimalValue(c, parseState); - if (parseState->version == RE_VERSION_1) { - /* - * n in [8-9] and > count of parentheses, - * then revert to '8' or '9', ignoring the '\' - */ - if (((num == 8) || (num == 9)) - && (num > parseState->parenCount)) { - parseState->result = newRENode(parseState, REOP_FLAT); - parseState->codeLength += 3; - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = (REchar)(num + '0'); - } - /* - * more than 1 digit, or a number greater than - * the count of parentheses => it's an octal - */ - if (((parseState->src - numStart) > 1) - || (num > parseState->parenCount)) { - parseState->src = numStart; - goto doOctal; - } - parseState->result = newRENode(parseState, REOP_BACKREF); - if (!parseState->result) return RE_FALSE; - parseState->result->parenIndex = num - 1; - parseState->codeLength += 3; - } - else { - parseState->result = newRENode(parseState, REOP_BACKREF); - if (!parseState->result) return RE_FALSE; - parseState->result->parenIndex = num - 1; - parseState->codeLength += 3; - } - break; - /* Control escape */ - case 'f': - parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = 0xC; - parseState->codeLength += 3; - break; - case 'n': - parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = 0xA; - parseState->codeLength += 3; - break; - case 'r': - parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = 0xD; - parseState->codeLength += 3; - break; - case 't': - 
parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = 0x9; - parseState->codeLength += 3; - break; - case 'v': - parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = 0xB; - parseState->codeLength += 3; - break; - /* Control letter */ - case 'c': - parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - if (((parseState->src + 1) < parseState->srcEnd) && - RE_ISLETTER(parseState->src[1])) - parseState->result->data.flat.ch - = (REchar)(*parseState->src++ & 0x1F); - else { - /* back off to accepting the original '\' as a literal */ - --parseState->src; - parseState->result->data.flat.ch = '\\'; - parseState->result->child = (void *)parseState->src; - } - parseState->codeLength += 3; - break; - /* HexEscapeSequence */ - case 'x': - nDigits = 2; - goto lexHex; - /* UnicodeEscapeSequence */ - case 'u': - nDigits = 4; -lexHex: - parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - { - REuint32 n = 0; - REuint32 i; - for (i = 0; (i < nDigits) - && (parseState->src < parseState->srcEnd); i++) { - REuint32 digit; - c = *parseState->src++; - if (!isASCIIHexDigit(c, &digit)) { - /* - * back off to accepting the original - * 'u' or 'x' as a literal - */ - parseState->src -= (i + 2); - n = *parseState->src++; - break; - } - n = (n << 4) | digit; - } - parseState->result->data.flat.ch = (REchar)(n); - } - parseState->codeLength += 3; - break; - /* Character class escapes */ - case 'd': - parseState->result = newRENode(parseState, REOP_DEC); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; - case 'D': - parseState->result = newRENode(parseState, REOP_UNDEC); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; - case 's': - parseState->result = newRENode(parseState, 
REOP_WS); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; - case 'S': - parseState->result = newRENode(parseState, REOP_UNWS); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; - case 'w': - parseState->result = newRENode(parseState, REOP_LETDIG); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; - case 'W': - parseState->result = newRENode(parseState, REOP_UNLETDIG); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; - /* IdentityEscape */ - default: - parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = c; - parseState->result->child = (void *)(parseState->src - 1); - parseState->codeLength += 3; - break; - } - break; - } - else { + if (state->cp >= state->cpend) { /* a trailing '\' is an error */ - reportRegExpError(&parseState->error, RE_TRAILING_SLASH); - return RE_FALSE; + js_ReportCompileErrorNumber(JSMSG_TRAILING_SLASH, state->cp); + return JS_FALSE; } - case '(': - { - RENode *result = NULL; - if ((*parseState->src == '?') - && ( (parseState->src[1] == '=') - || (parseState->src[1] == '!') - || (parseState->src[1] == ':') )) { - ++parseState->src; - switch (*parseState->src++) { - case '=': - result = newRENode(parseState, REOP_ASSERT); - if (!result) return RE_FALSE; - /* ASSERT, , ... ASSERTTEST */ - parseState->codeLength += 4; - break; - case '!': - result = newRENode(parseState, REOP_ASSERTNOT); - if (!result) return RE_FALSE; - /* ASSERTNOT, , ... 
ASSERTNOTTEST */ - parseState->codeLength += 4; + c = *state->cp++; + switch (c) { + /* assertion escapes */ + case 'b' : + state->result = NewRENode(state, REOP_WBDRY); + if (!state->result) + return JS_FALSE; + state->progLength++; + return JS_TRUE; + case 'B': + state->result = NewRENode(state, REOP_WNONBDRY); + if (!state->result) + return JS_FALSE; + state->progLength++; + return JS_TRUE; + /* Decimal escape */ + case '0': + if (JS_HAS_STRICT_OPTION(state->context)) + c = 0; + else { + doOctal: + num = 0; + while (state->cp < state->cpend) { + if ('0' <= (c = *state->cp) && c <= '7') { + state->cp++; + tmp = 8 * num + (uintN)JS7_UNDEC(c); + if (tmp > 0377) + break; + num = tmp; + } + else + break; + } + c = (jschar)(num); + } + doFlat: + state->result = NewRENode(state, REOP_FLAT); + if (!state->result) + return JS_FALSE; + state->result->u.flat.chr = c; + state->result->u.flat.length = 1; + state->progLength += 3; + break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + termStart = state->cp - 1; + num = (uintN)getDecimalValue(c, state); + if (num > 9 && + num > state->parenCount && + !JS_HAS_STRICT_OPTION(state->context)) { + state->cp = termStart; + goto doOctal; + } + state->result = NewRENode(state, REOP_BACKREF); + if (!state->result) + return JS_FALSE; + state->result->u.parenIndex = num - 1; + state->progLength += 3; + break; + /* Control escape */ + case 'f': + c = 0xC; + goto doFlat; + case 'n': + c = 0xA; + goto doFlat; + case 'r': + c = 0xD; + goto doFlat; + case 't': + c = 0x9; + goto doFlat; + case 'v': + c = 0xB; + goto doFlat; + /* Control letter */ + case 'c': + if (((state->cp + 1) < state->cpend) && + RE_IS_LETTER(state->cp[1])) + c = (jschar)(*state->cp++ & 0x1F); + else { + /* back off to accepting the original '\' as a literal */ + --state->cp; + c = '\\'; + } + goto doFlat; + /* HexEscapeSequence */ + case 'x': + nDigits = 2; + goto lexHex; + /* UnicodeEscapeSequence */ + case 
'u': + nDigits = 4; +lexHex: + n = 0; + for (i = 0; (i < nDigits) + && (state->cp < state->cpend); i++) { + uintN digit; + c = *state->cp++; + if (!isASCIIHexDigit(c, &digit)) { + /* + * back off to accepting the original + * 'u' or 'x' as a literal + */ + state->cp -= (i + 2); + n = *state->cp++; break; } + n = (n << 4) | digit; } - else { - result = newRENode(parseState, REOP_PAREN); - /* PAREN, , ... CLOSEPAREN, */ - parseState->codeLength += 6; - if (!result) return RE_FALSE; - result->parenIndex = parseState->parenCount++; - } - if (!parseDisjunction(parseState)) return RE_FALSE; - if ((parseState->src == parseState->srcEnd) - || (*parseState->src != ')')) { - reportRegExpError(&parseState->error, RE_UNCLOSED_PAREN); - return RE_FALSE; - } - else { - ++parseState->src; - } - if (result) { - result->child = parseState->result; - parseState->result = result; - } + c = (jschar)(n); + goto doFlat; + /* Character class escapes */ + case 'd': + state->result = NewRENode(state, REOP_DIGIT); +doSimple: + if (!state->result) + return JS_FALSE; + state->progLength++; + break; + case 'D': + state->result = NewRENode(state, REOP_NONDIGIT); + goto doSimple; + case 's': + state->result = NewRENode(state, REOP_SPACE); + goto doSimple; + case 'S': + state->result = NewRENode(state, REOP_NONSPACE); + goto doSimple; + case 'w': + state->result = NewRENode(state, REOP_ALNUM); + goto doSimple; + case 'W': + state->result = NewRENode(state, REOP_NONALNUM); + goto doSimple; + /* IdentityEscape */ + default: + state->result = NewRENode(state, REOP_FLAT); + if (!state->result) + return JS_FALSE; + state->result->u.flat.chr = c; + state->result->u.flat.length = 1; + state->result->kid = (void *)(state->cp - 1); + state->progLength += 3; break; } + break; case '[': - parseState->result = newRENode(parseState, REOP_CLASS); - if (!parseState->result) return RE_FALSE; - parseState->result->child = (void *)(parseState->src); - while (RE_TRUE) { - if (parseState->src == parseState->srcEnd) 
{ - reportRegExpError(&parseState->error, RE_UNCLOSED_CLASS); - return RE_FALSE; - } - if (*parseState->src == '\\') { - ++parseState->src; - if (RE_ISDEC(*parseState->src)) { - reportRegExpError(&parseState->error, RE_BACKREF_IN_CLASS); - return RE_FALSE; - } + state->result = NewRENode(state, REOP_CLASS); + if (!state->result) + return JS_FALSE; + termStart = state->cp; + state->result->u.ucclass.startIndex = termStart - state->cpbegin; + while (JS_TRUE) { + if (state->cp == state->cpend) { + js_ReportCompileErrorNumber(JSMSG_UNTERM_CLASS, termStart); + return JS_FALSE; } + if (*state->cp == '\\') + state->cp++; else { - if (*parseState->src == ']') { - parseState->result->data.chclass.end = parseState->src++; + if (*state->cp == ']') { + state->result->u.ucclass.kidlen = state->cp - termStart; break; } } - ++parseState->src; + state->cp++; } - parseState->result->data.chclass.classIndex = (int32)parseState->classCount++; - /* Call calculateBitmapSize now as we want any errors it finds - to be reported during the parse phase, not at execution */ - if (!calculateBitmapSize(parseState, parseState->result)) - return RE_FALSE; - parseState->codeLength += 3; /* CLASS, */ + foundCachedCopy = JS_FALSE; + for (i = 0; i < CLASS_CACHE_SIZE; i++) { + if (state->classCache[i].start) { + if (state->classCache[i].length == state->result->u.ucclass.kidlen) { + foundCachedCopy = JS_TRUE; + for (n = 0; n < state->classCache[i].length; n++) { + if (state->classCache[i].start[n] != termStart[n]) { + foundCachedCopy = JS_FALSE; + break; + } + } + if (foundCachedCopy) { + state->result->u.ucclass.index = state->classCache[i].index; + break; + } + } + } + else { + state->classCache[i].start = termStart; + state->classCache[i].length = state->result->u.ucclass.kidlen; + state->classCache[i].index = state->classCount; + break; + } + } + if (!foundCachedCopy) + state->result->u.ucclass.index = state->classCount++; + /* + * Call calculateBitmapSize now as we want any errors it finds + * 
to be reported during the parse phase, not at execution. + */ + if (!calculateBitmapSize(state, state->result, termStart, state->cp++)) + return JS_FALSE; + state->progLength += 3; /* CLASS, */ break; case '.': - parseState->result = newRENode(parseState, REOP_DOT); - if (!parseState->result) return RE_FALSE; - parseState->codeLength++; - break; + state->result = NewRENode(state, REOP_DOT); + goto doSimple; + case '*': + case '+': + case '?': + js_ReportCompileErrorNumber(JSMSG_BAD_QUANTIFIER, state->cp - 1); + return JS_FALSE; default: - parseState->result = newRENode(parseState, REOP_FLAT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.flat.ch = c; - parseState->result->child = (void *)(parseState->src - 1); - parseState->codeLength += 3; + state->result = NewRENode(state, REOP_FLAT); + if (!state->result) + return JS_FALSE; + state->result->u.flat.chr = c; + state->result->u.flat.length = 1; + state->result->kid = (void *)(state->cp - 1); + state->progLength += 3; break; } + return parseQuantifier(state); +} - term = parseState->result; - if (parseState->src < parseState->srcEnd) { - switch (*parseState->src) { +static JSBool +parseQuantifier(CompilerState *state) +{ + RENode *term; + term = state->result; + if (state->cp < state->cpend) { + switch (*state->cp) { case '+': - parseState->result = newRENode(parseState, REOP_QUANT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.quantifier.min = 1; - parseState->result->data.quantifier.max = -1; - /* , , , ... */ - parseState->codeLength += 8; + state->result = NewRENode(state, REOP_QUANT); + if (!state->result) + return JS_FALSE; + state->result->u.range.min = 1; + state->result->u.range.max = -1; + /* , ... 
*/ + state->progLength += 4; goto quantifier; case '*': - parseState->result = newRENode(parseState, REOP_QUANT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.quantifier.min = 0; - parseState->result->data.quantifier.max = -1; - /* , , , ... */ - parseState->codeLength += 8; + state->result = NewRENode(state, REOP_QUANT); + if (!state->result) + return JS_FALSE; + state->result->u.range.min = 0; + state->result->u.range.max = -1; + /* , ... */ + state->progLength += 4; goto quantifier; case '?': - parseState->result = newRENode(parseState, REOP_QUANT); - if (!parseState->result) return RE_FALSE; - parseState->result->data.quantifier.min = 0; - parseState->result->data.quantifier.max = 1; - /* , , , ... */ - parseState->codeLength += 8; + state->result = NewRENode(state, REOP_QUANT); + if (!state->result) + return JS_FALSE; + state->result->u.range.min = 0; + state->result->u.range.max = 1; + /* , ... */ + state->progLength += 4; goto quantifier; - case '{': + case '{': /* balance '}' */ { - REint32 min = 0; - REint32 max = -1; - REchar c; - ++parseState->src; + intN err; + intN min = 0; + intN max = -1; + jschar c; + const jschar *errp = state->cp++; - parseState->result = newRENode(parseState, REOP_QUANT); - if (!parseState->result) return RE_FALSE; + c = *state->cp; + if (JS7_ISDEC(c)) { + ++state->cp; + min = getDecimalValue(c, state); + c = *state->cp; + } + else { + /* For Perl etc. compatibility, if a curly is not + * followed by a proper digit, back off from it + * being a quantifier, and chew it up as a literal + * atom next time instead. 
+ */ + --state->cp; + return JS_TRUE; + } + state->result = NewRENode(state, REOP_QUANT); + if (!state->result) + return JS_FALSE; - c = *parseState->src; - if (RE_ISDEC(c)) { - ++parseState->src; - min = getDecimalValue(c, parseState); - c = *parseState->src; + if (min >> 16) { + err = JSMSG_MIN_TOO_BIG; + goto quantError; } if (c == ',') { - c = *++parseState->src; - if (RE_ISDEC(c)) { - ++parseState->src; - max = getDecimalValue(c, parseState); - c = *parseState->src; + c = *++state->cp; + if (JS7_ISDEC(c)) { + ++state->cp; + max = getDecimalValue(c, state); + c = *state->cp; + if (max >> 16) { + err = JSMSG_MAX_TOO_BIG; + goto quantError; + } + if (min > max) { + err = JSMSG_OUT_OF_ORDER; + goto quantError; + } } } - else + else { max = min; - parseState->result->data.quantifier.min = min; - parseState->result->data.quantifier.max = max; - /* QUANT, , , , - , ... */ - parseState->codeLength += 12; + } + state->result->u.range.min = min; + state->result->u.range.max = max; + /* QUANT, , , ... 
*/ + state->progLength += 8; + /* balance '{' */ if (c == '}') goto quantifier; else { - reportRegExpError(&parseState->error, RE_UNCLOSED_BRACKET); - return RE_FALSE; + err = JSMSG_UNTERM_QUANTIFIER; +quantError: + js_ReportCompileErrorNumber(err, errp); + return JS_FALSE; } } } } - return RE_TRUE; + return JS_TRUE; quantifier: - ++parseState->src; - parseState->result->child = term; - parseState->result->parenIndex = parenBaseCount; - parseState->result->data.quantifier.parenCount - = (REint32)(parseState->parenCount - parenBaseCount); - if ((parseState->src < parseState->srcEnd) && (*parseState->src == '?')) { - ++parseState->src; - parseState->result->data.quantifier.greedy = RE_FALSE; + ++state->treeDepth; + ++state->cp; + state->result->kid = term; + if ((state->cp < state->cpend) && (*state->cp == '?')) { + ++state->cp; + state->result->u.range.greedy = JS_FALSE; } else - parseState->result->data.quantifier.greedy = RE_TRUE; - return RE_TRUE; + state->result->u.range.greedy = JS_TRUE; + return JS_TRUE; } - - - - +#define CHECK_OFFSET(diff) (JS_ASSERT(((diff) >= -32768) && ((diff) <= 32767))) +#define SET_OFFSET(pc,off) ((pc)[0] = JUMP_OFFSET_HI(off), \ + (pc)[1] = JUMP_OFFSET_LO(off)) +#define GET_OFFSET(pc) ((int16)(((pc)[0] << 8) | (pc)[1])) +#define OFFSET_LEN (2) +#define GET_ARG(pc) GET_OFFSET(pc) +#define SET_ARG(pc,arg) SET_OFFSET(pc,arg) +#define ARG_LEN OFFSET_LEN /* -1. Let e be x's endIndex. -2. If e is zero, return RE_TRUE. -3. If Multiline is RE_FALSE, return RE_FALSE. -4. If the character Input[e-1] is one of the line terminator characters , - , , or , return RE_TRUE. -5. Return RE_FALSE. -*/ -static REMatchState *bolMatcher(REGlobalData *gData, REMatchState *x) + * Recursively generate bytecode for the tree rooted at t. Iteratively. 
+ */ + +typedef struct { + RENode *nextAlt; + jsbytecode *nextAltFixup, *nextTermFixup, *endTermFixup; + RENode *continueNode; + REOp continueOp; +} EmitStateStackEntry; + +static jsbytecode * +emitREBytecode(CompilerState *state, JSRegExp *re, intN treeDepth, + jsbytecode *pc, RENode *t) { - REuint32 e = (REuint32)x->endIndex; - if (e != 0) { - if (gData->globalMultiline || - (gData->regexp->flags & RE_MULTILINE)) { - if (!RE_ISLINETERM(gData->input[e - 1])) - return NULL; - } - else + ptrdiff_t diff; + RECharSet *charSet; + EmitStateStackEntry *emitStateSP, *emitStateStack = NULL; + REOp op; + + if (treeDepth) { + emitStateStack = + (EmitStateStackEntry *)JS_malloc(state->context, + sizeof(EmitStateStackEntry) + * treeDepth); + if (!emitStateStack) return NULL; } - return x; -} + emitStateSP = emitStateStack; + op = t->op; -/* -1. Let e be x's endIndex. -2. If e is equal to InputLength, return RE_TRUE. -3. If multiline is RE_FALSE, return RE_FALSE. -4. If the character Input[e] is one of the line terminator characters , - , , or , return RE_TRUE. -5. Return RE_FALSE. 
-*/ -static REMatchState *eolMatcher(REGlobalData *gData, REMatchState *x) -{ - REint32 e = x->endIndex; - if (e != gData->length) { - if (gData->globalMultiline || - (gData->regexp->flags & RE_MULTILINE)) { - if (!RE_ISLINETERM(gData->input[e])) - return NULL; + while (JS_TRUE) { + *pc++ = op; + switch (op) { + case REOP_EMPTY: + --pc; + break; + + case REOP_ALTPREREQ2: + case REOP_ALTPREREQ: + JS_ASSERT(emitStateSP); + emitStateSP->endTermFixup = pc; + pc += OFFSET_LEN; + SET_ARG(pc, t->u.altprereq.ch1); + pc += ARG_LEN; + SET_ARG(pc, t->u.altprereq.ch2); + pc += ARG_LEN; + + emitStateSP->nextAltFixup = pc; /* address of next alternate */ + pc += OFFSET_LEN; + + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_JUMP; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + + case REOP_JUMP: + emitStateSP->nextTermFixup = pc; /* address of following term */ + pc += OFFSET_LEN; + diff = pc - emitStateSP->nextAltFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->nextAltFixup, diff); + emitStateSP->continueOp = REOP_ENDALT; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->u.kid2); + op = t->op; + continue; + + case REOP_ENDALT: + diff = pc - emitStateSP->nextTermFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->nextTermFixup, diff); + if (t->op != REOP_ALT) { + diff = pc - emitStateSP->endTermFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->endTermFixup, diff); + } + break; + + case REOP_ALT: + JS_ASSERT(emitStateSP); + emitStateSP->nextAltFixup = pc; /* address of pointer to next alternate */ + pc += OFFSET_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_JUMP; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + + case REOP_FLAT: + /* + * Consecutize FLAT's if possible. 
+ */ + if (t->kid) { + while (t->next && (t->next->op == REOP_FLAT) + && (((jschar*)(t->kid) + t->u.flat.length) + == (jschar*)(t->next->kid))) { + t->u.flat.length += t->next->u.flat.length; + t->next = t->next->next; + } + } + if (t->kid && (t->u.flat.length > 1)) { + if (state->flags & JSREG_FOLD) + pc[-1] = REOP_FLATi; + else + pc[-1] = REOP_FLAT; + SET_ARG(pc, (jschar *)(t->kid) - state->cpbegin); + pc += ARG_LEN; + SET_ARG(pc, t->u.flat.length); + pc += ARG_LEN; + } + else { + if (t->u.flat.chr < 256) { + if (state->flags & JSREG_FOLD) + pc[-1] = REOP_FLAT1i; + else + pc[-1] = REOP_FLAT1; + *pc++ = (jsbytecode)(t->u.flat.chr); + } + else { + if (state->flags & JSREG_FOLD) + pc[-1] = REOP_UCFLAT1i; + else + pc[-1] = REOP_UCFLAT1; + SET_ARG(pc, t->u.flat.chr); + pc += ARG_LEN; + } + } + break; + + case REOP_LPAREN: + JS_ASSERT(emitStateSP); + SET_ARG(pc, t->u.parenIndex); + pc += ARG_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_RPAREN; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + case REOP_RPAREN: + SET_ARG(pc, t->u.parenIndex); + pc += ARG_LEN; + break; + + case REOP_BACKREF: + SET_ARG(pc, t->u.parenIndex); + pc += ARG_LEN; + break; + case REOP_ASSERT: + JS_ASSERT(emitStateSP); + emitStateSP->nextTermFixup = pc; + pc += OFFSET_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_ASSERTTEST; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + case REOP_ASSERTTEST: + case REOP_ASSERTNOTTEST: + diff = pc - emitStateSP->nextTermFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->nextTermFixup, diff); + break; + case REOP_ASSERT_NOT: + JS_ASSERT(emitStateSP); + emitStateSP->nextTermFixup = pc; + pc += OFFSET_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_ASSERTNOTTEST; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= 
treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + case REOP_QUANT: + JS_ASSERT(emitStateSP); + if ((t->u.range.min == 0) && (t->u.range.max == (uint16)(-1))) + pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR; + else + if ((t->u.range.min == 0) && (t->u.range.max == 1)) + pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT; + else + if ((t->u.range.min == 1) && (t->u.range.max == (uint16)(-1))) + pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS; + else { + if (!t->u.range.greedy) pc[-1] = REOP_MINIMALQUANT; + SET_ARG(pc, t->u.range.min); + pc += ARG_LEN; + SET_ARG(pc, t->u.range.max); + pc += ARG_LEN; + } + emitStateSP->nextTermFixup = pc; + pc += OFFSET_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_ENDCHILD; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + case REOP_ENDCHILD: + diff = pc - emitStateSP->nextTermFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->nextTermFixup, diff); + break; + case REOP_CLASS: + if (!t->u.ucclass.sense) + pc[-1] = REOP_NCLASS; + SET_ARG(pc, t->u.ucclass.index); + pc += ARG_LEN; + charSet = &re->classList[t->u.ucclass.index]; + charSet->converted = JS_FALSE; + charSet->length = t->u.ucclass.bmsize; + charSet->u.src.startIndex = t->u.ucclass.startIndex; + charSet->u.src.length = t->u.ucclass.kidlen; + charSet->sense = t->u.ucclass.sense; + break; + default: + break; + } + t = t->next; + if (t == NULL) { + if (emitStateSP == emitStateStack) + break; + --emitStateSP; + t = emitStateSP->continueNode; + op = emitStateSP->continueOp; } else - return NULL; + op = t->op; } - return x; -} - - -/* -1. If e == -1 or e == InputLength, return RE_FALSE. -2. Let c be the character Input[e]. -3. If c is one of the sixty-three characters in the table below, return RE_TRUE. 
-a b c d e f g h i j k l m n o p q r s t u v w x y z -A B C D E F G H I J K L M N O P Q R S T U V W X Y Z -0 1 2 3 4 5 6 7 8 9 _ -4. Return RE_FALSE. -*/ -static REbool isWordChar(REint32 e, REGlobalData *gData) -{ - REchar c; - if ((e == -1) || (e == (REint32)(gData->length))) - return RE_FALSE; - c = gData->input[e]; - if (RE_ISLETDIG(c) || (c == '_')) - return RE_TRUE; - return RE_FALSE; + if (emitStateStack) + JS_free(state->context, emitStateStack); + return pc; } /* -1. Let e be x's endIndex. -2. Call IsWordChar(e-1) and let a be the boolean result. -3. Call IsWordChar(e) and let b be the boolean result. -for '\b' -4. If a is RE_TRUE and b is RE_FALSE, return RE_TRUE. -5. If a is RE_FALSE and b is RE_TRUE, return RE_TRUE. -6. Return RE_FALSE. - -for '\B' -4. If a is RE_TRUE and b is RE_FALSE, return RE_FALSE. -5. If a is RE_FALSE and b is RE_TRUE, return RE_FALSE. -6. Return RE_TRUE. -*/ -static REMatchState *wbndMatcher(REGlobalData *gData, REMatchState *x, REbool sense) + * Save the current state of the match - the position in the input + * text as well as the position in the bytecode. The state of any + * parent expressions is also saved (preceding state). + * Contents of parenCount parentheses from parenIndex are also saved. 
+ */ +static REBackTrackData * +pushBackTrackState(REGlobalData *gData, REOp op, + jsbytecode *target, REMatchState *x, const jschar *cp, + intN parenIndex, intN parenCount) { - REint32 e = (REint32)(x->endIndex); + intN i; + REBackTrackData *result + = (REBackTrackData *)((char *)(gData->backTrackSP) + gData->cursz); - REbool a = isWordChar(e - 1, gData); - REbool b = isWordChar(e, gData); + size_t sz = sizeof(REBackTrackData) + + gData->stateStackTop * sizeof(REProgState) + + parenCount * sizeof(RECapture); + + + if (((char *)result + sz) + > (char *)gData->backTrackStack + gData->maxBackTrack) { + ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack; + gData->backTrackStack + = (REBackTrackData *)realloc(gData->backTrackStack, + gData->maxBackTrack + + gData->maxBackTrack); + gData->maxBackTrack <<= 1; + if (!gData->backTrackStack) + return NULL; + result = (REBackTrackData *)((char *)gData->backTrackStack + offset); + } + gData->backTrackSP = result; + result->sz = gData->cursz; + gData->cursz = sz; + + result->backtrack_op = op; + result->backtrack_pc = target; + result->cp = cp; + result->parenCount = parenCount; + + result->precedingStateTop = gData->stateStackTop; + JS_ASSERT(gData->stateStackTop); + memcpy(result + 1, gData->stateStack, + sizeof(REProgState) * result->precedingStateTop); - if (sense) { - if ((a && !b) || (!a && b)) - return x; - else - return NULL; - } - else { - if ((a && !b) || (!a && b)) - return NULL; - else - return x; + if (parenCount != -1) { + result->parenIndex = parenIndex; + memcpy((char *)(result + 1) + + sizeof(REProgState) * result->precedingStateTop, + &x->parens[parenIndex], + sizeof(RECapture) * parenCount); + for (i = 0; i < parenCount; i++) + x->parens[parenIndex + i].index = -1; } + + return result; } + /* -1. Let A be the set of all characters except the four line terminator - characters , , , or . -2. Call CharacterSetMatcher(A, RE_FALSE) and return its Matcher result. 
-*/ -static REMatchState *dotMatcher(REGlobalData *gData, REMatchState *x) + * Consecutive literal characters. + */ +static REMatchState * +flatNMatcher(REGlobalData *gData, REMatchState *x, const jschar *matchChars, + intN length) { - REchar ch; - REint32 e = x->endIndex; - if (e == gData->length) - return NULL; - ch = gData->input[e]; - if (RE_ISLINETERM(ch)) - return NULL; - x->endIndex++; - return x; -} - -/* - \d evaluates by returning the ten-element set of characters containing the - characters 0 through 9 inclusive. - \D evaluates by returning the set of all characters not included in the set - returned by \d. -*/ -static REMatchState *decMatcher(REGlobalData *gData, REMatchState *x, REbool sense) -{ - REchar ch; - REint32 e = x->endIndex; - if (e == gData->length) - return NULL; - ch = gData->input[e]; - if (RE_ISDEC(ch) != sense) - return NULL; - x->endIndex++; - return x; -} - -/* - \s evaluates by returning the set of characters containing - the characters that are on the right-hand side of the WhiteSpace - (section 7.2) or LineTerminator (section 7.3) productions. - \S evaluates by returning the set of all characters not - included in the set returned by \s. -*/ -static REMatchState *wsMatcher(REGlobalData *gData, REMatchState *x, REbool sense) -{ - REchar ch; - REint32 e = x->endIndex; - if (e == gData->length) - return NULL; - ch = gData->input[e]; - if (RE_ISSPACE(ch) != sense) - return NULL; - x->endIndex++; - return x; -} - -/* - \w evaluates by returning the set of characters containing the sixty-three - characters: - a b c d e f g h i j k l m n o p q r s t u v w x y z - A B C D E F G H I J K L M N O P Q R S T U V W X Y Z - 0 1 2 3 4 5 6 7 8 9 _ - \W evaluates by returning the set of all characters not included in the set - returned by \w. 
-*/ -static REMatchState *letdigMatcher(REGlobalData *gData, REMatchState *x, REbool sense) -{ - REchar ch; - REint32 e = x->endIndex; - if (e == gData->length) - return NULL; - ch = gData->input[e]; - if ((RE_ISLETDIG(ch) || (ch == '_')) != sense) - return NULL; - x->endIndex++; - return x; -} - -/* -1. Return an internal Matcher closure that takes two arguments, a State x -and a Continuation c, and performs the following: - 1. Let e be x's endIndex. - 2. If e == InputLength, return failure. - 3. Let c be the character Input[e]. - 4. Let cc be the result of Canonicalize(c). - 5. If invert is RE_TRUE, go to step 8. - 6. If there does not exist a member a of set A such that Canonicalize(a) - == cc, then return failure. - 7. Go to step 9. - 8. If there exists a member a of set A such that Canonicalize(a) == cc, - then return failure. - 9. Let cap be x's captures internal array. - 10. Let y be the State (e+1, cap). - 11. Call c(y) and return its result. -*/ -static REMatchState *flatMatcher(REGlobalData *gData, REMatchState *x, REchar matchCh) -{ - REchar ch; - REint32 e = x->endIndex; - if (e == gData->length) - return NULL; - ch = gData->input[e]; - - if (ch != matchCh) - return NULL; - x->endIndex++; - return x; -} - -static REMatchState *flatIMatcher(REGlobalData *gData, REMatchState *x, REchar matchCh) -{ - REchar ch; - REint32 e = x->endIndex; - if (e == gData->length) - return NULL; - ch = gData->input[e]; - - if (canonicalize(ch) != canonicalize(matchCh)) - return NULL; - x->endIndex++; - return x; -} - -/* - Consecutive literal characters. 
-*/ -static REMatchState *flatNMatcher(REGlobalData *gData, REMatchState *x, - REchar *matchChars, REint32 length) -{ - REint32 e = x->endIndex; - REint32 i; - if ((e + length) > gData->length) + intN i; + if ((x->cp + length) > gData->cpend) return NULL; for (i = 0; i < length; i++) { - if (matchChars[i] != gData->input[e + i]) + if (matchChars[i] != x->cp[i]) return NULL; } - x->endIndex += length; + x->cp += length; return x; } -static REMatchState *flatNIMatcher(REGlobalData *gData, REMatchState *x, - REchar *matchChars, REint32 length) +static REMatchState * +flatNIMatcher(REGlobalData *gData, REMatchState *x, const jschar *matchChars, + intN length) { - REint32 e = x->endIndex; - REint32 i; - if ((e + length) > gData->length) + intN i; + if ((x->cp + length) > gData->cpend) return NULL; for (i = 0; i < length; i++) { if (canonicalize(matchChars[i]) - != canonicalize(gData->input[e + i])) + != canonicalize(x->cp[i])) return NULL; } - x->endIndex += length; + x->cp += length; return x; } -/* Add a single character to the RECharSet */ - -static void addCharacterToCharSet(RECharSet *cs, REchar c) - +/* + * 1. Evaluate DecimalEscape to obtain an EscapeValue E. + * 2. If E is not a character then go to step 6. + * 3. Let ch be E's character. + * 4. Let A be a one-element RECharSet containing the character ch. + * 5. Call CharacterSetMatcher(A, false) and return its Matcher result. + * 6. E must be an integer. Let n be that integer. + * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception. + * 8. Return an internal Matcher closure that takes two arguments, a State x + * and a Continuation c, and performs the following: + * 1. Let cap be x's captures internal array. + * 2. Let s be cap[n]. + * 3. If s is undefined, then call c(x) and return its result. + * 4. Let e be x's endIndex. + * 5. Let len be s's length. + * 6. Let f be e+len. + * 7. If f>InputLength, return failure. + * 8. 
If there exists an integer i between 0 (inclusive) and len (exclusive) + * such that Canonicalize(s[i]) is not the same character as + * Canonicalize(Input [e+i]), then return failure. + * 9. Let y be the State (f, cap). + * 10. Call c(y) and return its result. + */ +static REMatchState * +backrefMatcher(REGlobalData *gData, REMatchState *x, uintN parenIndex) { - REuint32 byteIndex = (REuint32)(c / 8); - ASSERT(c < cs->length); - cs->bits[byteIndex] |= 1 << (c & 0x7); + uintN len; + uintN i; + const jschar *parenContent; + RECapture *s = &x->parens[parenIndex]; + if (s->index == -1) + return x; + + len = s->length; + if ((x->cp + len) > gData->cpend) + return NULL; + + parenContent = &gData->cpbegin[s->index]; + if (gData->regexp->flags & JSREG_FOLD) { + for (i = 0; i < len; i++) { + if (canonicalize(parenContent[i]) + != canonicalize(x->cp[i])) + return NULL; + } + } + else { + for (i = 0; i < len; i++) { + if (parenContent[i] != x->cp[i]) + return NULL; + } + } + x->cp += len; + return x; +} + + +/* Add a single character to the RECharSet */ +static void +addCharacterToCharSet(RECharSet *cs, jschar c) +{ + uintN byteIndex = (uintN)(c / 8); + JS_ASSERT(c <= cs->length); + cs->u.bits[byteIndex] |= 1 << (c & 0x7); } /* Add a character range, c1 to c2 (inclusive) to the RECharSet */ - -static void addCharacterRangeToCharSet(RECharSet *cs, REchar c1, REchar c2) - +static void +addCharacterRangeToCharSet(RECharSet *cs, jschar c1, jschar c2) { - REuint32 i; + uintN i; - REuint32 byteIndex1 = (REuint32)(c1 / 8); - REuint32 byteIndex2 = (REuint32)(c2 / 8); + uintN byteIndex1 = (uintN)(c1 / 8); + uintN byteIndex2 = (uintN)(c2 / 8); - ASSERT((c2 <= cs->length) && (c1 <= c2)); + JS_ASSERT((c2 <= cs->length) && (c1 <= c2)); c1 &= 0x7; c2 &= 0x7; - if (byteIndex1 == byteIndex2) { - cs->bits[byteIndex1] |= ((REuint8)(0xFF) >> (7 - (c2 - c1))) << c1; - } + if (byteIndex1 == byteIndex2) + cs->u.bits[byteIndex1] |= ((uint8)(0xFF) >> (7 - (c2 - c1))) << c1; else { - 
cs->bits[byteIndex1] |= 0xFF << c1; + cs->u.bits[byteIndex1] |= 0xFF << c1; for (i = byteIndex1 + 1; i < byteIndex2; i++) - cs->bits[i] = 0xFF; - cs->bits[byteIndex2] |= (REuint8)(0xFF) >> (7 - c2); + cs->u.bits[i] = 0xFF; + cs->u.bits[byteIndex2] |= (uint8)(0xFF) >> (7 - c2); } } - /* Compile the source of the class into a RECharSet */ - -static REbool processCharSet(REState *pState, RENode *target) +static JSBool +processCharSet(REGlobalData *gData, RECharSet *charSet) { - REchar rangeStart = 0, thisCh; - const REchar *src = (const REchar *)(target->child); - const REchar *end = target->data.chclass.end; + const jschar *src = JSSTRING_CHARS(gData->regexp->source) + + charSet->u.src.startIndex; + const jschar *end = src + charSet->u.src.length; - REuint32 byteLength; - REchar c; - REint32 nDigits; - REint32 i; - REbool inRange = RE_FALSE; + jschar rangeStart, thisCh; + uintN byteLength; + jschar c; + uintN n; + intN nDigits; + intN i; + JSBool inRange = JS_FALSE; + + JS_ASSERT(!charSet->converted); + charSet->converted = JS_TRUE; - RECharSet *charSet = &pState->classList[target->data.chclass.classIndex]; - charSet->length = target->data.chclass.length; - charSet->sense = RE_TRUE; byteLength = (charSet->length / 8) + 1; - charSet->bits = (REuint8 *)malloc(byteLength); - if (!charSet->bits) - return RE_FALSE; - memset(charSet->bits, 0, byteLength); + charSet->u.bits = (uint8 *)JS_malloc(gData->cx, byteLength); + if (!charSet->u.bits) + return JS_FALSE; + memset(charSet->u.bits, 0, byteLength); - if (src == end) { - return RE_TRUE; - } + if (src == end) + return JS_TRUE; if (*src == '^') { - charSet->sense = RE_FALSE; + JS_ASSERT(charSet->sense == JS_FALSE); ++src; } + else + JS_ASSERT(charSet->sense == JS_TRUE); + while (src != end) { switch (*src) { @@ -1244,7 +1594,6 @@ static REbool processCharSet(REState *pState, RENode *target) break; case 'f': thisCh = 0xC; - addCharacterToCharSet(charSet, 0xC); break; case 'n': thisCh = 0xA; @@ -1259,8 +1608,8 @@ static 
REbool processCharSet(REState *pState, RENode *target) thisCh = 0xB; break; case 'c': - if (((src + 1) < end) && RE_ISLETTER(src[1])) - thisCh = (REchar)(*src++ & 0x1F); + if (((src + 1) < end) && JS_ISWORD(src[1])) + thisCh = (jschar)(*src++ & 0x1F); else { --src; thisCh = '\\'; @@ -1272,51 +1621,82 @@ static REbool processCharSet(REState *pState, RENode *target) case 'u': nDigits = 4; lexHex: - { - REuint32 n = 0; - for (i = 0; (i < nDigits) && (src < end); i++) { - REuint32 digit; - c = *src++; - if (!isASCIIHexDigit(c, &digit)) { - /* back off to accepting the original '\' - * as a literal - */ - src -= (i + 1); - n = '\\'; - break; - } - n = (n << 4) | digit; + n = 0; + for (i = 0; (i < nDigits) && (src < end); i++) { + uintN digit; + c = *src++; + if (!isASCIIHexDigit(c, &digit)) { + /* + * Back off to accepting the original '\' + * as a literal + */ + src -= (i + 1); + n = '\\'; + break; } - thisCh = (REchar)(n); + n = (n << 4) | digit; } + thisCh = (jschar)(n); break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + /* + * This is a non-ECMA extension - decimal escapes (in this + * case, octal!) are supposed to be an error inside class + * ranges, but supported here for backwards compatibility. 
+ * + */ + n = JS7_UNDEC(c); + c = *src; + if ('0' <= c && c <= '7') { + src++; + n = 8 * n + JS7_UNDEC(c); + c = *src; + if ('0' <= c && c <= '7') { + src++; + i = 8 * n + JS7_UNDEC(c); + if (i <= 0377) + n = i; + else + src--; + } + } + thisCh = (jschar)(n); + break; + case 'd': addCharacterRangeToCharSet(charSet, '0', '9'); continue; /* don't need range processing */ case 'D': addCharacterRangeToCharSet(charSet, 0, '0' - 1); - addCharacterRangeToCharSet(charSet, (REchar)('9' + 1), - (REchar)(charSet->length - 1)); + addCharacterRangeToCharSet(charSet, (jschar)('9' + 1), + (jschar)(charSet->length)); continue; case 's': - for (i = (REint32)(charSet->length - 1); i >= 0; i--) - if (RE_ISSPACE(i)) - addCharacterToCharSet(charSet, (REchar)(i)); + for (i = (intN)(charSet->length); i >= 0; i--) + if (JS_ISSPACE(i)) + addCharacterToCharSet(charSet, (jschar)(i)); continue; case 'S': - for (i = (REint32)(charSet->length - 1); i >= 0; i--) - if (!RE_ISSPACE(i)) - addCharacterToCharSet(charSet, (REchar)(i)); + for (i = (intN)(charSet->length); i >= 0; i--) + if (!JS_ISSPACE(i)) + addCharacterToCharSet(charSet, (jschar)(i)); continue; case 'w': - for (i = (REint32)(charSet->length - 1); i >= 0; i--) - if (RE_ISLETDIG(i)) - addCharacterToCharSet(charSet, (REchar)(i)); + for (i = (intN)(charSet->length); i >= 0; i--) + if (JS_ISWORD(i)) + addCharacterToCharSet(charSet, (jschar)(i)); continue; case 'W': - for (i = (REint32)(charSet->length - 1); i >= 0; i--) - if (!RE_ISLETDIG(i)) - addCharacterToCharSet(charSet, (REchar)(i)); + for (i = (intN)(charSet->length); i >= 0; i--) + if (!JS_ISWORD(i)) + addCharacterToCharSet(charSet, (jschar)(i)); continue; default: thisCh = c; @@ -1331,9 +1711,9 @@ lexHex: } if (inRange) { - if (pState->flags & RE_IGNORECASE) { - REchar minch = (REchar)65535; - REchar maxch = 0; + if (gData->regexp->flags & JSREG_FOLD) { + jschar minch = (jschar)65535; + jschar maxch = 0; /* yuk @@ -1354,750 +1734,785 @@ lexHex: } else 
addCharacterRangeToCharSet(charSet, rangeStart, thisCh); - inRange = RE_FALSE; + inRange = JS_FALSE; } else { - if (pState->flags & RE_IGNORECASE) + if (gData->regexp->flags & JSREG_FOLD) addCharacterToCharSet(charSet, canonicalize(thisCh)); addCharacterToCharSet(charSet, thisCh); if (src < (end - 1)) { if (*src == '-') { ++src; - inRange = RE_TRUE; + inRange = JS_TRUE; rangeStart = thisCh; } } } } - return RE_TRUE; + return JS_TRUE; } +void +js_DestroyRegExp(JSContext *cx, JSRegExp *re) +{ + uintN i; + if (re->classList) { + for (i = 0; i < re->classCount; i++) { + if (re->classList[i].converted) + JS_free(cx, re->classList[i].u.bits); + re->classList[i].u.bits = NULL; + } + JS_free(cx, re->classList); + } +} + +static JSBool +reallocStateStack(REGlobalData *gData) +{ + size_t sz = sizeof(REProgState) * gData->maxStateStack; + gData->maxStateStack <<= 1; + gData->stateStack + = (REProgState *)realloc(gData->stateStack, sz + sz); + if (!gData->stateStack) { + gData->ok = JS_FALSE; + return JS_FALSE; + } + return JS_TRUE; +} /* - Initialize the character set if it this is the first call. - Test the bit - if the ^ flag was specified, non-inclusion is a success +* Apply the current op against the given input to see if +* it's going to match or fail. Return false if we don't +* get a match, true if we do and update the state of the +* input and pc if the update flag is true. 
*/ -static REMatchState *classMatcher(REGlobalData *gData, REMatchState *x, REint32 index) +static REMatchState *simpleMatch(REGlobalData *gData, REMatchState *x, + REOp op, jsbytecode **startpc, JSBool update) { - REchar ch; - RECharSet *charSet; - REint32 byteIndex; - REint32 e = x->endIndex; - if (e == gData->length) - return NULL; - -/* - if (target->data.chclass.charSet->bits == NULL) { - if (!processCharSet(((globalData->regexp->flags & IGNORECASE) != 0), target)) - return NULL; - } - charSet = target->data.chclass.charSet; -*/ - charSet = &gData->regexp->classList[index]; - - ch = gData->input[e]; - byteIndex = ch / 8; - if (charSet->sense) { - if ((charSet->length == 0) || - ( (ch > charSet->length) - || ((charSet->bits[byteIndex] & (1 << (ch & 0x7))) == 0) )) - return NULL; - } - else { - if (! ((charSet->length == 0) || - ( (ch > charSet->length) - || ((charSet->bits[byteIndex] & (1 << (ch & 0x7))) == 0) ))) - return NULL; - } - - if (charSet->length) /* match empty character */ - x->endIndex++; - return x; -} - - - -/* -1. Evaluate DecimalEscape to obtain an EscapeValue E. -2. If E is not a character then go to step 6. -3. Let ch be E's character. -4. Let A be a one-element RECharSet containing the character ch. -5. Call CharacterSetMatcher(A, RE_FALSE) and return its Matcher result. -6. E must be an integer. Let n be that integer. -7. If n=0 or n>NCapturingParens then throw a SyntaxError exception. -8. Return an internal Matcher closure that takes two arguments, a State x - and a Continuation c, and performs the following: - 1. Let cap be x's captures internal array. - 2. Let s be cap[n]. - 3. If s is undefined, then call c(x) and return its result. - 4. Let e be x's endIndex. - 5. Let len be s's length. - 6. Let f be e+len. - 7. If f>InputLength, return failure. - 8. If there exists an integer i between 0 (inclusive) and len (exclusive) - such that Canonicalize(s[i]) is not the same character as - Canonicalize(Input [e+i]), then return failure. - 9. 
Let y be the State (f, cap). - 10. Call c(y) and return its result. -*/ - -static REMatchState *backrefMatcher(REGlobalData *gData, - REMatchState *x, REuint32 parenIndex) -{ - REuint32 e; - REuint32 len; - REint32 f; - REuint32 i; - const REchar *parenContent; - RECapture *s = &x->parens[parenIndex]; - if (s->index == -1) - return x; - - e = (REuint32)x->endIndex; - len = (REuint32)s->length; - f = (REint32)(e + len); - if (f > gData->length) - return NULL; - - parenContent = &gData->input[s->index]; - if (gData->regexp->flags & RE_IGNORECASE) { - for (i = 0; i < len; i++) { - if (canonicalize(parenContent[i]) - != canonicalize(gData->input[e + i])) - return NULL; - } - } - else { - for (i = 0; i < len; i++) { - if (parenContent[i] != gData->input[e + i]) - return NULL; - } - } - x->endIndex = f; - return x; -} - -/* - * free memory the RENode t and it's children - */ -static void freeRENode(RENode *t) -{ - RENode *n; - while (t) { - switch (t->kind) { - case REOP_ALT: - freeRENode((RENode *)(t->child)); - freeRENode((RENode *)(t->data.child2)); - break; - case REOP_QUANT: - freeRENode((RENode *)(t->child)); - break; - case REOP_PAREN: - freeRENode((RENode *)(t->child)); - break; - case REOP_ASSERT: - freeRENode((RENode *)(t->child)); - break; - case REOP_ASSERTNOT: - freeRENode((RENode *)(t->child)); - break; - } - n = t->next; - free(t); - t = n; - } -} - -#define ARG_LEN (2) -#define CHECK_RANGE(branch, target) (ASSERT((((target) - (branch)) >= -32768) && (((target) - (branch)) <= 32767))) -#define EMIT_ARG(pc, a) ((pc)[0] = (REuint8)((a) >> 8), (pc)[1] = (REuint8)(a), (pc) += ARG_LEN) -#define EMIT_BRANCH(pc) ((pc) += ARG_LEN) -#define EMIT_FIXUP(branch, target) (EMIT_ARG((branch), (target) - (branch))) -#define GET_ARG(pc) ((REuint32)(((pc)[0] << 8) | (pc)[1])) - -static REuint8 *emitREBytecode(REState *pState, REuint8 *pc, RENode *t) -{ - RENode *nextAlt; - REuint8 *nextAltFixup, *nextTermFixup; - - while (t) { - *pc++ = (REuint8)(t->kind); - switch 
(t->kind) { - case REOP_EMPTY: - --pc; - break; - case REOP_ALT: - nextAlt = (RENode *)(t->data.child2); - nextAltFixup = pc; - EMIT_BRANCH(pc); /* address of next alternate */ - pc = emitREBytecode(pState, pc, (RENode *)(t->child)); - *pc++ = REOP_ENDALT; - nextTermFixup = pc; - EMIT_BRANCH(pc); /* address of following term */ - CHECK_RANGE(nextAltFixup, pc); - EMIT_FIXUP(nextAltFixup, pc); - pc = emitREBytecode(pState, pc, nextAlt); - - *pc++ = REOP_ENDALT; - nextAltFixup = pc; - EMIT_BRANCH(pc); - - CHECK_RANGE(nextTermFixup, pc); - EMIT_FIXUP(nextTermFixup, pc); - - CHECK_RANGE(nextAltFixup, pc); - EMIT_FIXUP(nextAltFixup, pc); - break; - case REOP_FLAT: - if (t->child && (t->data.flat.length > 1)) { - if (pState->flags & RE_IGNORECASE) - pc[-1] = REOP_FLATNi; - else - pc[-1] = REOP_FLATN; - EMIT_ARG(pc, (REchar *)(t->child) - pState->srcStart); - EMIT_ARG(pc, t->data.flat.length); - } - else { /* XXX original Monkey code separated ASCII and Unicode cases to save extra byte */ - if (pState->flags & RE_IGNORECASE) - pc[-1] = REOP_FLAT1i; - else - pc[-1] = REOP_FLAT1; - EMIT_ARG(pc, t->data.flat.ch); - } - break; - case REOP_PAREN: - EMIT_ARG(pc, t->parenIndex); - pc = emitREBytecode(pState, pc, (RENode *)(t->child)); - *pc++ = REOP_CLOSEPAREN; - EMIT_ARG(pc, t->parenIndex); - break; - case REOP_BACKREF: - EMIT_ARG(pc, t->parenIndex); - break; - case REOP_ASSERT: - nextTermFixup = pc; - EMIT_BRANCH(pc); - pc = emitREBytecode(pState, pc, (RENode *)(t->child)); - *pc++ = REOP_ASSERTTEST; - CHECK_RANGE(nextTermFixup, pc); - EMIT_FIXUP(nextTermFixup, pc); - break; - case REOP_ASSERTNOT: - nextTermFixup = pc; - EMIT_BRANCH(pc); - pc = emitREBytecode(pState, pc, (RENode *)(t->child)); - *pc++ = REOP_ASSERTNOTTEST; - CHECK_RANGE(nextTermFixup, pc); - EMIT_FIXUP(nextTermFixup, pc); - break; - case REOP_QUANT: - if ((t->data.quantifier.min == 0) && (t->data.quantifier.max == -1)) - pc[-1] = (REuint8)((t->data.quantifier.greedy) - ? 
REOP_STAR : REOP_MINIMALSTAR); - else - if ((t->data.quantifier.min == 0) && (t->data.quantifier.max == 1)) - pc[-1] = (REuint8)((t->data.quantifier.greedy) - ? REOP_OPT : REOP_MINIMALOPT); - else - if ((t->data.quantifier.min == 1) && (t->data.quantifier.max == -1)) - pc[-1] = (REuint8)((t->data.quantifier.greedy) - ? REOP_PLUS : REOP_MINIMALPLUS); - else { - if (!t->data.quantifier.greedy) pc[-1] = REOP_MINIMALQUANT; - EMIT_ARG(pc, t->data.quantifier.min); - EMIT_ARG(pc, t->data.quantifier.max); - } - EMIT_ARG(pc, t->data.quantifier.parenCount); - EMIT_ARG(pc, t->parenIndex); - nextTermFixup = pc; - EMIT_BRANCH(pc); - pc = emitREBytecode(pState, pc, (RENode *)(t->child)); - *pc++ = REOP_ENDCHILD; - CHECK_RANGE(nextTermFixup, pc); - EMIT_FIXUP(nextTermFixup, pc); - break; - case REOP_CLASS: - EMIT_ARG(pc, t->data.chclass.classIndex); - processCharSet(pState, t); - break; - } - t = t->next; - } - return pc; -} - -static REBackTrackData *pushBackTrackState(REGlobalData *gData, REOp op, - REuint8 *target, REMatchState *x) -{ - REBackTrackData *result; - if (backTrackStackTop == maxBackTrack) { - maxBackTrack <<= 1; - backTrackStack = (REBackTrackData *)realloc(backTrackStack, - sizeof(REBackTrackData) * maxBackTrack); - if (!backTrackStack) { - reportRegExpError(&gData->error, RE_OUT_OF_MEMORY); - return NULL; - } - } - result = &backTrackStack[backTrackStackTop++]; - result->continuation.op = op; - result->continuation.pc = target; - result->state = copyState(x); - result->lastParen = gData->lastParen; - - result->precedingStateTop = (REint32)stateStackTop; - if (stateStackTop) { - result->precedingState = (REProgState *)malloc(sizeof(REProgState) - * stateStackTop); - if (!result->precedingState) { - reportRegExpError(&gData->error, RE_OUT_OF_MEMORY); - return NULL; - } - memcpy(result->precedingState, stateStack, sizeof(REProgState) - * stateStackTop); - } - else - result->precedingState = NULL; - - return result; -} - -static REMatchState 
*executeREBytecode(REuint8 *pc, REGlobalData *gData, REMatchState *x) -{ - REOp op = (REOp)(*pc++); - REContinuationData currentContinuation; REMatchState *result = NULL; - REBackTrackData *backTrackData; - REint32 k, length, offset, index; - REuint32 parenIndex; - REbool anchor = RE_FALSE; - REchar anchorCh = 0; - REchar matchCh; - REuint8 *nextpc; - REOp nextop; + jschar matchCh; + intN parenIndex; + intN offset, length, index; + jsbytecode *pc = *startpc; /* pc has already been incremented past op */ + const jschar *source; + const jschar *startcp = x->cp; + jschar ch; + RECharSet *charSet; - currentContinuation.pc = NULL; - currentContinuation.op = REOP_END; + + switch (op) { + default: + JS_ASSERT(JS_FALSE); + case REOP_BOL: + if (x->cp != gData->cpbegin) { + if (gData->regExpStatics->multiline || + (gData->regexp->flags & JSREG_MULTILINE)) { + if (!RE_IS_LINE_TERM(x->cp[-1])) + break; + } + else + break; + } + result = x; + break; + case REOP_EOL: + if (x->cp != gData->cpend) { + if (gData->regExpStatics->multiline || + (gData->regexp->flags & JSREG_MULTILINE)) { + if (!RE_IS_LINE_TERM(*x->cp)) + break; + } + else + break; + } + result = x; + break; + case REOP_WBDRY: + if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) + ^ !((x->cp != gData->cpend) && JS_ISWORD(*x->cp))) + result = x; + break; + case REOP_WNONBDRY: + if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) + ^ ((x->cp != gData->cpend) && JS_ISWORD(*x->cp))) + result = x; + break; + case REOP_DOT: + if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_DIGIT: + if (x->cp != gData->cpend && JS_ISDIGIT(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_NONDIGIT: + if (x->cp != gData->cpend && !JS_ISDIGIT(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_ALNUM: + if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_NONALNUM: + if (x->cp != gData->cpend && 
!JS_ISWORD(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_SPACE: + if (x->cp != gData->cpend && JS_ISSPACE(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_NONSPACE: + if (x->cp != gData->cpend && !JS_ISSPACE(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_BACKREF: + parenIndex = GET_ARG(pc); + pc += ARG_LEN; + result = backrefMatcher(gData, x, parenIndex); + break; + case REOP_FLAT: + offset = GET_ARG(pc); + pc += ARG_LEN; + length = GET_ARG(pc); + pc += ARG_LEN; + source = JSSTRING_CHARS(gData->regexp->source) + offset; + if ((x->cp + length) <= gData->cpend) { + for (index = 0; index < length; index++) { + if (source[index] != x->cp[index]) + return NULL; + } + x->cp += length; + result = x; + } + break; + case REOP_FLAT1: + matchCh = *pc++; + if ((x->cp != gData->cpend) && (*x->cp == matchCh)) { + result = x; + result->cp++; + } + break; + case REOP_FLATi: + offset = GET_ARG(pc); + pc += ARG_LEN; + length = GET_ARG(pc); + pc += ARG_LEN; + source = JSSTRING_CHARS(gData->regexp->source); + result = flatNIMatcher(gData, x, source + offset, length); + break; + case REOP_FLAT1i: + matchCh = *pc++; + if ((x->cp != gData->cpend) + && (canonicalize(*x->cp) == canonicalize(matchCh))) { + result = x; + result->cp++; + } + break; + case REOP_UCFLAT1: + matchCh = GET_ARG(pc); + pc += ARG_LEN; + if ((x->cp != gData->cpend) && (*x->cp == matchCh)) { + result = x; + result->cp++; + } + break; + case REOP_UCFLAT1i: + matchCh = GET_ARG(pc); + pc += ARG_LEN; + if ((x->cp != gData->cpend) + && (canonicalize(*x->cp) == canonicalize(matchCh))) { + result = x; + result->cp++; + } + break; + case REOP_CLASS: + index = GET_ARG(pc); + pc += ARG_LEN; + if (x->cp != gData->cpend) { + charSet = &gData->regexp->classList[index]; + JS_ASSERT(charSet->converted); + ch = *x->cp; + index = ch / 8; + if ((charSet->length != 0) && + ( (ch <= charSet->length) + && ((charSet->u.bits[index] & (1 << (ch & 0x7))) != 0) )) { + result = x; + 
result->cp++; + } + } + break; + case REOP_NCLASS: + index = GET_ARG(pc); + pc += ARG_LEN; + if (x->cp != gData->cpend) { + charSet = &gData->regexp->classList[index]; + JS_ASSERT(charSet->converted); + ch = *x->cp; + index = ch / 8; + if ((charSet->length == 0) || + ( (ch > charSet->length) + || ((charSet->u.bits[index] & (1 << (ch & 0x7))) == 0) )) { + result = x; + result->cp++; + } + } + break; + } + if (result != NULL) { + if (update) + *startpc = pc; + else + x->cp = startcp; + return result; + } + x->cp = startcp; + return NULL; +} + +static REMatchState * +executeREBytecode(REGlobalData *gData, REMatchState *x) +{ + REMatchState *result; + REBackTrackData *backTrackData; + intN offset; + jsbytecode *nextpc; + REOp nextop; + RECapture *cap; + REProgState *curState; + const jschar *startcp; + uintN parenIndex, k; + uintN parenSoFar = 0; + + jschar matchCh1, matchCh2; + RECharSet *charSet; + + JSBool anchor; + jsbytecode *pc = gData->regexp->program; + REOp op = (REOp)(*pc++); /* - * If the first node is a literal match, step the index into + * If the first node is a simple match, step the index into * the string until that match is made, or fail if it can't be * found at all. 
*/ - switch (op) { - case REOP_FLAT1: - case REOP_FLAT1i: - anchorCh = (REchar)GET_ARG(pc); - anchor = RE_TRUE; - break; - case REOP_FLATN: - case REOP_FLATNi: - k = (REint32)GET_ARG(pc); - anchorCh = gData->regexp->srcStart[k]; - anchor = RE_TRUE; - break; - } - if (anchor) { - anchor = RE_FALSE; - for (k = x->endIndex; k < gData->length; k++) { - matchCh = gData->input[k]; - if ((matchCh == anchorCh) || - ((gData->regexp->flags & RE_IGNORECASE) - && (canonicalize(matchCh) == canonicalize(anchorCh)))) { - x->endIndex = k; - x->startIndex = k; /* inform caller that we bumped along */ - anchor = RE_TRUE; + if (REOP_IS_SIMPLE(op)) { + anchor = JS_FALSE; + while (x->cp <= gData->cpend) { + nextpc = pc; /* reset back to start each time */ + result = simpleMatch(gData, x, op, &nextpc, JS_TRUE); + if (result) { + anchor = JS_TRUE; + x = result; + pc = nextpc; /* accept skip to next opcode */ + op = (REOp)(*pc++); break; } + else { + gData->skipped++; + x->cp++; + } } if (!anchor) return NULL; } - while (RE_TRUE) { - switch (op) { - case REOP_EMPTY: - result = x; - break; - case REOP_BOL: - result = bolMatcher(gData, x); - break; - case REOP_EOL: - result = eolMatcher(gData, x); - break; - case REOP_WBND: - result = wbndMatcher(gData, x, RE_TRUE); - break; - case REOP_UNWBND: - result = wbndMatcher(gData, x, RE_FALSE); - break; - case REOP_DOT: - result = dotMatcher(gData, x); - break; - case REOP_DEC: - result = decMatcher(gData, x, RE_TRUE); - break; - case REOP_UNDEC: - result = decMatcher(gData, x, RE_FALSE); - break; - case REOP_WS: - result = wsMatcher(gData, x, RE_TRUE); - break; - case REOP_UNWS: - result = wsMatcher(gData, x, RE_FALSE); - break; - case REOP_LETDIG: - result = letdigMatcher(gData, x, RE_TRUE); - break; - case REOP_UNLETDIG: - result = letdigMatcher(gData, x, RE_FALSE); - break; - case REOP_FLATN: - offset = (REint32)GET_ARG(pc); - pc += ARG_LEN; - length = (REint32)GET_ARG(pc); - pc += ARG_LEN; - result = flatNMatcher(gData, x, 
gData->regexp->srcStart + offset, - length); - break; - case REOP_FLATNi: - offset = (REint32)GET_ARG(pc); - pc += ARG_LEN; - length = (REint32)GET_ARG(pc); - pc += ARG_LEN; - result = flatNIMatcher(gData, x, gData->regexp->srcStart + offset, - length); - break; - case REOP_FLAT1: - matchCh = (REchar)GET_ARG(pc); - pc += ARG_LEN; - result = flatMatcher(gData, x, matchCh); - break; - case REOP_FLAT1i: - matchCh = (REchar)GET_ARG(pc); - pc += ARG_LEN; - result = flatIMatcher(gData, x, matchCh); - break; - - case REOP_ALT: - nextpc = pc + GET_ARG(pc); - nextop = (REOp)(*nextpc++); - stateStack[stateStackTop].continuation = currentContinuation; - ++stateStackTop; - pushBackTrackState(gData, nextop, nextpc, x); - pc += ARG_LEN; - op = (REOp)(*pc++); - continue; - - case REOP_ENDALT: - --stateStackTop; - currentContinuation = stateStack[stateStackTop].continuation; - offset = (REint32)GET_ARG(pc); - pc += offset; - op = (REOp)(*pc++); - continue; - - - case REOP_PAREN: - parenIndex = GET_ARG(pc); - pc += ARG_LEN; - x->parens[parenIndex].index = x->endIndex; - x->parens[parenIndex].length = 0; - op = (REOp)(*pc++); - continue; - case REOP_CLOSEPAREN: - parenIndex = GET_ARG(pc); - pc += ARG_LEN; - x->parens[parenIndex].length = x->endIndex - - x->parens[parenIndex].index; - if ((REint32)parenIndex > gData->lastParen) - gData->lastParen = (REint32)parenIndex; - op = (REOp)(*pc++); - continue; - case REOP_BACKREF: - parenIndex = GET_ARG(pc); - pc += ARG_LEN; - result = backrefMatcher(gData, x, (uint32)parenIndex); - break; - - case REOP_ASSERT: - stateStack[stateStackTop].continuation = currentContinuation; - stateStack[stateStackTop].parenCount = backTrackStackTop; - stateStack[stateStackTop].index = x->endIndex; - ++stateStackTop; - if (!pushBackTrackState(gData, REOP_ASSERTTEST, - pc + GET_ARG(pc), x)) - return NULL; - pc += ARG_LEN; - op = (REOp)(*pc++); - continue; - case REOP_ASSERTNOT: - stateStack[stateStackTop].continuation = currentContinuation; - 
stateStack[stateStackTop].parenCount = backTrackStackTop; - stateStack[stateStackTop].index = x->endIndex; - ++stateStackTop; - if (!pushBackTrackState(gData, REOP_ASSERTNOTTEST, - pc + GET_ARG(pc), x)) - return NULL; - pc += ARG_LEN; - op = (REOp)(*pc++); - continue; - case REOP_ASSERTTEST: - --stateStackTop; - x->endIndex = stateStack[stateStackTop].index; - for (k = stateStack[stateStackTop].parenCount; - k < backTrackStackTop; k++) { - if (backTrackStack[k].precedingState) - free(backTrackStack[k].precedingState); - free(backTrackStack[k].state); - } - backTrackStackTop = stateStack[stateStackTop].parenCount; - currentContinuation = stateStack[stateStackTop].continuation; - if (result != NULL) + while (JS_TRUE) { + if (REOP_IS_SIMPLE(op)) + result = simpleMatch(gData, x, op, &pc, JS_TRUE); + else { + curState = &gData->stateStack[gData->stateStackTop]; + switch (op) { + case REOP_EMPTY: result = x; - break; - case REOP_ASSERTNOTTEST: - --stateStackTop; - x->endIndex = stateStack[stateStackTop].index; - for (k = stateStack[stateStackTop].parenCount; - k < backTrackStackTop; k++) { - if (backTrackStack[k].precedingState) - free(backTrackStack[k].precedingState); - free(backTrackStack[k].state); - } - backTrackStackTop = stateStack[stateStackTop].parenCount; - currentContinuation = stateStack[stateStackTop].continuation; - if (result == NULL) - result = x; - else - result = NULL; - break; - - case REOP_CLASS: - index = (int32)GET_ARG(pc); - pc += ARG_LEN; - result = classMatcher(gData, x, index); - if (gData->error != RE_NO_ERROR) return NULL; - break; - - case REOP_END: - if (x != NULL) - return x; - break; - - case REOP_STAR: - stateStack[stateStackTop].min = 0; - stateStack[stateStackTop].max = -1; - goto quantcommon; - case REOP_PLUS: - stateStack[stateStackTop].min = 1; - stateStack[stateStackTop].max = -1; - goto quantcommon; - case REOP_OPT: - stateStack[stateStackTop].min = 0; - stateStack[stateStackTop].max = 1; - goto quantcommon; - case REOP_QUANT: - 
stateStack[stateStackTop].min = (int32)GET_ARG(pc); - pc += ARG_LEN; - stateStack[stateStackTop].max = (int32)GET_ARG(pc); - pc += ARG_LEN; -quantcommon: - stateStack[stateStackTop].parenCount = (int32)GET_ARG(pc); - pc += ARG_LEN; - stateStack[stateStackTop].parenIndex = (int32)GET_ARG(pc); - pc += ARG_LEN; - stateStack[stateStackTop].index = x->endIndex; - stateStack[stateStackTop].continuation = currentContinuation; - ++stateStackTop; - currentContinuation.op = REOP_REPEAT; - currentContinuation.pc = pc; - if (!pushBackTrackState(gData, REOP_REPEAT, pc, x)) return NULL; - pc += ARG_LEN; - op = (REOp)(*pc++); - continue; - - case REOP_ENDCHILD: - pc = currentContinuation.pc; - op = currentContinuation.op; - continue; - - case REOP_REPEAT: - --stateStackTop; - if (result == NULL) { - /* - * There's been a failure, see if we have enough children - */ - currentContinuation = stateStack[stateStackTop].continuation; - if (stateStack[stateStackTop].min == 0) - result = x; - pc = pc + GET_ARG(pc); break; - } - else { - if ((stateStack[stateStackTop].min == 0) - && (x->endIndex == stateStack[stateStackTop].index)) { - /* matched an empty string, that'll get us nowhere */ - result = NULL; - currentContinuation = stateStack[stateStackTop].continuation; - pc = pc + GET_ARG(pc); - break; - } - if (stateStack[stateStackTop].min > 0) - stateStack[stateStackTop].min--; - if (stateStack[stateStackTop].max != -1) - stateStack[stateStackTop].max--; - if (stateStack[stateStackTop].max == 0) { - result = x; - currentContinuation = stateStack[stateStackTop].continuation; - pc = pc + GET_ARG(pc); - break; - } - stateStack[stateStackTop].index = x->endIndex; - ++stateStackTop; - currentContinuation.op = REOP_REPEAT; - currentContinuation.pc = pc; - if (!pushBackTrackState(gData, REOP_REPEAT, pc, x)) return NULL; + + case REOP_ALTPREREQ2: + nextpc = pc + GET_OFFSET(pc); /* start of next op */ pc += ARG_LEN; - op = (REOp)(*pc++); - parenIndex = (REuint32)stateStack[stateStackTop - 
1].parenIndex; - for (k = 0; k <= stateStack[stateStackTop - 1].parenCount; k++) - x->parens[parenIndex + k].index = -1; - } - continue; - - case REOP_MINIMALSTAR: - stateStack[stateStackTop].min = 0; - stateStack[stateStackTop].max = -1; - goto minimalquantcommon; - case REOP_MINIMALPLUS: - stateStack[stateStackTop].min = 1; - stateStack[stateStackTop].max = -1; - goto minimalquantcommon; - case REOP_MINIMALOPT: - stateStack[stateStackTop].min = 0; - stateStack[stateStackTop].max = 1; - goto minimalquantcommon; - case REOP_MINIMALQUANT: - stateStack[stateStackTop].min = (int32)GET_ARG(pc); - pc += ARG_LEN; - stateStack[stateStackTop].max = (int32)GET_ARG(pc); - pc += ARG_LEN; -minimalquantcommon: - stateStack[stateStackTop].parenCount = (int32)GET_ARG(pc); - pc += ARG_LEN; - stateStack[stateStackTop].parenIndex = (int32)GET_ARG(pc); - pc += ARG_LEN; - stateStack[stateStackTop].index = x->endIndex; - stateStack[stateStackTop].continuation = currentContinuation; - ++stateStackTop; - if (stateStack[stateStackTop - 1].min > 0) { - currentContinuation.op = REOP_MINIMALREPEAT; - currentContinuation.pc = pc; + matchCh2 = GET_ARG(pc); + pc += ARG_LEN; + k = GET_ARG(pc); pc += ARG_LEN; - op = (REOp)(*pc++); - } - else { - if (!pushBackTrackState(gData, REOP_MINIMALREPEAT, pc, x)) - return NULL; - --stateStackTop; - pc = pc + GET_ARG(pc); - op = (REOp)(*pc++); - } - continue; - case REOP_MINIMALREPEAT: - --stateStackTop; - currentContinuation = stateStack[stateStackTop].continuation; - - if (result == NULL) { - /* - * Non-greedy failure - try to consume another child - */ - if ((stateStack[stateStackTop].max == -1) - || (stateStack[stateStackTop].max > 0)) { - parenIndex = (REuint32)stateStack[stateStackTop].parenIndex; - for (k = 0; k <= stateStack[stateStackTop].parenCount; k++) - x->parens[parenIndex + k].index = -1; - stateStack[stateStackTop].index = x->endIndex; - stateStack[stateStackTop].continuation = currentContinuation; - ++stateStackTop; - currentContinuation.op 
= REOP_MINIMALREPEAT; - currentContinuation.pc = pc; - pc += ARG_LEN; - op = (REOp)(*pc++); - continue; + if (x->cp != gData->cpend) { + charSet = &gData->regexp->classList[k]; + if (!charSet->converted) + if (!processCharSet(gData, charSet)) + return JS_FALSE; + matchCh1 = *x->cp; + k = matchCh1 / 8; + if ((charSet->length != 0) && + ( (matchCh1 <= charSet->length) + && ((charSet->u.bits[k] + & (1 << (matchCh1 & 0x7))) != 0) )) { + result = NULL; + break; + } } else { - currentContinuation = stateStack[stateStackTop].continuation; - break; - } - } - else { - if ((stateStack[stateStackTop].min == 0) - && (x->endIndex == stateStack[stateStackTop].index)) { - /* matched an empty string, that'll get us nowhere */ result = NULL; - currentContinuation = stateStack[stateStackTop].continuation; break; } - if (stateStack[stateStackTop].min > 0) - stateStack[stateStackTop].min--; - if (stateStack[stateStackTop].max != -1) - stateStack[stateStackTop].max--; - if (stateStack[stateStackTop].min > 0) { - parenIndex = (REuint32)stateStack[stateStackTop].parenIndex; - for (k = 0; k <= stateStack[stateStackTop].parenCount; k++) - x->parens[parenIndex + k].index = -1; - stateStack[stateStackTop].index = x->endIndex; - ++stateStackTop; - currentContinuation.op = REOP_MINIMALREPEAT; - currentContinuation.pc = pc; - pc += ARG_LEN; - op = (REOp)(*pc++); - continue; + + if ((x->cp == gData->cpend) || (*x->cp != matchCh2)) { + result = NULL; + break; } - else { - stateStack[stateStackTop].index = x->endIndex; - ++stateStackTop; - if (!pushBackTrackState(gData, REOP_MINIMALREPEAT, pc, x)) + goto doAlt; + + case REOP_ALTPREREQ: + nextpc = pc + GET_OFFSET(pc); /* start of next op */ + pc += ARG_LEN; + matchCh1 = GET_ARG(pc); + pc += ARG_LEN; + matchCh2 = GET_ARG(pc); + pc += ARG_LEN; + if ((x->cp == gData->cpend) + || ((*x->cp != matchCh1) && (*x->cp != matchCh2))) { + result = NULL; + break; + } + /* else false thru... 
*/ + + case REOP_ALT: +doAlt: + nextpc = pc + GET_OFFSET(pc); /* start of next alternate */ + pc += ARG_LEN; /* start of this alternate */ + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) return NULL; - --stateStackTop; - pc = pc + GET_ARG(pc); + op = (REOp)(*pc++); + startcp = x->cp; + if (REOP_IS_SIMPLE(op)) { + if (!simpleMatch(gData, x, op, &pc, JS_TRUE)) { + op = (REOp)(*nextpc++); + pc = nextpc; + continue; + } + else { /* accept the match and move on */ + result = x; + op = (REOp)(*pc++); + } + } + nextop = (REOp)(*nextpc++); + if (!pushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0)) + return NULL; + continue; + + /* + * Occurs at (succesful) end of REOP_ALT, + */ + case REOP_JUMP: + --gData->stateStackTop; + offset = GET_OFFSET(pc); + pc += offset; + op = (REOp)(*pc++); + continue; + + /* + * Occurs at last (succesful) end of REOP_ALT, + */ + case REOP_ENDALT: + --gData->stateStackTop; + op = (REOp)(*pc++); + continue; + + case REOP_LPAREN: + parenIndex = GET_ARG(pc); + if ((parenIndex + 1) > parenSoFar) + parenSoFar = parenIndex + 1; + pc += ARG_LEN; + x->parens[parenIndex].index = x->cp - gData->cpbegin; + x->parens[parenIndex].length = 0; + op = (REOp)(*pc++); + continue; + case REOP_RPAREN: + parenIndex = GET_ARG(pc); + pc += ARG_LEN; + cap = &x->parens[parenIndex]; + cap->length = x->cp - (gData->cpbegin + cap->index); + op = (REOp)(*pc++); + continue; + + case REOP_ASSERT: + nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */ + pc += ARG_LEN; /* start of ASSERT child */ + op = (REOp)(*pc++); + if (REOP_IS_SIMPLE(op) + && !simpleMatch(gData, x, op, &pc, JS_FALSE)) { + result = NULL; + break; + } + else { + curState->u.assertion.top + = (char *)gData->backTrackSP + - (char *)gData->backTrackStack; + curState->u.assertion.sz = gData->cursz; + curState->index = x->cp - gData->cpbegin; + curState->parenSoFar = parenSoFar; + 
++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (!pushBackTrackState(gData, REOP_ASSERTTEST, + nextpc, x, x->cp, 0, 0)) + return NULL; + } + continue; + case REOP_ASSERT_NOT: + nextpc = pc + GET_OFFSET(pc); + pc += ARG_LEN; + op = (REOp)(*pc++); + if (REOP_IS_SIMPLE(op) + /* Note - fail to fail! */ + && simpleMatch(gData, x, op, &pc, JS_FALSE)) { + result = NULL; + break; + } + else { + curState->u.assertion.top + = (char *)gData->backTrackSP + - (char *)gData->backTrackStack; + curState->u.assertion.sz = gData->cursz; + curState->index = x->cp - gData->cpbegin; + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (!pushBackTrackState(gData, REOP_ASSERTNOTTEST, + nextpc, x, x->cp, 0, 0)) + return NULL; + } + continue; + case REOP_ASSERTTEST: + --gData->stateStackTop; + --curState; + x->cp = gData->cpbegin + curState->index; + gData->backTrackSP + = (REBackTrackData *)((char *)gData->backTrackStack + + curState->u.assertion.top); + gData->cursz = curState->u.assertion.sz; + if (result != NULL) + result = x; + break; + case REOP_ASSERTNOTTEST: + --gData->stateStackTop; + --curState; + x->cp = gData->cpbegin + curState->index; + gData->backTrackSP + = (REBackTrackData *)((char *)gData->backTrackStack + + curState->u.assertion.top); + gData->cursz = curState->u.assertion.sz; + if (result == NULL) + result = x; + else + result = NULL; + break; + + case REOP_END: + if (x != NULL) + return x; + break; + + case REOP_STAR: + curState->u.quantifier.min = 0; + curState->u.quantifier.max = -1; + goto quantcommon; + case REOP_PLUS: + curState->u.quantifier.min = 1; + curState->u.quantifier.max = -1; + goto quantcommon; + case REOP_OPT: + curState->u.quantifier.min = 0; + curState->u.quantifier.max = 1; + goto quantcommon; + case REOP_QUANT: + curState->u.quantifier.min = GET_ARG(pc); + 
pc += ARG_LEN; + curState->u.quantifier.max = GET_ARG(pc); + pc += ARG_LEN; +quantcommon: + if (curState->u.quantifier.max == 0) { + pc = pc + GET_OFFSET(pc); op = (REOp)(*pc++); + result = x; continue; } + /* Step over */ + nextpc = pc + ARG_LEN; + op = (REOp)(*nextpc++); + startcp = x->cp; + if (REOP_IS_SIMPLE(op)) { + if (!simpleMatch(gData, x, op, &nextpc, JS_TRUE)) { + if (curState->u.quantifier.min == 0) + result = x; + else + result = NULL; + pc = pc + GET_OFFSET(pc); + break; + } + else { + op = (REOp)(*nextpc++); + result = x; + } + } + curState->index = startcp - gData->cpbegin; + curState->continue_op = REOP_REPEAT; + curState->continue_pc = pc; + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (curState->u.quantifier.min == 0) + if (!pushBackTrackState(gData, REOP_REPEAT, + pc, x, startcp, 0, 0)) + return NULL; + pc = nextpc; + continue; + + case REOP_ENDCHILD: /* marks the end of a quantifier child */ + pc = curState[-1].continue_pc; + op = curState[-1].continue_op; + continue; + + case REOP_REPEAT: + --curState; +repeatAgain: + --gData->stateStackTop; + if (result == NULL) { + /* + * There's been a failure, see if we have enough children. 
+ */ + if (curState->u.quantifier.min == 0) { + result = x; + goto repeatDone; + } + break; + } + else { + if ((curState->u.quantifier.min == 0) + && (x->cp == gData->cpbegin + curState->index)) { + /* matched an empty string, that'll get us nowhere */ + result = NULL; + break; + } + if (curState->u.quantifier.min != 0) + curState->u.quantifier.min--; + if (curState->u.quantifier.max != (uint16)(-1)) + curState->u.quantifier.max--; + if (curState->u.quantifier.max == 0) { + result = x; + goto repeatDone; + } + nextpc = pc + ARG_LEN; + nextop = (REOp)(*nextpc); + startcp = x->cp; + if (REOP_IS_SIMPLE(nextop)) { + nextpc++; + if (!simpleMatch(gData, x, nextop, &nextpc, JS_TRUE)) { + if (curState->u.quantifier.min == 0) { + result = x; + goto repeatDone; + } + else + result = NULL; + break; + } + result = x; + } + curState->index = startcp - gData->cpbegin; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (curState->u.quantifier.min == 0) + if (!pushBackTrackState(gData, REOP_REPEAT, + pc, x, startcp, + curState->parenSoFar, + parenSoFar + - curState->parenSoFar)) + return NULL; + if (*nextpc == REOP_ENDCHILD) + goto repeatAgain; + pc = nextpc; + op = (REOp)(*pc++); + parenSoFar = curState->parenSoFar; + } + continue; +repeatDone: + pc = pc + GET_OFFSET(pc); + break; + + + case REOP_MINIMALSTAR: + curState->u.quantifier.min = 0; + curState->u.quantifier.max = -1; + goto minimalquantcommon; + case REOP_MINIMALPLUS: + curState->u.quantifier.min = 1; + curState->u.quantifier.max = -1; + goto minimalquantcommon; + case REOP_MINIMALOPT: + curState->u.quantifier.min = 0; + curState->u.quantifier.max = 1; + goto minimalquantcommon; + case REOP_MINIMALQUANT: + curState->u.quantifier.min = GET_ARG(pc); + pc += ARG_LEN; + curState->u.quantifier.max = GET_ARG(pc); + pc += ARG_LEN; +minimalquantcommon: + curState->index = x->cp - gData->cpbegin; + curState->parenSoFar = parenSoFar; + 
++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (curState->u.quantifier.min != 0) { + curState->continue_op = REOP_MINIMALREPEAT; + curState->continue_pc = pc; + /* step over */ + pc += ARG_LEN; + op = (REOp)(*pc++); + } + else { + if (!pushBackTrackState(gData, REOP_MINIMALREPEAT, + pc, x, x->cp, 0, 0)) + return NULL; + --gData->stateStackTop; + pc = pc + GET_OFFSET(pc); + op = (REOp)(*pc++); + } + continue; + + case REOP_MINIMALREPEAT: + --gData->stateStackTop; + --curState; + + if (result == NULL) { + /* + * Non-greedy failure - try to consume another child. + */ + if ((curState->u.quantifier.max == (uint16)(-1)) + || (curState->u.quantifier.max > 0)) { + curState->index = x->cp - gData->cpbegin; + curState->continue_op = REOP_MINIMALREPEAT; + curState->continue_pc = pc; + pc += ARG_LEN; + for (k = curState->parenSoFar; k < parenSoFar; k++) + x->parens[k].index = -1; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + op = (REOp)(*pc++); + continue; + } + else { + /* Don't need to adjust pc since we're going to pop. */ + break; + } + } + else { + if ((curState->u.quantifier.min == 0) + && (x->cp == gData->cpbegin + curState->index)) { + /* Matched an empty string, that'll get us nowhere. 
*/ + result = NULL; + break; + } + if (curState->u.quantifier.min != 0) + curState->u.quantifier.min--; + if (curState->u.quantifier.max != (uint16)(-1)) + curState->u.quantifier.max--; + if (curState->u.quantifier.min != 0) { + curState->continue_op = REOP_MINIMALREPEAT; + curState->continue_pc = pc; + pc += ARG_LEN; + for (k = curState->parenSoFar; k < parenSoFar; k++) + x->parens[k].index = -1; + curState->index = x->cp - gData->cpbegin; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + op = (REOp)(*pc++); + continue; + } + else { + curState->index = x->cp - gData->cpbegin; + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (!pushBackTrackState(gData, REOP_MINIMALREPEAT, + pc, x, x->cp, + curState->parenSoFar, + parenSoFar + - curState->parenSoFar)) + return NULL; + --gData->stateStackTop; + pc = pc + GET_OFFSET(pc); + op = (REOp)(*pc++); + continue; + } + } + + default: + JS_ASSERT(JS_FALSE); + } - - - } /* * If the match failed and there's a backtrack option, take it. - * Otherwise this is a match failure. + * Otherwise this is a complete and utter failure. 
*/ if (result == NULL) { - if (backTrackStackTop > 0) { - backTrackStackTop--; - backTrackData = &backTrackStack[backTrackStackTop]; - - gData->lastParen = backTrackData->lastParen; - - recoverState(x, backTrackData->state); - free(backTrackData->state); - - for (k = 0; k < backTrackData->precedingStateTop; k++) { - stateStack[k] = backTrackData->precedingState[k]; - } - stateStackTop = (REuint32)backTrackData->precedingStateTop; - if (backTrackData->precedingState) - free(backTrackData->precedingState); - - if (stateStackTop > 0) - currentContinuation = stateStack[stateStackTop - 1].continuation; + if (gData->cursz > 0) { + backTrackData = gData->backTrackSP; + gData->cursz = backTrackData->sz; + gData->backTrackSP + = (REBackTrackData *)((char *)backTrackData + - backTrackData->sz); + x->cp = backTrackData->cp; + pc = backTrackData->backtrack_pc; + op = backTrackData->backtrack_op; + gData->stateStackTop = backTrackData->precedingStateTop; + JS_ASSERT(gData->stateStackTop); - pc = backTrackData->continuation.pc; - op = backTrackData->continuation.op; + memcpy(gData->stateStack, backTrackData + 1, + sizeof(REProgState) * backTrackData->precedingStateTop); + curState = &gData->stateStack[gData->stateStackTop - 1]; + + if (backTrackData->parenCount) { + memcpy(&x->parens[backTrackData->parenIndex], + (char *)(backTrackData + 1) + sizeof(REProgState) * backTrackData->precedingStateTop, + sizeof(RECapture) * backTrackData->parenCount); + parenSoFar = backTrackData->parenIndex + backTrackData->parenCount; + } + else { + for (k = curState->parenSoFar; k < parenSoFar; k++) + x->parens[k].index = -1; + parenSoFar = curState->parenSoFar; + } continue; } else @@ -2107,164 +2522,81 @@ minimalquantcommon: x = result; /* - * Continue with the expression. If this the end of the child, use - * the current continuation. + * Continue with the expression. 
*/ op = (REOp)*pc++; - if (op == REOP_ENDCHILD) { - pc = currentContinuation.pc; - op = currentContinuation.op; - } } return NULL; } -/* - * Throw away the RegExp and all data associated with it. - */ -void REfreeRegExp(REState *pState) -{ - REuint32 i; - if (pState->result) freeRENode(pState->result); - if (pState->pc) free(pState->pc); - for (i = 0; i < pState->classCount; i++) { - free(pState->classList[i].bits); - } - if (pState->srcStart) free(pState->srcStart); - free(pState->classList); - free(pState); -} - -RE_Error parseFlags(const REchar *flagsSource, REint32 flagsLength, REuint32 *flags) -{ - REint32 i; - *flags = 0; - - for (i = 0; i < flagsLength; i++) { - switch (flagsSource[i]) { - case 'g': - *flags |= RE_GLOBAL; break; - case 'i': - *flags |= RE_IGNORECASE; break; - case 'm': - *flags |= RE_MULTILINE; break; - default: - return RE_BAD_FLAG; - } - } - return RE_NO_ERROR; -} - -/* -* Parse the regexp - errors are reported via the registered error function -* and NULL is returned. Otherwise the regexp is compiled and the completed -* ParseState returned. 
-*/ -REState *REParse(const REchar *source, REint32 sourceLength, - REuint32 flags, RE_Version version) -{ - REuint8 *endPC; - RENode *t; - REState *pState = (REState *)malloc(sizeof(REState)); - if (!pState) return NULL; - pState->srcStart = (REchar *)malloc(sizeof(REchar) * sourceLength); - if (!pState->srcStart) goto fail; - memcpy(pState->srcStart, source, sizeof(REchar) * sourceLength); - pState->srcEnd = pState->srcStart + sourceLength; - pState->src = pState->srcStart; - pState->parenCount = 0; - pState->flags = flags; - pState->version = version; - pState->classList = NULL; - pState->classCount = 0; - pState->codeLength = 0; - - if (parseDisjunction(pState)) { - t = pState->result; - if (t) { - while (t->next) t = t->next; - t->next = newRENode(pState, REOP_END); - if (!t->next) - goto fail; - } - else - pState->result = newRENode(pState, REOP_END); - if (pState->classCount) { - pState->classList = (RECharSet *)malloc(sizeof(RECharSet) - * pState->classCount); - if (!pState->classList) goto fail; - } - pState->pc = (REuint8 *)malloc(sizeof(REuint8) * pState->codeLength + 1); - if (!pState->pc) goto fail; - endPC = emitREBytecode(pState, pState->pc, pState->result); - freeRENode(pState->result); - pState->result = NULL; - ASSERT(endPC <= (pState->pc + (pState->codeLength + 1))); - return pState; - } -fail: - if (pState->srcStart) free(pState->srcStart); - if (pState->classList) free(pState->classList); - free(pState); - return NULL; -} - -static REMatchState *initMatch(REGlobalData *gData, REState *pState, - const REchar *text, REint32 length, int globalMultiline) +static REMatchState * +MatchRegExp(REGlobalData *gData, REMatchState *x) { REMatchState *result; - REint32 j; + const jschar *cp = x->cp; + const jschar *cp2; + uintN j; - if (!backTrackStack) { - maxBackTrack = INITIAL_BACKTRACK; - backTrackStack = (REBackTrackData *)malloc(sizeof(REBackTrackData) - * maxBackTrack); - if (!backTrackStack) { - reportRegExpError(&gData->error, RE_OUT_OF_MEMORY); - 
return NULL; - } - } - if (!stateStack) { - maxStateStack = INITIAL_STATESTACK; - stateStack = (REProgState *)malloc(sizeof(REProgState) - * maxStateStack); - if (!stateStack) { - reportRegExpError(&gData->error, RE_OUT_OF_MEMORY); - return NULL; - } + /* + * Have to include the position beyond the last character + * in order to detect end-of-input/line condition. + */ + for (cp2 = cp; cp2 <= gData->cpend; cp2++) { + gData->skipped = cp2 - cp; + x->cp = cp2; + for (j = 0; j < gData->regexp->parenCount; j++) + x->parens[j].index = -1; + result = executeREBytecode(gData, x); + if (!gData->ok || result) + return result; + gData->backTrackSP = gData->backTrackStack; + gData->cursz = 0; + gData->stateStackTop = 0; + cp2 = cp + gData->skipped; } + return NULL; +} + + +static REMatchState * +initMatch(JSContext *cx, REGlobalData *gData, JSRegExp *re) +{ + REMatchState *result; + uintN i; + + gData->maxBackTrack = INITIAL_BACKTRACK; + gData->backTrackStack = (REBackTrackData *)malloc(INITIAL_BACKTRACK); + + if (!gData->backTrackStack) + return NULL; + gData->backTrackSP = gData->backTrackStack; + gData->cursz = 0; + + + gData->maxStateStack = INITIAL_STATESTACK; + gData->stateStack = (REProgState *)malloc(sizeof(REProgState) * INITIAL_STATESTACK); + if (!gData->stateStack) + return NULL; + gData->stateStackTop = 0; + + gData->regexp = re; + gData->ok = JS_TRUE; result = (REMatchState *)malloc(sizeof(REMatchState) - + (pState->parenCount * sizeof(RECapture))); - if (!result) { - reportRegExpError(&gData->error, RE_OUT_OF_MEMORY); + + (re->parenCount - 1) * sizeof(RECapture)); + if (!result) return NULL; - } - result->parenCount = (REint32)pState->parenCount; - for (j = 0; j < result->parenCount; j++) - result->parens[j].index = -1; - result->startIndex = 0; - result->endIndex = 0; + for (i = 0; i < re->classCount; i++) + if (!re->classList[i].converted) + if (!processCharSet(gData, &re->classList[i])) + return NULL; - pState->error = RE_NO_ERROR; - - gData->regexp = pState; 
- gData->input = text; - gData->length = length; - gData->error = RE_NO_ERROR; - gData->lastParen = 0; - gData->globalMultiline = (REbool)globalMultiline; - - backTrackStackTop = 0; - stateStackTop = 0; return result; } -/* - * The [[Match]] implementation - * - */ +#if 0 +// Execute the re against the string, but don't try advancing into the string REMatchState *REMatch(REState *pState, const REchar *text, REint32 length) { REint32 j; @@ -2284,10 +2616,7 @@ REMatchState *REMatch(REState *pState, const REchar *text, REint32 length) return result; } -/* - * Execute the RegExp against the supplied text, filling in the REMatchState. - * - */ +// Execute the re against the string starting at the index, return NULL for failure REMatchState *REExecute(REState *pState, const REchar *text, REint32 offset, REint32 length, int globalMultiline) { REMatchState *result; @@ -2328,107 +2657,84 @@ REMatchState *REExecute(REState *pState, const REchar *text, REint32 offset, REi } return result; } - -#ifdef STANDALONE - -REchar *widen(char *str, int length) -{ - int i; - REchar *result = (REchar *)malloc(sizeof(REchar) * (length + 1)); - for (i = 0; i < length; i++) - result[i] = str[i]; - return result; -} - -REchar canonicalize(REchar ch) -{ - if ((ch >= 'a') && (ch <= 'z')) - return (ch - 'a') + 'A'; - else - return ch; -} - -int main(int argc, char* argv[]) -{ - char regexpInput[128]; - char *regexpSrc; - char str[128]; - REMatchState *result; - int regexpLength; - char *flagSrc; - int flagSrcLength; - REint32 i, j; - - printf("Delimit regexp by / / (with flags following) and strings by \" \"\n"); - while (RE_TRUE) { - REchar *regexpWideSrc; - REchar *flagWideSrc; - REState *pState; - - printf("regexp : "); - scanf("%s", regexpInput); - regexpSrc = regexpInput; - if (*regexpSrc != '/') - break; - regexpSrc++; - flagSrc = strrchr(regexpSrc, '/'); - if (flagSrc == NULL) - break; - regexpLength = flagSrc - regexpSrc; - if (flagSrc[1]) { - flagSrc++; - flagSrcLength = 
strlen(flagSrc); - } - else { - flagSrc = NULL; - flagSrcLength = 0; - } - - regexpWideSrc = widen(regexpSrc, regexpLength); - flagWideSrc = widen(flagSrc, flagSrcLength); - pState = REParse(regexpWideSrc, regexpLength, - flagWideSrc, flagSrcLength, RE_TRUE); - if (pState) { - while (RE_TRUE) { - printf("string : "); - scanf("%s", str); - if (*str != '"') - break; - else { - int strLength = strlen(str + 1) - 1; - REchar *widestr = widen(str + 1, strLength); - result = REExecute(pState, widestr, strLength); - if (result) { - printf("\""); - for (i = result->startIndex; i < result->endIndex; i++) - printf("%c", str[i + 1]); - printf("\""); - for (i = 0; i < result->n; i++) { - printf(","); - if (result->parens[i].index != -1) { - printf("\""); - for (j = 0; j < result->parens[i].length; j++) - printf("%c", str[j + 1 + result->parens[i].index]); - printf("\""); - } - else - printf("undefined"); - } - printf("\n"); - free(result); - } - else - printf("failed\n"); - free(widestr); - } - } - freeRegExp(pState); - } - else - printf("regexp failed to parse\n"); - free(regexpWideSrc); - free(flagWideSrc); - } - return 0; -} #endif +// Compile the flag source and build a flag bit set. 
Return true/false for success/failure +bool parseFlags(const jschar *flagStr, uint32 length, uint32 *flags) +{ + uint32 i; + *flags = 0; + for (i = 0; i < length; i++) { + switch (flagStr[i]) { + case 'g': + *flags |= JSREG_GLOB; break; + case 'i': + *flags |= JSREG_FOLD; break; + case 'm': + *flags |= JSREG_MULTILINE; break; + default: + return false; + } + } + return true; +} + +#define JS_HOWMANY(x,y) (((x)+(y)-1)/(y)) +#define JS_ROUNDUP(x,y) (JS_HOWMANY(x,y)*(y)) + +// Compile the source re, return NULL for failure (error functions called) +JSRegExp *RECompile(const jschar *str, uint32 length, uint32 flags) +{ + JSRegExp *re; + CompilerState state; + size_t resize; + jsbytecode *endPC; + uint32 i; + size_t len; + + re = NULL; + state.reNodePool = new Pool(32); + + state.cpbegin = state.cp = JSSTRING_CHARS(str); + state.cpend = state.cp + length; + state.flags = flags; + state.parenCount = 0; + state.classCount = 0; + state.progLength = 0; + state.treeDepth = 0; + for (i = 0; i < CLASS_CACHE_SIZE; i++) + state.classCache[i].start = NULL; + + len = length; + if (!parseRegExp(&state)) + goto out; + + resize = sizeof *re + state.progLength + 1; + re = (JSRegExp *) JS_malloc(cx, JS_ROUNDUP(resize, sizeof(uint32))); + if (!re) + goto out; + + re->classCount = state.classCount; + if (state.classCount) { + re->classList = (RECharSet *)JS_malloc(cx, sizeof(RECharSet) + * state.classCount); + if (!re->classList) + goto out; + } + else + re->classList = NULL; + endPC = emitREBytecode(&state, re, state.treeDepth, re->program, state.result); + if (!endPC) { + re = NULL; + goto out; + } + *endPC++ = REOP_END; + JS_ASSERT(endPC <= (re->program + (state.progLength + 1))); + + re->parenCount = state.parenCount; + re->flags = flags; + re->source = str; + +out: + delete state.reNodePool; + return re; +} \ No newline at end of file diff --git a/mozilla/js2/src/regexp/regexp.h b/mozilla/js2/src/regexp/regexp.h index 9be153fd6e7..735be67559a 100644 --- 
a/mozilla/js2/src/regexp/regexp.h +++ b/mozilla/js2/src/regexp/regexp.h @@ -31,144 +31,84 @@ * file under either the NPL or the GPL. */ -#ifdef __GNUC__ - /* GCC's wchar_t is 32 bits, so we can't use it. */ - typedef uint16 char16; - typedef uint16 uchar16; -#else - typedef wchar_t char16; - typedef wchar_t uchar16; -#endif -typedef char16 REchar; - -typedef unsigned int REuint32; -typedef int REint32; -typedef unsigned char REuint8; +/* + * This struct holds a bitmap representation of a class from a regexp. + * There's a list of these referenced by the classList field in the JSRegExp + * struct below. The initial state has startIndex set to the offset in the + * original regexp source of the beginning of the class contents. The first + * use of the class converts the source representation into a bitmap. + * + */ -typedef enum RE_Flags { - RE_IGNORECASE = 0x1, - RE_GLOBAL = 0x2, - RE_MULTILINE = 0x4 -} RE_Flags; +typedef uint32 jsint; +typedef char16 jschar; +typedef bool JSBool; +typedef uint32 uintN; +typedef int32 intN; +typedef uint8 jsbytecode; +typedef char16 JSString; +typedef char16 JSSubString; -typedef enum RE_Version { - RE_VERSION_1, /* octal literal support */ - RE_VERSION_2 -} RE_Version; - -typedef enum RE_Error { - RE_NO_ERROR, - RE_TRAILING_SLASH, /* a backslash just before the end of the RE */ - RE_UNCLOSED_PAREN, /* mis-matched parens */ - RE_UNCLOSED_BRACKET, /* mis-matched parens */ - RE_UNCLOSED_CLASS, /* '[' missing ']' */ - RE_BACKREF_IN_CLASS, /* used '\' in '[..]' */ - RE_BAD_FLAG, /* unrecognized flag (not i, g or m) */ - RE_WRONG_RANGE, /* range lo > range hi */ - RE_OUT_OF_MEMORY -} RE_Error; - -typedef struct RENode RENode; typedef struct RECharSet { - REuint8 *bits; - REuint32 length; - unsigned char sense; + bool converted; + bool sense; + uint16 length; + union { + uint8 *bits; + struct { + uint16 startIndex; + uint16 length; + } src; + } u; } RECharSet; -typedef struct REState { - REchar *srcStart; /* copy of source text */ - 
REchar *src; /* current parse position */ - REchar *srcEnd; /* end of source text */ - REuint32 parenCount; /* # capturing parens */ - REuint32 flags; /* union of flags from regexp */ - RE_Version version; - RE_Error error; /* parse-time or runtime error */ - REuint32 classCount; /* number of contained []'s */ - RECharSet *classList; /* data for []'s */ - RENode *result; /* head of result tree */ - REint32 codeLength; /* length of bytecode */ - REuint8 *pc; /* start of bytecode */ -} REState; +#define JSREG_FOLD 0x01 /* fold uppercase to lowercase */ +#define JSREG_GLOB 0x02 /* global exec, creates array of matches */ +#define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */ + + +struct JSRegExp { + uint32 parenCount:24, /* number of parenthesized submatches */ + flags:8; /* flags, see above JSREG_* defines */ + uint32 classCount; /* count [...] bitmaps */ + RECharSet *classList; /* list of [...] bitmaps */ + const jschar *source; /* locked source string, sans // */ + jsbytecode program[1]; /* regular expression bytecode */ +}; typedef struct RECapture { - REint32 index; /* start of contents of this capture, -1 for empty */ - REint32 length; /* length of capture */ + int16 index; /* start of contents, -1 for empty */ + int16 length; /* length of capture */ } RECapture; -typedef struct REMatchState { - REint32 startIndex; /* beginning of succesful match */ - REint32 endIndex; /* character beyond end of succesful match */ - REint32 parenCount; /* set to (n - 1), i.e. for /((a)b)/, this field is 1 */ - RECapture parens[1]; /* first of 'n' captures, allocated at end of this struct */ -} REMatchState; +struct REMatchResult { + uint32 startIndex; + uint32 endIndex; + uint32 parenCount; + RECapture parens[1]; /* first of 'parenCount' captures, + * allocated at end of this struct. + */ +}; +namespace JavaScript { +namespace MetaData { +class JS2Metadata; -/* - * Compiles the flags source text into a union of flag values. 
Returns RE_NO_ERROR - * or RE_BAD_FLAG. - * - */ -RE_Error parseFlags(const REchar *flagsSource, REint32 flagsLength, REuint32 *flags); +// Execute the re against the string starting at the index, return NULL for failure +REMatchResult *REExecute(JS2Metadata *meta, JSRegExp *re, const jschar *str, uint32 index, uint32 length, bool globalMultiline); -/* - * Compiles the RegExp source into a stream of REByteCodes and fills in the REState struct. - * Errors are recorded in the state 'error' field and signalled by a NULL return. - * The RegExp source does not have any delimiters. - */ -REState *REParse(const REchar *source, REint32 sourceLength, REuint32 flags, RE_Version version); +// Compile the source re, return NULL for failure (error functions called) +JSRegExp *RECompile(JS2Metadata *meta, const jschar *str, uint32 length, uint32 flags); +// Compile the flag source and build a flag bit set. Return true/false for success/failure +bool parseFlags(JS2Metadata *meta, const jschar *flagStr, uint32 length, uint32 *flags); -/* - * Execute the RegExp against the supplied text. - * The return value is NULL for no match, otherwise an REMatchState struct. - * - */ -REMatchState *REExecute(REState *pState, const REchar *text, REint32 offset, REint32 length, int globalMulitline); - - -/* - * The [[Match]] implementation, applies the regexp at the start of the text - * only (i.e. it does not search repeatedly through the text for a match). - * NULL return for no match. - * - */ -REMatchState *REMatch(REState *pState, const REchar *text, REint32 length); - - - -/* - * Throw away the RegExp and all data associated with it. - */ -void REfreeRegExp(REState *pState); - - - - -/* - * Needs to be provided by the host, following these specs: - * - * - * [1. If IgnoreCase is false, return ch. - not necessary in implementation] - * - * 2. Let u be ch converted to upper case as if by calling - * String.prototype.toUpperCase on the one-character string ch. - * 3. 
If u does not consist of a single character, return ch. - * 4. Let cu be u's character. - * 5. If ch's code point value is greater than or equal to decimal 128 and cu's - * code point value is less than decimal 128, then return ch. - * 6. Return cu. - */ -extern REchar canonicalize(REchar ch); - -/* - * The host should also provide a definition of whitespace to match the following: - * - */ -#ifndef RE_ISSPACE -#define RE_ISSPACE(c) ( (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r') || (c == '\v') || (c == '\f') ) -#endif +// Execute the re against the string, but don't try advancing into the string +REMatchResult *REMatch(JS2Metadata *meta, JSRegExp *re, const jschar *str, uint32 length); +} +} diff --git a/mozilla/js2/src/regexpwrapper.cpp b/mozilla/js2/src/regexpwrapper.cpp index 2ae8a973b74..7072df1789d 100644 --- a/mozilla/js2/src/regexpwrapper.cpp +++ b/mozilla/js2/src/regexpwrapper.cpp @@ -31,18 +31,2914 @@ * file under either the NPL or the GPL. */ -#include "systemtypes.h" -#include "utilities.h" -#include "strings.h" +#ifdef _WIN32 +#include "msvc_pragma.h" +#endif -extern "C" char16 canonicalize(char16 ch) + +#include +#include + +#include "world.h" +#include "strings.h" +#include "utilities.h" +#include "js2value.h" + +#include +#include +#include +#include + +#include "reader.h" +#include "parser.h" +#include "js2engine.h" +#include "regexp.h" +#include "bytecodecontainer.h" +#include "js2metadata.h" + + +namespace JavaScript { +namespace MetaData { + +void JS_ReportOutOfMemory(JS2Metadata *meta) { - char16 cu = JavaScript::toUpper(ch); + meta->reportError(Exception::internalError, "out of memory", 0); +} + +void js_ReportCompileErrorNumber(JS2Metadata *meta, const char *message, const jschar *extra) +{ + meta->reportError(Exception::syntaxError, message, 0, extra); +} + +void JS_ReportErrorNumber(JS2Metadata *meta, const char *message) +{ + meta->reportError(Exception::syntaxError, message, 0); +} + + +#define JS_FALSE false +#define JS_TRUE 
true + +#define JSMSG_MISSING_PAREN "Missing parentheses, {0}" +#define JSMSG_TOO_MANY_PARENS "Too many parentheses, {0}" +#define JSMSG_BAD_QUANTIFIER "Bad quantifer specification, {0}" +#define JSMSG_MIN_TOO_BIG "Range minimun too big, {0}" +#define JSMSG_MAX_TOO_BIG "Range maximum too big, {0}" +#define JSMSG_OUT_OF_ORDER "Range out of order, {0}" +#define JSMSG_BAD_CLASS_RANGE "Bad range in class" +#define JSMSG_TRAILING_SLASH "Trailing slash, {0}" +#define JSMSG_UNTERM_CLASS "Unterminated class, {0}" +#define JSMSG_UNTERM_QUANTIFIER "Unterminated quantifier" + +#define LINE_SEPARATOR 0x2028 +#define PARAGRAPH_SEPARATOR 0x2029 + + + +#define JS_ASSERT(x) ASSERT(x) +#define JSSTRING_CHARS(x) (x) + + +#define JS7_ISDEC(c) ((c) < 128 && isdigit(c)) +#define JS7_UNDEC(c) ((c) - '0') +#define JS7_ISLET(c) ((c) < 128 && isalpha(c)) + +#define JUMP_OFFSET_LEN 2 +#define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8)) +#define JUMP_OFFSET_LO(off) ((jsbytecode)(off)) +#define GET_JUMP_OFFSET(pc) ((int16)(((pc)[1] << 8) | (pc)[2])) +#define SET_JUMP_OFFSET(pc,off) ((pc)[1] = JUMP_OFFSET_HI(off), \ + (pc)[2] = JUMP_OFFSET_LO(off)) +#define JUMP_OFFSET_MIN ((int16)0x8000) +#define JUMP_OFFSET_MAX ((int16)0x7fff) + + + + +typedef struct REMatchState { + const jschar *cp; + RECapture parens[1]; /* first of 're->parenCount' captures, + * allocated at end of this struct. + */ +} REMatchState; + +/* Note : contiguity of 'simple opcodes' is important for simpleMatch() */ +typedef enum REOp { + REOP_EMPTY = 0, /* match rest of input against rest of r.e. 
*/ + REOP_ALT = 1, /* alternative subexpressions in kid and next */ + REOP_SIMPLE_START = 2, /* start of 'simple opcodes' */ + REOP_BOL = 2, /* beginning of input (or line if multiline) */ + REOP_EOL = 3, /* end of input (or line if multiline) */ + REOP_WBDRY = 4, /* match "" at word boundary */ + REOP_WNONBDRY = 5, /* match "" at word non-boundary */ + REOP_DOT = 6, /* stands for any character */ + REOP_DIGIT = 7, /* match a digit char: [0-9] */ + REOP_NONDIGIT = 8, /* match a non-digit char: [^0-9] */ + REOP_ALNUM = 9, /* match an alphanumeric char: [0-9a-z_A-Z] */ + REOP_NONALNUM = 10, /* match a non-alphanumeric char: [^0-9a-z_A-Z] */ + REOP_SPACE = 11, /* match a whitespace char */ + REOP_NONSPACE = 12, /* match a non-whitespace char */ + REOP_BACKREF = 13, /* back-reference (e.g., \1) to a parenthetical */ + REOP_FLAT = 14, /* match a flat string */ + REOP_FLAT1 = 15, /* match a single char */ + REOP_FLATi = 16, /* case-independent REOP_FLAT */ + REOP_FLAT1i = 17, /* case-independent REOP_FLAT1 */ + REOP_UCFLAT1 = 18, /* single Unicode char */ + REOP_UCFLAT1i = 19, /* case-independent REOP_UCFLAT1 */ + REOP_UCFLAT = 20, /* flat Unicode string; len immediate counts chars */ + REOP_UCFLATi = 21, /* case-independent REOP_UCFLAT */ + REOP_CLASS = 22, /* character class with index */ + REOP_NCLASS = 23, /* negated character class with index */ + REOP_SIMPLE_END = 23, /* end of 'simple opcodes' */ + REOP_QUANT = 25, /* quantified atom: atom{1,2} */ + REOP_STAR = 26, /* zero or more occurrences of kid */ + REOP_PLUS = 27, /* one or more occurrences of kid */ + REOP_OPT = 28, /* optional subexpression in kid */ + REOP_LPAREN = 29, /* left paren bytecode: kid is u.num'th sub-regexp */ + REOP_RPAREN = 30, /* right paren bytecode */ + REOP_JUMP = 31, /* for deoptimized closure loops */ + REOP_DOTSTAR = 32, /* optimize .* to use a single opcode */ + REOP_ANCHOR = 33, /* like .* but skips left context to unanchored r.e. 
*/ + REOP_EOLONLY = 34, /* $ not preceded by any pattern */ + REOP_BACKREFi = 37, /* case-independent REOP_BACKREF */ + REOP_LPARENNON = 41, /* non-capturing version of REOP_LPAREN */ + REOP_ASSERT = 43, /* zero width positive lookahead assertion */ + REOP_ASSERT_NOT = 44, /* zero width negative lookahead assertion */ + REOP_ASSERTTEST = 45, /* sentinel at end of assertion child */ + REOP_ASSERTNOTTEST = 46, /* sentinel at end of !assertion child */ + REOP_MINIMALSTAR = 47, /* non-greedy version of * */ + REOP_MINIMALPLUS = 48, /* non-greedy version of + */ + REOP_MINIMALOPT = 49, /* non-greedy version of ? */ + REOP_MINIMALQUANT = 50, /* non-greedy version of {} */ + REOP_ENDCHILD = 51, /* sentinel at end of quantifier child */ + REOP_REPEAT = 52, /* directs execution of greedy quantifier */ + REOP_MINIMALREPEAT = 53, /* directs execution of non-greedy quantifier */ + REOP_ALTPREREQ = 54, /* prerequisite for ALT, either of two chars */ + REOP_ALTPREREQ2 = 55, /* prerequisite for ALT, a char or a class */ + REOP_ENDALT = 56, /* end of final alternate */ + REOP_CONCAT = 57, /* concatenation of terms (parse time only) */ + + REOP_END +} REOp; + +#define REOP_IS_SIMPLE(op) (((op) >= REOP_SIMPLE_START) && ((op) <= REOP_SIMPLE_END)) + +struct RENode { + REOp op; /* r.e. 
op bytecode */ + RENode *next; /* next in concatenation order */ + void *kid; /* first operand */ + union { + void *kid2; /* second operand */ + jsint num; /* could be a number */ + jsint parenIndex; /* or a parenthesis index */ + struct { /* or a quantifier range */ + uint16 min; + uint16 max; + JSBool greedy; + } range; + struct { /* or a character class */ + uint16 startIndex; + uint16 kidlen; /* length of string at kid, in jschars */ + uint16 bmsize; /* bitmap size, based on max char code */ + uint16 index; /* index into class list */ + JSBool sense; + } ucclass; + struct { /* or a literal sequence */ + jschar chr; /* of one character */ + uint16 length; /* or many (via the kid) */ + } flat; + struct { + RENode *kid2; /* second operand from ALT */ + jschar ch1; /* match char for ALTPREREQ */ + jschar ch2; /* ditto, or class index for ALTPREREQ2 */ + } altprereq; + } u; +}; + + +#define RE_IS_LETTER(c) ( ((c >= 'A') && (c <= 'Z')) || \ + ((c >= 'a') && (c <= 'z')) ) +#define RE_IS_LINE_TERM(c) ( (c == '\n') || (c == '\r') || \ + (c == LINE_SEPARATOR) || (c == PARAGRAPH_SEPARATOR)) + +#define CLASS_CACHE_SIZE (4) +typedef struct CompilerState { + JS2Metadata *meta; + Pool *reNodePool; + bool strict; + const jschar *cpbegin; + const jschar *cpend; + const jschar *cp; + uintN flags; + uint16 parenCount; + uint16 classCount; /* number of [] encountered */ + size_t progLength; /* estimated bytecode length */ + uintN treeDepth; /* maximum depth of parse tree */ + RENode *result; + struct { + const jschar *start; /* small cache of class strings */ + uint16 length; /* since they're often the same */ + uint16 index; + } classCache[CLASS_CACHE_SIZE]; +} CompilerState; + +typedef struct REProgState { + jsbytecode *continue_pc; /* current continuation data */ + REOp continue_op; + int16 index; /* progress in text */ + uintN parenSoFar; /* highest indexed paren started */ + union { + struct { + uint16 min; /* current quantifier limits */ + uint16 max; + } quantifier; + 
struct { + size_t top; /* backtrack stack state */ + size_t sz; + } assertion; + } u; +} REProgState; + +typedef struct REBackTrackData { + size_t sz; /* size of previous stack entry */ + jsbytecode *backtrack_pc; /* where to backtrack to */ + REOp backtrack_op; + const jschar *cp; /* index in text of match at backtrack */ + intN parenIndex; /* start index of saved paren contents */ + uint16 parenCount; /* # of saved paren contents */ + uint16 precedingStateTop; /* number of parent states */ + /* saved parent states follow */ + /* saved paren contents follow */ +} REBackTrackData; + +#define INITIAL_STATESTACK (100) +#define INITIAL_BACKTRACK (8000) + +typedef struct REGlobalData { + JSBool globalMultiline; + JSRegExp *regexp; /* the RE in execution */ + JSBool ok; /* runtime error (out_of_memory only?) */ + size_t start; /* offset to start at */ + ptrdiff_t skipped; /* chars skipped anchoring this r.e. */ + const jschar *cpbegin, *cpend; /* text base address and limit */ + + REProgState *stateStack; /* stack of state of current parents */ + uint16 stateStackTop; + uint16 maxStateStack; + + REBackTrackData *backTrackStack;/* stack of matched-so-far positions */ + REBackTrackData *backTrackSP; + size_t maxBackTrack; + size_t cursz; /* size of current stack entry */ + +} REGlobalData; + +bool JS_ISWORD(jschar ch) +{ + CharInfo chi(ch); + return ch == '_' || isAlphanumeric(chi); +} + +bool JS_ISSPACE(jschar ch) +{ + CharInfo chi(ch); + return isSpace(chi); +} + +bool JS_ISDIGIT(jschar ch) +{ + CharInfo chi(ch); + return isDecimalDigit(chi); +} + +/* + * 1. If IgnoreCase is false, return ch. + * 2. Let u be ch converted to upper case as if by calling + * String.prototype.toUpperCase on the one-character string ch. + * 3. If u does not consist of a single character, return ch. + * 4. Let cu be u's character. + * 5. If ch's code point value is greater than or equal to decimal 128 and cu's + * code point value is less than decimal 128, then return ch. + * 6. Return cu. 
+ */ +static jschar +canonicalize(jschar ch) +{ + jschar cu = toUpper(ch); if ((ch >= 128) && (cu < 128)) return ch; return cu; } +/* Construct and initialize an RENode, returning NULL for out-of-memory */ +static RENode * +NewRENode(CompilerState *state, REOp op) +{ + RENode *ren; + ren = new (*state->reNodePool) RENode(); + + if (!ren) { + JS_ReportOutOfMemory(state->meta); + return NULL; + } + ren->op = op; + ren->next = NULL; + ren->kid = NULL; + return ren; +} + +/* + * Validates and converts hex ascii value. + */ +static JSBool +isASCIIHexDigit(jschar c, uintN *digit) +{ + uintN cv = c; + + if (cv < '0') + return JS_FALSE; + if (cv <= '9') { + *digit = cv - '0'; + return JS_TRUE; + } + cv |= 0x20; + if (cv >= 'a' && cv <= 'f') { + *digit = cv - 'a' + 10; + return JS_TRUE; + } + return JS_FALSE; +} -#include "regexp.c" +typedef struct { + REOp op; + const jschar *errPos; + uint16 parenIndex; +} REOpData; + + +/* + * Process the op against the two top operands, reducing them to a single + * operand in the penultimate slot. Update progLength and treeDepth. + */ +static JSBool +processOp(CompilerState *state, REOpData *opData, RENode **operandStack, intN operandSP) +{ + RENode *result; + + switch (opData->op) { + case REOP_ALT: + result = NewRENode(state, REOP_ALT); + if (!result) + return JS_FALSE; + result->kid = operandStack[operandSP - 2]; + result->u.kid2 = operandStack[operandSP - 1]; + operandStack[operandSP - 2] = result; + /* + * look at both alternates to see if there's a FLAT or a CLASS at + * the start of each. If so, use a prerequisite match + */ + ++state->treeDepth; + if ((((RENode *)(result->kid))->op == REOP_FLAT) + && (((RENode *)(result->u.kid2))->op == REOP_FLAT) + && ((state->flags & JSREG_FOLD) == 0) ) { + result->op = REOP_ALTPREREQ; + result->u.altprereq.ch1 + = ((RENode *)(result->kid))->u.flat.chr; + result->u.altprereq.ch2 + = ((RENode *)(result->u.kid2))->u.flat.chr; + /* ALTPREREQ, , uch1, uch2, , ..., + JUMP, ... 
ENDALT */ + state->progLength += 13; + } + else + if ((((RENode *)(result->kid))->op == REOP_CLASS) + && (((RENode *)(result->kid))->u.ucclass.index < 256) + && (((RENode *)(result->u.kid2))->op == REOP_FLAT) + && ((state->flags & JSREG_FOLD) == 0) ) { + result->op = REOP_ALTPREREQ2; + result->u.altprereq.ch1 + = ((RENode *)(result->u.kid2))->u.flat.chr; + result->u.altprereq.ch2 + = ((RENode *)(result->kid))->u.ucclass.index; + /* ALTPREREQ2, , uch1, uch2, , ..., + JUMP, ... ENDALT */ + state->progLength += 13; + } + else + if ((((RENode *)(result->kid))->op == REOP_FLAT) + && (((RENode *)(result->u.kid2))->op == REOP_CLASS) + && (((RENode *)(result->u.kid2))->u.ucclass.index < 256) + && ((state->flags & JSREG_FOLD) == 0) ) { + result->op = REOP_ALTPREREQ2; + result->u.altprereq.ch1 + = ((RENode *)(result->kid))->u.flat.chr; + result->u.altprereq.ch2 + = ((RENode *)(result->u.kid2))->u.ucclass.index; + /* ALTPREREQ2, , uch1, uch2, , ..., + JUMP, ... ENDALT */ + state->progLength += 13; + } + else + /* ALT, , ..., JUMP, ... ENDALT */ + state->progLength += 7; + break; + case REOP_CONCAT: + result = operandStack[operandSP - 2]; + while (result->next) + result = result->next; + result->next = operandStack[operandSP - 1]; + break; + case REOP_ASSERT: + case REOP_ASSERT_NOT: + case REOP_LPARENNON: + case REOP_LPAREN: + /* These should have been processed by a close paren. */ + js_ReportCompileErrorNumber(state->meta, JSMSG_MISSING_PAREN, opData->errPos); + return JS_FALSE; + } + return JS_TRUE; +} + +/* + * Parser forward declarations. + */ +static JSBool parseTerm(CompilerState *state); +static JSBool parseQuantifier(CompilerState *state); + +/* + * Top-down regular expression grammar, based closely on Perl4. + * + * regexp: altern A regular expression is one or more + * altern '|' regexp alternatives separated by vertical bar. 
+ */ + +#define INITIAL_STACK_SIZE (128) +static JSBool +parseRegExp(CompilerState *state) +{ + const jschar *errPos; + uint16 parenIndex; + RENode *operand; + REOpData *operatorStack; + RENode **operandStack; + REOp op; + intN i; + JSBool result = JS_FALSE; + + intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE; + intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE; + + /* Watch out for empty regexp */ + if (state->cp == state->cpend) { + state->result = NewRENode(state, REOP_EMPTY); + return JS_TRUE; + } + + operatorStack = (REOpData *)malloc(sizeof(REOpData) * operatorStackSize); + if (!operatorStack) + return JS_FALSE; + + operandStack = (RENode **)malloc(sizeof(RENode *) * operandStackSize); + if (!operandStack) + goto out; + + + while (JS_TRUE) { + if (state->cp != state->cpend) { + switch (*state->cp) { + /* balance '(' */ + case '(': /* balance ')' */ + errPos = state->cp; + ++state->cp; + if ((state->cp < state->cpend) && (*state->cp == '?') + && ( (state->cp[1] == '=') + || (state->cp[1] == '!') + || (state->cp[1] == ':') )) { + ++state->cp; + if (state->cp == state->cpend) { + js_ReportCompileErrorNumber(state->meta, JSMSG_MISSING_PAREN, + errPos); + goto out; + } + switch (*state->cp++) { + case '=': + op = REOP_ASSERT; + /* ASSERT, , ... ASSERTTEST */ + state->progLength += 4; + break; + case '!': + op = REOP_ASSERT_NOT; + /* ASSERTNOT, , ... ASSERTNOTTEST */ + state->progLength += 4; + break; + case ':': + op = REOP_LPARENNON; + break; + } + parenIndex = state->parenCount; + } + else { + op = REOP_LPAREN; + /* LPAREN, , ... 
RPAREN, */ + state->progLength += 6; + parenIndex = state->parenCount++; + if (state->parenCount == 0) { + js_ReportCompileErrorNumber(state->meta, JSMSG_TOO_MANY_PARENS, + errPos); + goto out; + } + } + goto pushOperator; + case '|': + case ')': + /* Expected an operand before these, so make an empty one */ + operand = NewRENode(state, REOP_EMPTY); + if (!operand) + goto out; + goto pushOperand; + default: + if (!parseTerm(state)) + goto out; + operand = state->result; +pushOperand: + if (operandSP == operandStackSize) { + operandStackSize += operandStackSize; + operandStack = + (RENode **)realloc(operandStack, + sizeof(RENode *) * operandStackSize); + if (!operandStack) + goto out; + } + operandStack[operandSP++] = operand; + break; + } + } + /* At the end; process remaining operators */ +restartOperator: + if (state->cp == state->cpend) { + while (operatorSP) { + --operatorSP; + if (!processOp(state, &operatorStack[operatorSP], + operandStack, operandSP)) + goto out; + --operandSP; + } + JS_ASSERT(operandSP == 1); + state->result = operandStack[0]; + result = JS_TRUE; + goto out; + } + switch (*state->cp) { + case '|': + /* Process any stacked 'concat' operators */ + ++state->cp; + while (operatorSP + && (operatorStack[operatorSP - 1].op == REOP_CONCAT)) { + --operatorSP; + if (!processOp(state, &operatorStack[operatorSP], + operandStack, operandSP)) + goto out; + --operandSP; + } + op = REOP_ALT; + goto pushOperator; + + case ')': + /* If there's not a stacked open parentheses,we + * accept the close as a flat. 
+ */ + for (i = operatorSP - 1; i >= 0; i--) + if ((operatorStack[i].op == REOP_ASSERT) + || (operatorStack[i].op == REOP_ASSERT_NOT) + || (operatorStack[i].op == REOP_LPARENNON) + || (operatorStack[i].op == REOP_LPAREN)) + break; + if (i == -1) { + if (!parseTerm(state)) + goto out; + operand = state->result; + goto pushOperand; + } + ++state->cp; + /* process everything on the stack until the open */ + while (JS_TRUE) { + JS_ASSERT(operatorSP); + --operatorSP; + switch (operatorStack[operatorSP].op) { + case REOP_ASSERT: + case REOP_ASSERT_NOT: + case REOP_LPAREN: + operand = NewRENode(state, operatorStack[operatorSP].op); + if (!operand) + goto out; + operand->u.parenIndex + = operatorStack[operatorSP].parenIndex; + JS_ASSERT(operandSP); + operand->kid = operandStack[operandSP - 1]; + operandStack[operandSP - 1] = operand; + ++state->treeDepth; + /* fall thru... */ + case REOP_LPARENNON: + state->result = operandStack[operandSP - 1]; + if (!parseQuantifier(state)) + goto out; + operandStack[operandSP - 1] = state->result; + goto restartOperator; + default: + if (!processOp(state, &operatorStack[operatorSP], + operandStack, operandSP)) + goto out; + --operandSP; + break; + } + } + break; + default: + /* Anything else is the start of the next term */ + op = REOP_CONCAT; +pushOperator: + if (operatorSP == operatorStackSize) { + operatorStackSize += operatorStackSize; + operatorStack = + (REOpData *)realloc(operatorStack, + sizeof(REOpData) * operatorStackSize); + if (!operatorStack) + goto out; + } + operatorStack[operatorSP].op = op; + operatorStack[operatorSP].errPos = errPos; + operatorStack[operatorSP++].parenIndex = parenIndex; + break; + } + } +out: + if (operatorStack) + free(operatorStack); + if (operandStack) + free(operandStack); + return result; +} + +/* + * Extract and return a decimal value at state->cp, the + * initial character 'c' has already been read. 
+ */ +static intN +getDecimalValue(jschar c, CompilerState *state) +{ + intN value = JS7_UNDEC(c); + while (state->cp < state->cpend) { + c = *state->cp; + if (!JS7_ISDEC(c)) + break; + value = (10 * value) + JS7_UNDEC(c); + ++state->cp; + } + return value; +} + +/* + * Calculate the total size of the bitmap required for a class expression. + */ +static JSBool +calculateBitmapSize(CompilerState *state, RENode *target, const jschar *src, + const jschar *end) +{ + jschar rangeStart, c; + uintN n, digit, nDigits, i; + uintN max = 0; + JSBool inRange = JS_FALSE; + + target->u.ucclass.bmsize = 0; + target->u.ucclass.sense = JS_TRUE; + + if (src == end) + return JS_TRUE; + + if (*src == '^') { + ++src; + target->u.ucclass.sense = JS_FALSE; + } + + while (src != end) { + uintN localMax = 0; + switch (*src) { + case '\\': + ++src; + c = *src++; + switch (c) { + case 'b': + localMax = 0x8; + break; + case 'f': + localMax = 0xC; + break; + case 'n': + localMax = 0xA; + break; + case 'r': + localMax = 0xD; + break; + case 't': + localMax = 0x9; + break; + case 'v': + localMax = 0xB; + break; + case 'c': + if (((src + 1) < end) && RE_IS_LETTER(src[1])) + localMax = (jschar)(*src++ & 0x1F); + else + localMax = '\\'; + break; + case 'x': + nDigits = 2; + goto lexHex; + case 'u': + nDigits = 4; +lexHex: + n = 0; + for (i = 0; (i < nDigits) && (src < end); i++) { + c = *src++; + if (!isASCIIHexDigit(c, &digit)) { + /* + * Back off to accepting the original + *'\' as a literal. 
+ */ + src -= (i + 1); + n = '\\'; + break; + } + n = (n << 4) | digit; + } + localMax = n; + break; + case 'd': + if (inRange) { + JS_ReportErrorNumber(state->meta, JSMSG_BAD_CLASS_RANGE); + return JS_FALSE; + } + localMax = '9'; + break; + case 'D': + case 's': + case 'S': + case 'w': + case 'W': + if (inRange) { + JS_ReportErrorNumber(state->meta, JSMSG_BAD_CLASS_RANGE); + return JS_FALSE; + } + target->u.ucclass.bmsize = 65535; + return JS_TRUE; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + /* + * This is a non-ECMA extension - decimal escapes (in this + * case, octal!) are supposed to be an error inside class + * ranges, but supported here for backwards compatibility. + * + */ + n = JS7_UNDEC(c); + c = *src; + if ('0' <= c && c <= '7') { + src++; + n = 8 * n + JS7_UNDEC(c); + c = *src; + if ('0' <= c && c <= '7') { + src++; + i = 8 * n + JS7_UNDEC(c); + if (i <= 0377) + n = i; + else + src--; + } + } + localMax = n; + break; + + default: + localMax = c; + break; + } + break; + default: + localMax = *src++; + break; + } + if (inRange) { + if (rangeStart > localMax) { + JS_ReportErrorNumber(state->meta, JSMSG_BAD_CLASS_RANGE); + return JS_FALSE; + } + inRange = JS_FALSE; + } + else { + if (src < (end - 1)) { + if (*src == '-') { + ++src; + inRange = JS_TRUE; + rangeStart = (jschar)localMax; + continue; + } + } + } + if (state->flags & JSREG_FOLD) { + c = canonicalize((jschar)localMax); + if (c > localMax) + localMax = c; + } + if (localMax > max) + max = localMax; + } + target->u.ucclass.bmsize = max; + return JS_TRUE; +} + +/* + * item: assertion An item is either an assertion or + * quantatom a quantified atom. + * + * assertion: '^' Assertions match beginning of string + * (or line if the class static property + * RegExp.multiline is true). + * '$' End of string (or line if the class + * static property RegExp.multiline is + * true). + * '\b' Word boundary (between \w and \W). + * '\B' Word non-boundary. 
+ * + * quantatom: atom An unquantified atom. + * quantatom '{' n ',' m '}' + * Atom must occur between n and m times. + * quantatom '{' n ',' '}' Atom must occur at least n times. + * quantatom '{' n '}' Atom must occur exactly n times. + * quantatom '*' Zero or more times (same as {0,}). + * quantatom '+' One or more times (same as {1,}). + * quantatom '?' Zero or one time (same as {0,1}). + * + * any of which can be optionally followed by '?' for ungreedy + * + * atom: '(' regexp ')' A parenthesized regexp (what matched + * can be addressed using a backreference, + * see '\' n below). + * '.' Matches any char except '\n'. + * '[' classlist ']' A character class. + * '[' '^' classlist ']' A negated character class. + * '\f' Form Feed. + * '\n' Newline (Line Feed). + * '\r' Carriage Return. + * '\t' Horizontal Tab. + * '\v' Vertical Tab. + * '\d' A digit (same as [0-9]). + * '\D' A non-digit. + * '\w' A word character, [0-9a-z_A-Z]. + * '\W' A non-word character. + * '\s' A whitespace character, [ \b\f\n\r\t\v]. + * '\S' A non-whitespace character. + * '\' n A backreference to the nth (n decimal + * and positive) parenthesized expression. + * '\' octal An octal escape sequence (octal must be + * two or three digits long, unless it is + * 0 for the null character). + * '\x' hex A hex escape (hex must be two digits). + * '\u' unicode A unicode escape (must be four digits). + * '\c' ctrl A control character, ctrl is a letter. + * '\' literalatomchar Any character except one of the above + * that follow '\' in an atom. + * otheratomchar Any character not first among the other + * atom right-hand sides. 
+ */ +static JSBool +parseTerm(CompilerState *state) +{ + jschar c = *state->cp++; + uintN nDigits; + uintN parenBaseCount = state->parenCount; + uintN num, tmp, n, i; + const jschar *termStart; + JSBool foundCachedCopy; + + switch (c) { + /* assertions and atoms */ + case '^': + state->result = NewRENode(state, REOP_BOL); + if (!state->result) + return JS_FALSE; + state->progLength++; + return JS_TRUE; + case '$': + state->result = NewRENode(state, REOP_EOL); + if (!state->result) + return JS_FALSE; + state->progLength++; + return JS_TRUE; + case '\\': + if (state->cp >= state->cpend) { + /* a trailing '\' is an error */ + js_ReportCompileErrorNumber(state->meta, JSMSG_TRAILING_SLASH, state->cp); + return JS_FALSE; + } + c = *state->cp++; + switch (c) { + /* assertion escapes */ + case 'b' : + state->result = NewRENode(state, REOP_WBDRY); + if (!state->result) + return JS_FALSE; + state->progLength++; + return JS_TRUE; + case 'B': + state->result = NewRENode(state, REOP_WNONBDRY); + if (!state->result) + return JS_FALSE; + state->progLength++; + return JS_TRUE; + /* Decimal escape */ + case '0': + if (state->strict) + c = 0; + else { + doOctal: + num = 0; + while (state->cp < state->cpend) { + if ('0' <= (c = *state->cp) && c <= '7') { + state->cp++; + tmp = 8 * num + (uintN)JS7_UNDEC(c); + if (tmp > 0377) + break; + num = tmp; + } + else + break; + } + c = (jschar)(num); + } + doFlat: + state->result = NewRENode(state, REOP_FLAT); + if (!state->result) + return JS_FALSE; + state->result->u.flat.chr = c; + state->result->u.flat.length = 1; + state->progLength += 3; + break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + termStart = state->cp - 1; + num = (uintN)getDecimalValue(c, state); + if (num > 9 && + num > state->parenCount && + !(state->strict)) { + state->cp = termStart; + goto doOctal; + } + state->result = NewRENode(state, REOP_BACKREF); + if (!state->result) + return JS_FALSE; + 
state->result->u.parenIndex = num - 1; + state->progLength += 3; + break; + /* Control escape */ + case 'f': + c = 0xC; + goto doFlat; + case 'n': + c = 0xA; + goto doFlat; + case 'r': + c = 0xD; + goto doFlat; + case 't': + c = 0x9; + goto doFlat; + case 'v': + c = 0xB; + goto doFlat; + /* Control letter */ + case 'c': + if (((state->cp + 1) < state->cpend) && + RE_IS_LETTER(state->cp[1])) + c = (jschar)(*state->cp++ & 0x1F); + else { + /* back off to accepting the original '\' as a literal */ + --state->cp; + c = '\\'; + } + goto doFlat; + /* HexEscapeSequence */ + case 'x': + nDigits = 2; + goto lexHex; + /* UnicodeEscapeSequence */ + case 'u': + nDigits = 4; +lexHex: + n = 0; + for (i = 0; (i < nDigits) + && (state->cp < state->cpend); i++) { + uintN digit; + c = *state->cp++; + if (!isASCIIHexDigit(c, &digit)) { + /* + * back off to accepting the original + * 'u' or 'x' as a literal + */ + state->cp -= (i + 2); + n = *state->cp++; + break; + } + n = (n << 4) | digit; + } + c = (jschar)(n); + goto doFlat; + /* Character class escapes */ + case 'd': + state->result = NewRENode(state, REOP_DIGIT); +doSimple: + if (!state->result) + return JS_FALSE; + state->progLength++; + break; + case 'D': + state->result = NewRENode(state, REOP_NONDIGIT); + goto doSimple; + case 's': + state->result = NewRENode(state, REOP_SPACE); + goto doSimple; + case 'S': + state->result = NewRENode(state, REOP_NONSPACE); + goto doSimple; + case 'w': + state->result = NewRENode(state, REOP_ALNUM); + goto doSimple; + case 'W': + state->result = NewRENode(state, REOP_NONALNUM); + goto doSimple; + /* IdentityEscape */ + default: + state->result = NewRENode(state, REOP_FLAT); + if (!state->result) + return JS_FALSE; + state->result->u.flat.chr = c; + state->result->u.flat.length = 1; + state->result->kid = (void *)(state->cp - 1); + state->progLength += 3; + break; + } + break; + case '[': + state->result = NewRENode(state, REOP_CLASS); + if (!state->result) + return JS_FALSE; + termStart = 
state->cp; + state->result->u.ucclass.startIndex = termStart - state->cpbegin; + while (JS_TRUE) { + if (state->cp == state->cpend) { + js_ReportCompileErrorNumber(state->meta, JSMSG_UNTERM_CLASS, termStart); + return JS_FALSE; + } + if (*state->cp == '\\') + state->cp++; + else { + if (*state->cp == ']') { + state->result->u.ucclass.kidlen = state->cp - termStart; + break; + } + } + state->cp++; + } + foundCachedCopy = JS_FALSE; + for (i = 0; i < CLASS_CACHE_SIZE; i++) { + if (state->classCache[i].start) { + if (state->classCache[i].length == state->result->u.ucclass.kidlen) { + foundCachedCopy = JS_TRUE; + for (n = 0; n < state->classCache[i].length; n++) { + if (state->classCache[i].start[n] != termStart[n]) { + foundCachedCopy = JS_FALSE; + break; + } + } + if (foundCachedCopy) { + state->result->u.ucclass.index = state->classCache[i].index; + break; + } + } + } + else { + state->classCache[i].start = termStart; + state->classCache[i].length = state->result->u.ucclass.kidlen; + state->classCache[i].index = state->classCount; + break; + } + } + if (!foundCachedCopy) + state->result->u.ucclass.index = state->classCount++; + /* + * Call calculateBitmapSize now as we want any errors it finds + * to be reported during the parse phase, not at execution. 
+ */ + if (!calculateBitmapSize(state, state->result, termStart, state->cp++)) + return JS_FALSE; + state->progLength += 3; /* CLASS, */ + break; + + case '.': + state->result = NewRENode(state, REOP_DOT); + goto doSimple; + case '*': + case '+': + case '?': + js_ReportCompileErrorNumber(state->meta, JSMSG_BAD_QUANTIFIER, state->cp - 1); + return JS_FALSE; + default: + state->result = NewRENode(state, REOP_FLAT); + if (!state->result) + return JS_FALSE; + state->result->u.flat.chr = c; + state->result->u.flat.length = 1; + state->result->kid = (void *)(state->cp - 1); + state->progLength += 3; + break; + } + return parseQuantifier(state); +} + +static JSBool +parseQuantifier(CompilerState *state) +{ + RENode *term; + term = state->result; + if (state->cp < state->cpend) { + switch (*state->cp) { + case '+': + state->result = NewRENode(state, REOP_QUANT); + if (!state->result) + return JS_FALSE; + state->result->u.range.min = 1; + state->result->u.range.max = -1; + /* , ... */ + state->progLength += 4; + goto quantifier; + case '*': + state->result = NewRENode(state, REOP_QUANT); + if (!state->result) + return JS_FALSE; + state->result->u.range.min = 0; + state->result->u.range.max = -1; + /* , ... */ + state->progLength += 4; + goto quantifier; + case '?': + state->result = NewRENode(state, REOP_QUANT); + if (!state->result) + return JS_FALSE; + state->result->u.range.min = 0; + state->result->u.range.max = 1; + /* , ... */ + state->progLength += 4; + goto quantifier; + case '{': /* balance '}' */ + { + const char *err; + intN min = 0; + intN max = -1; + jschar c; + const jschar *errp = state->cp++; + + c = *state->cp; + if (JS7_ISDEC(c)) { + ++state->cp; + min = getDecimalValue(c, state); + c = *state->cp; + } + else { + /* For Perl etc. compatibility, if a curly is not + * followed by a proper digit, back off from it + * being a quantifier, and chew it up as a literal + * atom next time instead. 
+ */ + --state->cp; + return JS_TRUE; + } + state->result = NewRENode(state, REOP_QUANT); + if (!state->result) + return JS_FALSE; + + if (min >> 16) { + err = JSMSG_MIN_TOO_BIG; + goto quantError; + } + if (c == ',') { + c = *++state->cp; + if (JS7_ISDEC(c)) { + ++state->cp; + max = getDecimalValue(c, state); + c = *state->cp; + if (max >> 16) { + err = JSMSG_MAX_TOO_BIG; + goto quantError; + } + if (min > max) { + err = JSMSG_OUT_OF_ORDER; + goto quantError; + } + } + } + else { + max = min; + } + state->result->u.range.min = min; + state->result->u.range.max = max; + /* QUANT, , , ... */ + state->progLength += 8; + /* balance '{' */ + if (c == '}') + goto quantifier; + else { + err = JSMSG_UNTERM_QUANTIFIER; +quantError: + js_ReportCompileErrorNumber(state->meta, err, errp); + return JS_FALSE; + } + } + } + } + return JS_TRUE; + +quantifier: + ++state->treeDepth; + ++state->cp; + state->result->kid = term; + if ((state->cp < state->cpend) && (*state->cp == '?')) { + ++state->cp; + state->result->u.range.greedy = JS_FALSE; + } + else + state->result->u.range.greedy = JS_TRUE; + return JS_TRUE; +} + +#define CHECK_OFFSET(diff) (JS_ASSERT(((diff) >= -32768) && ((diff) <= 32767))) +#define SET_OFFSET(pc,off) ((pc)[0] = JUMP_OFFSET_HI(off), \ + (pc)[1] = JUMP_OFFSET_LO(off)) +#define GET_OFFSET(pc) ((int16)(((pc)[0] << 8) | (pc)[1])) +#define OFFSET_LEN (2) +#define GET_ARG(pc) GET_OFFSET(pc) +#define SET_ARG(pc,arg) SET_OFFSET(pc,arg) +#define ARG_LEN OFFSET_LEN + +/* + * Recursively generate bytecode for the tree rooted at t. Iteratively. 
+ */ + +typedef struct { + RENode *nextAlt; + jsbytecode *nextAltFixup, *nextTermFixup, *endTermFixup; + RENode *continueNode; + REOp continueOp; +} EmitStateStackEntry; + +static jsbytecode * +emitREBytecode(CompilerState *state, JSRegExp *re, intN treeDepth, + jsbytecode *pc, RENode *t) +{ + ptrdiff_t diff; + RECharSet *charSet; + EmitStateStackEntry *emitStateSP, *emitStateStack = NULL; + REOp op; + + if (treeDepth) { + emitStateStack = + (EmitStateStackEntry *)malloc(sizeof(EmitStateStackEntry) + * treeDepth); + if (!emitStateStack) + return NULL; + } + emitStateSP = emitStateStack; + op = t->op; + + while (JS_TRUE) { + *pc++ = op; + switch (op) { + case REOP_EMPTY: + --pc; + break; + + case REOP_ALTPREREQ2: + case REOP_ALTPREREQ: + JS_ASSERT(emitStateSP); + emitStateSP->endTermFixup = pc; + pc += OFFSET_LEN; + SET_ARG(pc, t->u.altprereq.ch1); + pc += ARG_LEN; + SET_ARG(pc, t->u.altprereq.ch2); + pc += ARG_LEN; + + emitStateSP->nextAltFixup = pc; /* address of next alternate */ + pc += OFFSET_LEN; + + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_JUMP; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + + case REOP_JUMP: + emitStateSP->nextTermFixup = pc; /* address of following term */ + pc += OFFSET_LEN; + diff = pc - emitStateSP->nextAltFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->nextAltFixup, diff); + emitStateSP->continueOp = REOP_ENDALT; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->u.kid2); + op = t->op; + continue; + + case REOP_ENDALT: + diff = pc - emitStateSP->nextTermFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->nextTermFixup, diff); + if (t->op != REOP_ALT) { + diff = pc - emitStateSP->endTermFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->endTermFixup, diff); + } + break; + + case REOP_ALT: + JS_ASSERT(emitStateSP); + emitStateSP->nextAltFixup = pc; /* address of pointer to next 
alternate */ + pc += OFFSET_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_JUMP; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + + case REOP_FLAT: + /* + * Consecutize FLAT's if possible. + */ + if (t->kid) { + while (t->next && (t->next->op == REOP_FLAT) + && (((jschar*)(t->kid) + t->u.flat.length) + == (jschar*)(t->next->kid))) { + t->u.flat.length += t->next->u.flat.length; + t->next = t->next->next; + } + } + if (t->kid && (t->u.flat.length > 1)) { + if (state->flags & JSREG_FOLD) + pc[-1] = REOP_FLATi; + else + pc[-1] = REOP_FLAT; + SET_ARG(pc, (jschar *)(t->kid) - state->cpbegin); + pc += ARG_LEN; + SET_ARG(pc, t->u.flat.length); + pc += ARG_LEN; + } + else { + if (t->u.flat.chr < 256) { + if (state->flags & JSREG_FOLD) + pc[-1] = REOP_FLAT1i; + else + pc[-1] = REOP_FLAT1; + *pc++ = (jsbytecode)(t->u.flat.chr); + } + else { + if (state->flags & JSREG_FOLD) + pc[-1] = REOP_UCFLAT1i; + else + pc[-1] = REOP_UCFLAT1; + SET_ARG(pc, t->u.flat.chr); + pc += ARG_LEN; + } + } + break; + + case REOP_LPAREN: + JS_ASSERT(emitStateSP); + SET_ARG(pc, t->u.parenIndex); + pc += ARG_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_RPAREN; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + case REOP_RPAREN: + SET_ARG(pc, t->u.parenIndex); + pc += ARG_LEN; + break; + + case REOP_BACKREF: + SET_ARG(pc, t->u.parenIndex); + pc += ARG_LEN; + break; + case REOP_ASSERT: + JS_ASSERT(emitStateSP); + emitStateSP->nextTermFixup = pc; + pc += OFFSET_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_ASSERTTEST; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + case REOP_ASSERTTEST: + case REOP_ASSERTNOTTEST: + diff = pc - emitStateSP->nextTermFixup; + CHECK_OFFSET(diff); + 
SET_OFFSET(emitStateSP->nextTermFixup, diff); + break; + case REOP_ASSERT_NOT: + JS_ASSERT(emitStateSP); + emitStateSP->nextTermFixup = pc; + pc += OFFSET_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_ASSERTNOTTEST; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + case REOP_QUANT: + JS_ASSERT(emitStateSP); + if ((t->u.range.min == 0) && (t->u.range.max == (uint16)(-1))) + pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR; + else + if ((t->u.range.min == 0) && (t->u.range.max == 1)) + pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT; + else + if ((t->u.range.min == 1) && (t->u.range.max == (uint16)(-1))) + pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS; + else { + if (!t->u.range.greedy) pc[-1] = REOP_MINIMALQUANT; + SET_ARG(pc, t->u.range.min); + pc += ARG_LEN; + SET_ARG(pc, t->u.range.max); + pc += ARG_LEN; + } + emitStateSP->nextTermFixup = pc; + pc += OFFSET_LEN; + emitStateSP->continueNode = t; + emitStateSP->continueOp = REOP_ENDCHILD; + ++emitStateSP; + JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth); + t = (RENode *)(t->kid); + op = t->op; + continue; + case REOP_ENDCHILD: + diff = pc - emitStateSP->nextTermFixup; + CHECK_OFFSET(diff); + SET_OFFSET(emitStateSP->nextTermFixup, diff); + break; + case REOP_CLASS: + if (!t->u.ucclass.sense) + pc[-1] = REOP_NCLASS; + SET_ARG(pc, t->u.ucclass.index); + pc += ARG_LEN; + charSet = &re->classList[t->u.ucclass.index]; + charSet->converted = JS_FALSE; + charSet->length = t->u.ucclass.bmsize; + charSet->u.src.startIndex = t->u.ucclass.startIndex; + charSet->u.src.length = t->u.ucclass.kidlen; + charSet->sense = t->u.ucclass.sense; + break; + default: + break; + } + t = t->next; + if (t == NULL) { + if (emitStateSP == emitStateStack) + break; + --emitStateSP; + t = emitStateSP->continueNode; + op = emitStateSP->continueOp; + } + else + op = t->op; + } + if (emitStateStack) + 
free(emitStateStack); + return pc; +} + +/* + * Save the current state of the match - the position in the input + * text as well as the position in the bytecode. The state of any + * parent expressions is also saved (preceding state). + * Contents of parenCount parentheses from parenIndex are also saved. + */ +static REBackTrackData * +pushBackTrackState(REGlobalData *gData, REOp op, + jsbytecode *target, REMatchState *x, const jschar *cp, + intN parenIndex, intN parenCount) +{ + intN i; + REBackTrackData *result + = (REBackTrackData *)((char *)(gData->backTrackSP) + gData->cursz); + + size_t sz = sizeof(REBackTrackData) + + gData->stateStackTop * sizeof(REProgState) + + parenCount * sizeof(RECapture); + + + if (((char *)result + sz) + > (char *)gData->backTrackStack + gData->maxBackTrack) { + ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack; + gData->backTrackStack + = (REBackTrackData *)realloc(gData->backTrackStack, + gData->maxBackTrack + + gData->maxBackTrack); + gData->maxBackTrack <<= 1; + if (!gData->backTrackStack) + return NULL; + result = (REBackTrackData *)((char *)gData->backTrackStack + offset); + } + gData->backTrackSP = result; + result->sz = gData->cursz; + gData->cursz = sz; + + result->backtrack_op = op; + result->backtrack_pc = target; + result->cp = cp; + result->parenCount = parenCount; + + result->precedingStateTop = gData->stateStackTop; + JS_ASSERT(gData->stateStackTop); + memcpy(result + 1, gData->stateStack, + sizeof(REProgState) * result->precedingStateTop); + + if (parenCount != -1) { + result->parenIndex = parenIndex; + memcpy((char *)(result + 1) + + sizeof(REProgState) * result->precedingStateTop, + &x->parens[parenIndex], + sizeof(RECapture) * parenCount); + for (i = 0; i < parenCount; i++) + x->parens[parenIndex + i].index = -1; + } + + return result; +} + + +/* + * Consecutive literal characters. 
+ */ +static REMatchState * +flatNMatcher(REGlobalData *gData, REMatchState *x, const jschar *matchChars, + intN length) +{ + intN i; + if ((x->cp + length) > gData->cpend) + return NULL; + for (i = 0; i < length; i++) { + if (matchChars[i] != x->cp[i]) + return NULL; + } + x->cp += length; + return x; +} + +static REMatchState * +flatNIMatcher(REGlobalData *gData, REMatchState *x, const jschar *matchChars, + intN length) +{ + intN i; + if ((x->cp + length) > gData->cpend) + return NULL; + for (i = 0; i < length; i++) { + if (canonicalize(matchChars[i]) + != canonicalize(x->cp[i])) + return NULL; + } + x->cp += length; + return x; +} + +/* + * 1. Evaluate DecimalEscape to obtain an EscapeValue E. + * 2. If E is not a character then go to step 6. + * 3. Let ch be E's character. + * 4. Let A be a one-element RECharSet containing the character ch. + * 5. Call CharacterSetMatcher(A, false) and return its Matcher result. + * 6. E must be an integer. Let n be that integer. + * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception. + * 8. Return an internal Matcher closure that takes two arguments, a State x + * and a Continuation c, and performs the following: + * 1. Let cap be x's captures internal array. + * 2. Let s be cap[n]. + * 3. If s is undefined, then call c(x) and return its result. + * 4. Let e be x's endIndex. + * 5. Let len be s's length. + * 6. Let f be e+len. + * 7. If f>InputLength, return failure. + * 8. If there exists an integer i between 0 (inclusive) and len (exclusive) + * such that Canonicalize(s[i]) is not the same character as + * Canonicalize(Input [e+i]), then return failure. + * 9. Let y be the State (f, cap). + * 10. Call c(y) and return its result. 
+ */ +static REMatchState * +backrefMatcher(REGlobalData *gData, REMatchState *x, uintN parenIndex) +{ + uintN len; + uintN i; + const jschar *parenContent; + RECapture *s = &x->parens[parenIndex]; + if (s->index == -1) + return x; + + len = s->length; + if ((x->cp + len) > gData->cpend) + return NULL; + + parenContent = &gData->cpbegin[s->index]; + if (gData->regexp->flags & JSREG_FOLD) { + for (i = 0; i < len; i++) { + if (canonicalize(parenContent[i]) + != canonicalize(x->cp[i])) + return NULL; + } + } + else { + for (i = 0; i < len; i++) { + if (parenContent[i] != x->cp[i]) + return NULL; + } + } + x->cp += len; + return x; +} + + +/* Add a single character to the RECharSet */ +static void +addCharacterToCharSet(RECharSet *cs, jschar c) +{ + uintN byteIndex = (uintN)(c / 8); + JS_ASSERT(c <= cs->length); + cs->u.bits[byteIndex] |= 1 << (c & 0x7); +} + + +/* Add a character range, c1 to c2 (inclusive) to the RECharSet */ +static void +addCharacterRangeToCharSet(RECharSet *cs, jschar c1, jschar c2) +{ + uintN i; + + uintN byteIndex1 = (uintN)(c1 / 8); + uintN byteIndex2 = (uintN)(c2 / 8); + + JS_ASSERT((c2 <= cs->length) && (c1 <= c2)); + + c1 &= 0x7; + c2 &= 0x7; + + if (byteIndex1 == byteIndex2) + cs->u.bits[byteIndex1] |= ((uint8)(0xFF) >> (7 - (c2 - c1))) << c1; + else { + cs->u.bits[byteIndex1] |= 0xFF << c1; + for (i = byteIndex1 + 1; i < byteIndex2; i++) + cs->u.bits[i] = 0xFF; + cs->u.bits[byteIndex2] |= (uint8)(0xFF) >> (7 - c2); + } +} + +/* Compile the source of the class into a RECharSet */ +static JSBool +processCharSet(REGlobalData *gData, RECharSet *charSet) +{ + const jschar *src = JSSTRING_CHARS(gData->regexp->source) + + charSet->u.src.startIndex; + const jschar *end = src + charSet->u.src.length; + + jschar rangeStart, thisCh; + uintN byteLength; + jschar c; + uintN n; + intN nDigits; + intN i; + JSBool inRange = JS_FALSE; + + JS_ASSERT(!charSet->converted); + charSet->converted = JS_TRUE; + + byteLength = (charSet->length / 8) + 1; + 
charSet->u.bits = (uint8 *)malloc(byteLength); + if (!charSet->u.bits) + return JS_FALSE; + memset(charSet->u.bits, 0, byteLength); + + if (src == end) + return JS_TRUE; + + if (*src == '^') { + JS_ASSERT(charSet->sense == JS_FALSE); + ++src; + } + else + JS_ASSERT(charSet->sense == JS_TRUE); + + + while (src != end) { + switch (*src) { + case '\\': + ++src; + c = *src++; + switch (c) { + case 'b': + thisCh = 0x8; + break; + case 'f': + thisCh = 0xC; + break; + case 'n': + thisCh = 0xA; + break; + case 'r': + thisCh = 0xD; + break; + case 't': + thisCh = 0x9; + break; + case 'v': + thisCh = 0xB; + break; + case 'c': + if (((src + 1) < end) && JS_ISWORD(src[1])) + thisCh = (jschar)(*src++ & 0x1F); + else { + --src; + thisCh = '\\'; + } + break; + case 'x': + nDigits = 2; + goto lexHex; + case 'u': + nDigits = 4; +lexHex: + n = 0; + for (i = 0; (i < nDigits) && (src < end); i++) { + uintN digit; + c = *src++; + if (!isASCIIHexDigit(c, &digit)) { + /* + * Back off to accepting the original '\' + * as a literal + */ + src -= (i + 1); + n = '\\'; + break; + } + n = (n << 4) | digit; + } + thisCh = (jschar)(n); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + /* + * This is a non-ECMA extension - decimal escapes (in this + * case, octal!) are supposed to be an error inside class + * ranges, but supported here for backwards compatibility. 
+ * + */ + n = JS7_UNDEC(c); + c = *src; + if ('0' <= c && c <= '7') { + src++; + n = 8 * n + JS7_UNDEC(c); + c = *src; + if ('0' <= c && c <= '7') { + src++; + i = 8 * n + JS7_UNDEC(c); + if (i <= 0377) + n = i; + else + src--; + } + } + thisCh = (jschar)(n); + break; + + case 'd': + addCharacterRangeToCharSet(charSet, '0', '9'); + continue; /* don't need range processing */ + case 'D': + addCharacterRangeToCharSet(charSet, 0, '0' - 1); + addCharacterRangeToCharSet(charSet, (jschar)('9' + 1), + (jschar)(charSet->length)); + continue; + case 's': + for (i = (intN)(charSet->length); i >= 0; i--) + if (JS_ISSPACE(i)) + addCharacterToCharSet(charSet, (jschar)(i)); + continue; + case 'S': + for (i = (intN)(charSet->length); i >= 0; i--) + if (!JS_ISSPACE(i)) + addCharacterToCharSet(charSet, (jschar)(i)); + continue; + case 'w': + for (i = (intN)(charSet->length); i >= 0; i--) + if (JS_ISWORD(i)) + addCharacterToCharSet(charSet, (jschar)(i)); + continue; + case 'W': + for (i = (intN)(charSet->length); i >= 0; i--) + if (!JS_ISWORD(i)) + addCharacterToCharSet(charSet, (jschar)(i)); + continue; + default: + thisCh = c; + break; + + } + break; + + default: + thisCh = *src++; + break; + + } + if (inRange) { + if (gData->regexp->flags & JSREG_FOLD) { + jschar minch = (jschar)65535; + jschar maxch = 0; + /* + + yuk + + */ + if (rangeStart < minch) minch = rangeStart; + if (thisCh < minch) minch = thisCh; + if (canonicalize(rangeStart) < minch) + minch = canonicalize(rangeStart); + if (canonicalize(thisCh) < minch) minch = canonicalize(thisCh); + + if (rangeStart > maxch) maxch = rangeStart; + if (thisCh > maxch) maxch = thisCh; + if (canonicalize(rangeStart) > maxch) + maxch = canonicalize(rangeStart); + if (canonicalize(thisCh) > maxch) maxch = canonicalize(thisCh); + addCharacterRangeToCharSet(charSet, minch, maxch); + } + else + addCharacterRangeToCharSet(charSet, rangeStart, thisCh); + inRange = JS_FALSE; + } + else { + if (gData->regexp->flags & JSREG_FOLD) + 
addCharacterToCharSet(charSet, canonicalize(thisCh)); + addCharacterToCharSet(charSet, thisCh); + if (src < (end - 1)) { + if (*src == '-') { + ++src; + inRange = JS_TRUE; + rangeStart = thisCh; + } + } + } + } + return JS_TRUE; +} + +void +js_DestroyRegExp(JSRegExp *re) +{ + uintN i; + if (re->classList) { + for (i = 0; i < re->classCount; i++) { + if (re->classList[i].converted) + free(re->classList[i].u.bits); + re->classList[i].u.bits = NULL; + } + free(re->classList); + } +} + +static JSBool +reallocStateStack(REGlobalData *gData) +{ + size_t sz = sizeof(REProgState) * gData->maxStateStack; + gData->maxStateStack <<= 1; + gData->stateStack + = (REProgState *)realloc(gData->stateStack, sz + sz); + if (!gData->stateStack) { + gData->ok = JS_FALSE; + return JS_FALSE; + } + return JS_TRUE; +} + +/* +* Apply the current op against the given input to see if +* it's going to match or fail. Return false if we don't +* get a match, true if we do and update the state of the +* input and pc if the update flag is true. 
+*/ +static REMatchState *simpleMatch(REGlobalData *gData, REMatchState *x, + REOp op, jsbytecode **startpc, JSBool update) +{ + REMatchState *result = NULL; + jschar matchCh; + intN parenIndex; + intN offset, length, index; + jsbytecode *pc = *startpc; /* pc has already been incremented past op */ + const jschar *source; + const jschar *startcp = x->cp; + jschar ch; + RECharSet *charSet; + + + switch (op) { + default: + JS_ASSERT(JS_FALSE); + case REOP_BOL: + if (x->cp != gData->cpbegin) { + if (gData->globalMultiline || + (gData->regexp->flags & JSREG_MULTILINE)) { + if (!RE_IS_LINE_TERM(x->cp[-1])) + break; + } + else + break; + } + result = x; + break; + case REOP_EOL: + if (x->cp != gData->cpend) { + if (gData->globalMultiline || + (gData->regexp->flags & JSREG_MULTILINE)) { + if (!RE_IS_LINE_TERM(*x->cp)) + break; + } + else + break; + } + result = x; + break; + case REOP_WBDRY: + if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) + ^ !((x->cp != gData->cpend) && JS_ISWORD(*x->cp))) + result = x; + break; + case REOP_WNONBDRY: + if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) + ^ ((x->cp != gData->cpend) && JS_ISWORD(*x->cp))) + result = x; + break; + case REOP_DOT: + if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_DIGIT: + if (x->cp != gData->cpend && JS_ISDIGIT(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_NONDIGIT: + if (x->cp != gData->cpend && !JS_ISDIGIT(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_ALNUM: + if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_NONALNUM: + if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_SPACE: + if (x->cp != gData->cpend && JS_ISSPACE(*x->cp)) { + result = x; + result->cp++; + } + break; + case REOP_NONSPACE: + if (x->cp != gData->cpend && !JS_ISSPACE(*x->cp)) { + result = x; + result->cp++; + } + 
break; + case REOP_BACKREF: + parenIndex = GET_ARG(pc); + pc += ARG_LEN; + result = backrefMatcher(gData, x, parenIndex); + break; + case REOP_FLAT: + offset = GET_ARG(pc); + pc += ARG_LEN; + length = GET_ARG(pc); + pc += ARG_LEN; + source = JSSTRING_CHARS(gData->regexp->source) + offset; + if ((x->cp + length) <= gData->cpend) { + for (index = 0; index < length; index++) { + if (source[index] != x->cp[index]) + return NULL; + } + x->cp += length; + result = x; + } + break; + case REOP_FLAT1: + matchCh = *pc++; + if ((x->cp != gData->cpend) && (*x->cp == matchCh)) { + result = x; + result->cp++; + } + break; + case REOP_FLATi: + offset = GET_ARG(pc); + pc += ARG_LEN; + length = GET_ARG(pc); + pc += ARG_LEN; + source = JSSTRING_CHARS(gData->regexp->source); + result = flatNIMatcher(gData, x, source + offset, length); + break; + case REOP_FLAT1i: + matchCh = *pc++; + if ((x->cp != gData->cpend) + && (canonicalize(*x->cp) == canonicalize(matchCh))) { + result = x; + result->cp++; + } + break; + case REOP_UCFLAT1: + matchCh = GET_ARG(pc); + pc += ARG_LEN; + if ((x->cp != gData->cpend) && (*x->cp == matchCh)) { + result = x; + result->cp++; + } + break; + case REOP_UCFLAT1i: + matchCh = GET_ARG(pc); + pc += ARG_LEN; + if ((x->cp != gData->cpend) + && (canonicalize(*x->cp) == canonicalize(matchCh))) { + result = x; + result->cp++; + } + break; + case REOP_CLASS: + index = GET_ARG(pc); + pc += ARG_LEN; + if (x->cp != gData->cpend) { + charSet = &gData->regexp->classList[index]; + JS_ASSERT(charSet->converted); + ch = *x->cp; + index = ch / 8; + if ((charSet->length != 0) && + ( (ch <= charSet->length) + && ((charSet->u.bits[index] & (1 << (ch & 0x7))) != 0) )) { + result = x; + result->cp++; + } + } + break; + case REOP_NCLASS: + index = GET_ARG(pc); + pc += ARG_LEN; + if (x->cp != gData->cpend) { + charSet = &gData->regexp->classList[index]; + JS_ASSERT(charSet->converted); + ch = *x->cp; + index = ch / 8; + if ((charSet->length == 0) || + ( (ch > charSet->length) + || 
((charSet->u.bits[index] & (1 << (ch & 0x7))) == 0) )) { + result = x; + result->cp++; + } + } + break; + } + if (result != NULL) { + if (update) + *startpc = pc; + else + x->cp = startcp; + return result; + } + x->cp = startcp; + return NULL; +} + +static REMatchState * +executeREBytecode(REGlobalData *gData, REMatchState *x) +{ + REMatchState *result; + REBackTrackData *backTrackData; + intN offset; + jsbytecode *nextpc; + REOp nextop; + RECapture *cap; + REProgState *curState; + const jschar *startcp; + uintN parenIndex, k; + uintN parenSoFar = 0; + + jschar matchCh1, matchCh2; + RECharSet *charSet; + + JSBool anchor; + jsbytecode *pc = gData->regexp->program; + REOp op = (REOp)(*pc++); + + /* + * If the first node is a simple match, step the index into + * the string until that match is made, or fail if it can't be + * found at all. + */ + if (REOP_IS_SIMPLE(op)) { + anchor = JS_FALSE; + while (x->cp <= gData->cpend) { + nextpc = pc; /* reset back to start each time */ + result = simpleMatch(gData, x, op, &nextpc, JS_TRUE); + if (result) { + anchor = JS_TRUE; + x = result; + pc = nextpc; /* accept skip to next opcode */ + op = (REOp)(*pc++); + break; + } + else { + gData->skipped++; + x->cp++; + } + } + if (!anchor) + return NULL; + } + + while (JS_TRUE) { + if (REOP_IS_SIMPLE(op)) + result = simpleMatch(gData, x, op, &pc, JS_TRUE); + else { + curState = &gData->stateStack[gData->stateStackTop]; + switch (op) { + case REOP_EMPTY: + result = x; + break; + + case REOP_ALTPREREQ2: + nextpc = pc + GET_OFFSET(pc); /* start of next op */ + pc += ARG_LEN; + matchCh2 = GET_ARG(pc); + pc += ARG_LEN; + k = GET_ARG(pc); + pc += ARG_LEN; + + if (x->cp != gData->cpend) { + charSet = &gData->regexp->classList[k]; + if (!charSet->converted) + if (!processCharSet(gData, charSet)) + return JS_FALSE; + matchCh1 = *x->cp; + k = matchCh1 / 8; + if ((charSet->length != 0) && + ( (matchCh1 <= charSet->length) + && ((charSet->u.bits[k] + & (1 << (matchCh1 & 0x7))) != 0) )) { + result 
= NULL; + break; + } + } + else { + result = NULL; + break; + } + + if ((x->cp == gData->cpend) || (*x->cp != matchCh2)) { + result = NULL; + break; + } + goto doAlt; + + case REOP_ALTPREREQ: + nextpc = pc + GET_OFFSET(pc); /* start of next op */ + pc += ARG_LEN; + matchCh1 = GET_ARG(pc); + pc += ARG_LEN; + matchCh2 = GET_ARG(pc); + pc += ARG_LEN; + if ((x->cp == gData->cpend) + || ((*x->cp != matchCh1) && (*x->cp != matchCh2))) { + result = NULL; + break; + } + /* else false thru... */ + + case REOP_ALT: +doAlt: + nextpc = pc + GET_OFFSET(pc); /* start of next alternate */ + pc += ARG_LEN; /* start of this alternate */ + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + op = (REOp)(*pc++); + startcp = x->cp; + if (REOP_IS_SIMPLE(op)) { + if (!simpleMatch(gData, x, op, &pc, JS_TRUE)) { + op = (REOp)(*nextpc++); + pc = nextpc; + continue; + } + else { /* accept the match and move on */ + result = x; + op = (REOp)(*pc++); + } + } + nextop = (REOp)(*nextpc++); + if (!pushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0)) + return NULL; + continue; + + /* + * Occurs at (succesful) end of REOP_ALT, + */ + case REOP_JUMP: + --gData->stateStackTop; + offset = GET_OFFSET(pc); + pc += offset; + op = (REOp)(*pc++); + continue; + + /* + * Occurs at last (succesful) end of REOP_ALT, + */ + case REOP_ENDALT: + --gData->stateStackTop; + op = (REOp)(*pc++); + continue; + + case REOP_LPAREN: + parenIndex = GET_ARG(pc); + if ((parenIndex + 1) > parenSoFar) + parenSoFar = parenIndex + 1; + pc += ARG_LEN; + x->parens[parenIndex].index = x->cp - gData->cpbegin; + x->parens[parenIndex].length = 0; + op = (REOp)(*pc++); + continue; + case REOP_RPAREN: + parenIndex = GET_ARG(pc); + pc += ARG_LEN; + cap = &x->parens[parenIndex]; + cap->length = x->cp - (gData->cpbegin + cap->index); + op = (REOp)(*pc++); + continue; + + case REOP_ASSERT: + nextpc = pc + GET_OFFSET(pc); /* 
start of term after ASSERT */ + pc += ARG_LEN; /* start of ASSERT child */ + op = (REOp)(*pc++); + if (REOP_IS_SIMPLE(op) + && !simpleMatch(gData, x, op, &pc, JS_FALSE)) { + result = NULL; + break; + } + else { + curState->u.assertion.top + = (char *)gData->backTrackSP + - (char *)gData->backTrackStack; + curState->u.assertion.sz = gData->cursz; + curState->index = x->cp - gData->cpbegin; + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (!pushBackTrackState(gData, REOP_ASSERTTEST, + nextpc, x, x->cp, 0, 0)) + return NULL; + } + continue; + case REOP_ASSERT_NOT: + nextpc = pc + GET_OFFSET(pc); + pc += ARG_LEN; + op = (REOp)(*pc++); + if (REOP_IS_SIMPLE(op) + /* Note - fail to fail! */ + && simpleMatch(gData, x, op, &pc, JS_FALSE)) { + result = NULL; + break; + } + else { + curState->u.assertion.top + = (char *)gData->backTrackSP + - (char *)gData->backTrackStack; + curState->u.assertion.sz = gData->cursz; + curState->index = x->cp - gData->cpbegin; + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (!pushBackTrackState(gData, REOP_ASSERTNOTTEST, + nextpc, x, x->cp, 0, 0)) + return NULL; + } + continue; + case REOP_ASSERTTEST: + --gData->stateStackTop; + --curState; + x->cp = gData->cpbegin + curState->index; + gData->backTrackSP + = (REBackTrackData *)((char *)gData->backTrackStack + + curState->u.assertion.top); + gData->cursz = curState->u.assertion.sz; + if (result != NULL) + result = x; + break; + case REOP_ASSERTNOTTEST: + --gData->stateStackTop; + --curState; + x->cp = gData->cpbegin + curState->index; + gData->backTrackSP + = (REBackTrackData *)((char *)gData->backTrackStack + + curState->u.assertion.top); + gData->cursz = curState->u.assertion.sz; + if (result == NULL) + result = x; + else + result = NULL; + break; + + case 
REOP_END: + if (x != NULL) + return x; + break; + + case REOP_STAR: + curState->u.quantifier.min = 0; + curState->u.quantifier.max = -1; + goto quantcommon; + case REOP_PLUS: + curState->u.quantifier.min = 1; + curState->u.quantifier.max = -1; + goto quantcommon; + case REOP_OPT: + curState->u.quantifier.min = 0; + curState->u.quantifier.max = 1; + goto quantcommon; + case REOP_QUANT: + curState->u.quantifier.min = GET_ARG(pc); + pc += ARG_LEN; + curState->u.quantifier.max = GET_ARG(pc); + pc += ARG_LEN; +quantcommon: + if (curState->u.quantifier.max == 0) { + pc = pc + GET_OFFSET(pc); + op = (REOp)(*pc++); + result = x; + continue; + } + /* Step over */ + nextpc = pc + ARG_LEN; + op = (REOp)(*nextpc++); + startcp = x->cp; + if (REOP_IS_SIMPLE(op)) { + if (!simpleMatch(gData, x, op, &nextpc, JS_TRUE)) { + if (curState->u.quantifier.min == 0) + result = x; + else + result = NULL; + pc = pc + GET_OFFSET(pc); + break; + } + else { + op = (REOp)(*nextpc++); + result = x; + } + } + curState->index = startcp - gData->cpbegin; + curState->continue_op = REOP_REPEAT; + curState->continue_pc = pc; + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (curState->u.quantifier.min == 0) + if (!pushBackTrackState(gData, REOP_REPEAT, + pc, x, startcp, 0, 0)) + return NULL; + pc = nextpc; + continue; + + case REOP_ENDCHILD: /* marks the end of a quantifier child */ + pc = curState[-1].continue_pc; + op = curState[-1].continue_op; + continue; + + case REOP_REPEAT: + --curState; +repeatAgain: + --gData->stateStackTop; + if (result == NULL) { + /* + * There's been a failure, see if we have enough children. 
+ */ + if (curState->u.quantifier.min == 0) { + result = x; + goto repeatDone; + } + break; + } + else { + if ((curState->u.quantifier.min == 0) + && (x->cp == gData->cpbegin + curState->index)) { + /* matched an empty string, that'll get us nowhere */ + result = NULL; + break; + } + if (curState->u.quantifier.min != 0) + curState->u.quantifier.min--; + if (curState->u.quantifier.max != (uint16)(-1)) + curState->u.quantifier.max--; + if (curState->u.quantifier.max == 0) { + result = x; + goto repeatDone; + } + nextpc = pc + ARG_LEN; + nextop = (REOp)(*nextpc); + startcp = x->cp; + if (REOP_IS_SIMPLE(nextop)) { + nextpc++; + if (!simpleMatch(gData, x, nextop, &nextpc, JS_TRUE)) { + if (curState->u.quantifier.min == 0) { + result = x; + goto repeatDone; + } + else + result = NULL; + break; + } + result = x; + } + curState->index = startcp - gData->cpbegin; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (curState->u.quantifier.min == 0) + if (!pushBackTrackState(gData, REOP_REPEAT, + pc, x, startcp, + curState->parenSoFar, + parenSoFar + - curState->parenSoFar)) + return NULL; + if (*nextpc == REOP_ENDCHILD) + goto repeatAgain; + pc = nextpc; + op = (REOp)(*pc++); + parenSoFar = curState->parenSoFar; + } + continue; +repeatDone: + pc = pc + GET_OFFSET(pc); + break; + + + case REOP_MINIMALSTAR: + curState->u.quantifier.min = 0; + curState->u.quantifier.max = -1; + goto minimalquantcommon; + case REOP_MINIMALPLUS: + curState->u.quantifier.min = 1; + curState->u.quantifier.max = -1; + goto minimalquantcommon; + case REOP_MINIMALOPT: + curState->u.quantifier.min = 0; + curState->u.quantifier.max = 1; + goto minimalquantcommon; + case REOP_MINIMALQUANT: + curState->u.quantifier.min = GET_ARG(pc); + pc += ARG_LEN; + curState->u.quantifier.max = GET_ARG(pc); + pc += ARG_LEN; +minimalquantcommon: + curState->index = x->cp - gData->cpbegin; + curState->parenSoFar = parenSoFar; + 
++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (curState->u.quantifier.min != 0) { + curState->continue_op = REOP_MINIMALREPEAT; + curState->continue_pc = pc; + /* step over */ + pc += ARG_LEN; + op = (REOp)(*pc++); + } + else { + if (!pushBackTrackState(gData, REOP_MINIMALREPEAT, + pc, x, x->cp, 0, 0)) + return NULL; + --gData->stateStackTop; + pc = pc + GET_OFFSET(pc); + op = (REOp)(*pc++); + } + continue; + + case REOP_MINIMALREPEAT: + --gData->stateStackTop; + --curState; + + if (result == NULL) { + /* + * Non-greedy failure - try to consume another child. + */ + if ((curState->u.quantifier.max == (uint16)(-1)) + || (curState->u.quantifier.max > 0)) { + curState->index = x->cp - gData->cpbegin; + curState->continue_op = REOP_MINIMALREPEAT; + curState->continue_pc = pc; + pc += ARG_LEN; + for (k = curState->parenSoFar; k < parenSoFar; k++) + x->parens[k].index = -1; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + op = (REOp)(*pc++); + continue; + } + else { + /* Don't need to adjust pc since we're going to pop. */ + break; + } + } + else { + if ((curState->u.quantifier.min == 0) + && (x->cp == gData->cpbegin + curState->index)) { + /* Matched an empty string, that'll get us nowhere. 
*/ + result = NULL; + break; + } + if (curState->u.quantifier.min != 0) + curState->u.quantifier.min--; + if (curState->u.quantifier.max != (uint16)(-1)) + curState->u.quantifier.max--; + if (curState->u.quantifier.min != 0) { + curState->continue_op = REOP_MINIMALREPEAT; + curState->continue_pc = pc; + pc += ARG_LEN; + for (k = curState->parenSoFar; k < parenSoFar; k++) + x->parens[k].index = -1; + curState->index = x->cp - gData->cpbegin; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + op = (REOp)(*pc++); + continue; + } + else { + curState->index = x->cp - gData->cpbegin; + curState->parenSoFar = parenSoFar; + ++gData->stateStackTop; + if (gData->stateStackTop == gData->maxStateStack) + if (!reallocStateStack(gData)) + return NULL; + if (!pushBackTrackState(gData, REOP_MINIMALREPEAT, + pc, x, x->cp, + curState->parenSoFar, + parenSoFar + - curState->parenSoFar)) + return NULL; + --gData->stateStackTop; + pc = pc + GET_OFFSET(pc); + op = (REOp)(*pc++); + continue; + } + } + + default: + JS_ASSERT(JS_FALSE); + + } + } + /* + * If the match failed and there's a backtrack option, take it. + * Otherwise this is a complete and utter failure. 
+ */ + if (result == NULL) { + if (gData->cursz > 0) { + backTrackData = gData->backTrackSP; + gData->cursz = backTrackData->sz; + gData->backTrackSP + = (REBackTrackData *)((char *)backTrackData + - backTrackData->sz); + x->cp = backTrackData->cp; + pc = backTrackData->backtrack_pc; + op = backTrackData->backtrack_op; + gData->stateStackTop = backTrackData->precedingStateTop; + JS_ASSERT(gData->stateStackTop); + + memcpy(gData->stateStack, backTrackData + 1, + sizeof(REProgState) * backTrackData->precedingStateTop); + curState = &gData->stateStack[gData->stateStackTop - 1]; + + if (backTrackData->parenCount) { + memcpy(&x->parens[backTrackData->parenIndex], + (char *)(backTrackData + 1) + sizeof(REProgState) * backTrackData->precedingStateTop, + sizeof(RECapture) * backTrackData->parenCount); + parenSoFar = backTrackData->parenIndex + backTrackData->parenCount; + } + else { + for (k = curState->parenSoFar; k < parenSoFar; k++) + x->parens[k].index = -1; + parenSoFar = curState->parenSoFar; + } + continue; + } + else + return NULL; + } + else + x = result; + + /* + * Continue with the expression. + */ + op = (REOp)*pc++; + } + return NULL; +} + +static REMatchState * +MatchRegExp(REGlobalData *gData, REMatchState *x) +{ + REMatchState *result; + const jschar *cp = x->cp; + const jschar *cp2; + uintN j; + + /* + * Have to include the position beyond the last character + * in order to detect end-of-input/line condition. 
+     */
+    /* Attempt a match at every start position from cp through cpend inclusive. */
+    for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
+        gData->skipped = cp2 - cp;
+        x->cp = cp2;
+        /* Every capture starts out unset for this attempt. */
+        for (j = 0; j < gData->regexp->parenCount; j++)
+            x->parens[j].index = -1;
+        result = executeREBytecode(gData, x);
+        /* Finished on a match, or as soon as gData->ok has been cleared. */
+        if (!gData->ok || result)
+            return result;
+        /* Drop leftover backtrack and program state before the next attempt. */
+        gData->backTrackSP = gData->backTrackStack;
+        gData->cursz = 0;
+        gData->stateStackTop = 0;
+        /*
+         * NOTE(review): resynchronises cp2 with gData->skipped, which
+         * executeREBytecode presumably may advance - confirm.
+         */
+        cp2 = cp + gData->skipped;
+    }
+    return NULL;
+}
+
+
+/*
+ * Set up the working storage for one match run against re:
+ *   - the backtrack stack (INITIAL_BACKTRACK bytes),
+ *   - the program-state stack (INITIAL_STATESTACK entries),
+ *   - the REMatchState, sized from re->parenCount.
+ * Character classes of re that are not yet marked converted are then passed
+ * through processCharSet.
+ *
+ * Returns the new REMatchState; the caller then owns it together with
+ * gData->backTrackStack and gData->stateStack.  Returns NULL when an
+ * allocation or processCharSet fails; in that case everything allocated here
+ * has been released again and both gData stack pointers are NULL.
+ */
+static REMatchState *
+initMatch(REGlobalData *gData, JSRegExp *re)
+{
+    REMatchState *result = NULL;
+    uintN i;
+
+    /* Start from NULL so the failure path can free unconditionally. */
+    gData->backTrackStack = NULL;
+    gData->stateStack = NULL;
+
+    gData->maxBackTrack = INITIAL_BACKTRACK;
+    gData->backTrackStack = (REBackTrackData *)malloc(INITIAL_BACKTRACK);
+    if (!gData->backTrackStack)
+        goto bad;
+    gData->backTrackSP = gData->backTrackStack;
+    gData->cursz = 0;
+
+    gData->maxStateStack = INITIAL_STATESTACK;
+    gData->stateStack = (REProgState *)malloc(sizeof(REProgState) * INITIAL_STATESTACK);
+    if (!gData->stateStack)
+        goto bad;
+    gData->stateStackTop = 0;
+
+    gData->regexp = re;
+    gData->ok = JS_TRUE;
+
+    /*
+     * NOTE(review): the "- 1" presumably accounts for one RECapture already
+     * embedded in REMatchState; confirm this is intended when parenCount is 0.
+     */
+    result = (REMatchState *)malloc(sizeof(REMatchState) + (re->parenCount - 1) * sizeof(RECapture));
+    if (!result)
+        goto bad;
+
+    for (i = 0; i < re->classCount; i++)
+        if (!re->classList[i].converted)
+            if (!processCharSet(gData, &re->classList[i]))
+                goto bad;
+
+    return result;
+
+bad:
+    /* Callers give up as soon as they see NULL, so nothing else can free these. */
+    free(result);
+    free(gData->stateStack);
+    free(gData->backTrackStack);
+    gData->stateStack = NULL;
+    gData->backTrackStack = NULL;
+    return NULL;
+}
+
+/*
+ * Call the recursive matcher to do the real work.  Return null on mismatch.
+ * On match, return the completed MatchResult structure.
+ */
+/*
+ * Searches str[0..length) for re, starting at offset index (clamped to
+ * length) and trying later start positions through MatchRegExp.
+ * globalMultiline is copied into the match data; meta is not used here.
+ *
+ * The result is allocated with malloc and belongs to the caller.  NULL is
+ * returned on mismatch, when the matcher clears gData.ok, or when an
+ * allocation fails.  The working storage created by initMatch is released
+ * on every path.
+ */
+REMatchResult *REExecute(JS2Metadata *meta, JSRegExp *re, const jschar *str, uint32 index, uint32 length, bool globalMultiline)
+{
+    REGlobalData gData;
+    REMatchState *x, *result;
+    REMatchResult *returnValue = NULL;
+    const jschar *cp;
+    uint32 start;
+
+    start = index;
+    if (start > length)
+        start = length;
+    cp = JSSTRING_CHARS(str);
+    gData.cpbegin = cp;
+    gData.cpend = cp + length;
+    cp += start;
+    gData.start = start;
+    gData.skipped = 0;
+
+    /* Lets the cleanup below run safely even if initMatch stops part-way. */
+    gData.backTrackStack = NULL;
+    gData.stateStack = NULL;
+
+    x = initMatch(&gData, re);
+    gData.globalMultiline = globalMultiline;
+    if (!x)
+        goto out;
+    x->cp = cp;
+
+    result = MatchRegExp(&gData, x);
+    if (!gData.ok || !result)
+        goto out;
+
+    returnValue = (REMatchResult *)malloc(sizeof(REMatchResult) + (re->parenCount - 1) * sizeof(RECapture));
+    if (!returnValue)
+        goto out;
+    returnValue->startIndex = gData.start + gData.skipped;
+    returnValue->endIndex = result->cp - str;
+    returnValue->parenCount = re->parenCount;
+    for (uint32 p = 0; p < re->parenCount; p++) {
+        returnValue->parens[p] = result->parens[p];
+    }
+
+out:
+    /* The captures were copied by value above, so the match state can go. */
+    free(x);
+    free(gData.stateStack);
+    free(gData.backTrackStack);
+    return returnValue;
+}
+
+/*
+ * Runs re once against str[0..length), starting at the first character:
+ * executeREBytecode is called a single time, with no scan over later start
+ * positions.  gData.globalMultiline is forced to false; meta is not used here.
+ *
+ * The result is allocated with malloc and belongs to the caller.  NULL is
+ * returned on mismatch, when the matcher clears gData.ok, or when an
+ * allocation fails.  The working storage created by initMatch is released
+ * on every path.
+ */
+REMatchResult *REMatch(JS2Metadata *meta, JSRegExp *re, const jschar *str, uint32 length)
+{
+    REGlobalData gData;
+    REMatchState *x, *result;
+    REMatchResult *returnValue = NULL;
+    const jschar *cp;
+    uint32 j;
+
+    cp = JSSTRING_CHARS(str);
+    gData.cpbegin = cp;
+    gData.cpend = cp + length;
+    gData.start = 0;
+    gData.skipped = 0;
+
+    /* Lets the cleanup below run safely even if initMatch stops part-way. */
+    gData.backTrackStack = NULL;
+    gData.stateStack = NULL;
+
+    x = initMatch(&gData, re);
+    gData.globalMultiline = false;
+    if (!x)
+        goto out;
+    x->cp = cp;
+
+    /* Every capture starts out unset. */
+    for (j = 0; j < re->parenCount; j++)
+        x->parens[j].index = -1;
+    result = executeREBytecode(&gData, x);
+    if (!gData.ok || !result)
+        goto out;
+
+    returnValue = (REMatchResult *)malloc(sizeof(REMatchResult) + (re->parenCount - 1) * sizeof(RECapture));
+    if (!returnValue)
+        goto out;
+    returnValue->startIndex = gData.skipped;
+    returnValue->endIndex = result->cp - str;
+    returnValue->parenCount = re->parenCount;
+    for (uint32 p = 0; p < re->parenCount; p++) {
+        returnValue->parens[p] = result->parens[p];
+    }
+
+out:
+    /* The captures were copied by value above, so the match state can go. */
+    free(x);
+    free(gData.stateStack);
+    free(gData.backTrackStack);
+    return returnValue;
+}
+
+// Compile
the flag source and build a flag bit set. Return true/false for success/failure
+bool parseFlags(JS2Metadata *meta, const jschar *flagStr, uint32 length, uint32 *flags)
+{
+    // meta is not used here.  *flags is cleared first; when an unknown
+    // character is met, false is returned and *flags keeps the bits set
+    // for the characters that came before it.
+    uint32 i;
+    *flags = 0;
+    for (i = 0; i < length; i++) {
+        switch (flagStr[i]) {
+        case 'g':
+            *flags |= JSREG_GLOB; break;
+        case 'i':
+            *flags |= JSREG_FOLD; break;
+        case 'm':
+            *flags |= JSREG_MULTILINE; break;
+        default:
+            return false;
+        }
+    }
+    return true;
+}
+
+// JS_HOWMANY: number of y-sized units needed to hold x (rounds up).
+// JS_ROUNDUP: x rounded up to the next multiple of y.
+#define JS_HOWMANY(x,y) (((x)+(y)-1)/(y))
+#define JS_ROUNDUP(x,y) (JS_HOWMANY(x,y)*(y))
+
+// Compile the source re, return NULL for failure (error functions called)
+//
+// str/length is the pattern text and flags the already-parsed flag bits.
+// On success the result is allocated with malloc and records parenCount,
+// flags and the str pointer itself (not a copy), so str has to stay valid
+// for as long as the result is in use.  On any failure everything allocated
+// here is released and NULL is returned.
+JSRegExp *RECompile(JS2Metadata *meta, const jschar *str, uint32 length, uint32 flags)
+{
+    JSRegExp *re;
+    CompilerState state;
+    size_t resize;
+    jsbytecode *endPC;
+    uint32 i;
+
+    re = NULL;
+    state.meta = meta;
+    state.reNodePool = new Pool(32);
+    state.strict = false;
+    state.cpbegin = state.cp = JSSTRING_CHARS(str);
+    state.cpend = state.cp + length;
+    state.flags = flags;
+    state.parenCount = 0;
+    state.classCount = 0;
+    state.progLength = 0;
+    state.treeDepth = 0;
+    for (i = 0; i < CLASS_CACHE_SIZE; i++)
+        state.classCache[i].start = NULL;
+
+    if (!parseRegExp(&state))
+        goto out;
+
+    // Header plus program bytes plus one byte for the trailing REOP_END,
+    // rounded up to a multiple of sizeof(uint32).
+    resize = sizeof *re + state.progLength + 1;
+    re = (JSRegExp *) malloc(JS_ROUNDUP(resize, sizeof(uint32)));
+    if (!re)
+        goto out;
+
+    re->classCount = state.classCount;
+    if (state.classCount) {
+        re->classList = (RECharSet *)malloc(sizeof(RECharSet) * state.classCount);
+        if (!re->classList) {
+            // Never hand back a regexp that has classes but no class list.
+            free(re);
+            re = NULL;
+            goto out;
+        }
+    }
+    else
+        re->classList = NULL;
+    endPC = emitREBytecode(&state, re, state.treeDepth, re->program, state.result);
+    if (!endPC) {
+        // Release the partly built regexp instead of just dropping the pointer.
+        free(re->classList);
+        free(re);
+        re = NULL;
+        goto out;
+    }
+    *endPC++ = REOP_END;
+    JS_ASSERT(endPC <= (re->program + (state.progLength + 1)));
+
+    re->parenCount = state.parenCount;
+    re->flags = flags;
+    re->source = str;
+
+out:
+    // The node pool is only needed while compiling.
+    delete state.reNodePool;
+    return re;
+}
+
+} // namespace MetaData
+} // namespace JavaScript
diff --git
a/mozilla/js2/src/strings.h b/mozilla/js2/src/strings.h index 040c8521e51..76146505ac1 100644 --- a/mozilla/js2/src/strings.h +++ b/mozilla/js2/src/strings.h @@ -240,4 +240,8 @@ namespace JavaScript const char16 *skipWhiteSpace(const char16 *str, const char16 *strEnd); } + +#define JS7_ISHEX(c) ((c) < 128 && isxdigit(c)) /* the range test runs first, so isxdigit is only consulted when c < 128 */ +#define JS7_UNHEX(c) (uintN)(isdigit(c) ? (c) - '0' : 10 + tolower(c) - 'a') /* value of hex digit c; only meaningful when c is a hex digit, e.g. after JS7_ISHEX(c) */ + #endif /* strings_h___ */