Mozilla/mozilla/js/tamarin/core/RegExpObject.cpp

/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is [Open Source Virtual Machine.].
 *
 * The Initial Developer of the Original Code is
 * Adobe System Incorporated.
 * Portions created by the Initial Developer are Copyright (C) 2004-2006
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Adobe AS3 Team
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */


#include "avmplus.h"

#include "pcre.h"

// todo figure out what to do about all the new/delete in here
// todo general clean-up

namespace avmplus
{
	using namespace MMgc;

	// UsesUTF8String is a helper class which converts a Stringp to UTF8String,
	// and frees it afterwards to reduce GC pressure.
	class UsesUTF8String
	{
	public:
		UsesUTF8String(Stringp subject)
			: m_utf8String(subject ? subject->toUTF8String() : NULL)
		{
		}
		~UsesUTF8String()
		{
			if (m_utf8String)
			{
				MMgc::GC* gc = MMgc::GC::GetGC(m_utf8String);
				gc->Free(m_utf8String);
			}
		}
		operator UTF8String* const () { return m_utf8String; }
		UTF8String* operator->() const
		{
			return m_utf8String;
		}

	private:
		UTF8String* const m_utf8String;
	};

#define OVECTOR_SIZE 99 // 32 matches = (32+1)*3

	// This variant is only used for creating the prototype
	RegExpObject::RegExpObject(RegExpClass *regExpClass,
							   ScriptObject *objectPrototype)
		: ScriptObject(regExpClass->ivtable(), objectPrototype)
	{
		AvmAssert(traits()->sizeofInstance == sizeof(RegExpObject));

		GC::SetFinalize(this);

		int			errptr;
		const char	*error;
		AvmCore		*core = this->core();
		m_optionFlags = PCRE_UTF8;
		m_hasNamedGroups = false;
		m_source = core->newString("(?:)");

		UsesUTF8String utf8Pattern(m_source);
		m_pcreInst = (void*)pcre_compile(utf8Pattern->c_str(), m_optionFlags, &error, &errptr, NULL );
	}

	RegExpObject::RegExpObject(RegExpObject *toCopy)
		: ScriptObject(toCopy->vtable, toCopy->getDelegate())
	{
		AvmAssert(traits()->sizeofInstance == sizeof(RegExpObject));

		GC::SetFinalize(this);

		m_source = toCopy->m_source;
		m_global = toCopy->m_global;
		m_lastIndex = 0;
		m_optionFlags = toCopy->m_optionFlags;
		m_hasNamedGroups = toCopy->m_hasNamedGroups;

		UsesUTF8String utf8Pattern(m_source);
		int errptr;
		const char *error;
		m_pcreInst = (void*)pcre_compile(utf8Pattern->c_str(), m_optionFlags, &error, &errptr, NULL );
	}

	RegExpObject::RegExpObject(RegExpClass *type,
							   Stringp pattern,
							   Stringp options)
	   : ScriptObject(type->ivtable(), type->prototype)
	{
		AvmAssert(traits()->sizeofInstance == sizeof(RegExpObject));
		m_source = pattern;

		GC::SetFinalize(this);

		m_lastIndex = 0;
		m_global = false;
		int errptr;
		const char *error;

		m_optionFlags = PCRE_UTF8;

		UsesUTF8String utf8Pattern(pattern);

		// Check for named groups and embedded options if optionStr is NULL. ( Needed to handle
		//  new RegExp( existingRegExpValue.toString() ) )
		const char *ptr = utf8Pattern->c_str();
		UsesUTF8String optionUTF8(options);
		const char* optionStr =   optionUTF8 ? optionUTF8->c_str() : NULL;

		m_hasNamedGroups = false;
		int numSlashSeen = 0;

		while (*ptr) {
			if (ptr[0] == '(' &&
				ptr[1] == '?' &&
				ptr[2] == 'P' &&
				ptr[3] == '<')
			{
				m_hasNamedGroups = true;
			}
			else if (optionStr == NULL && ptr[0] == '/' && (ptr == utf8Pattern->c_str() || ptr[-1] != '\\') && numSlashSeen++ > 0)
			{
				optionStr = ptr;
			}

			ptr++;
		}

		// check options
		if (optionStr)
		{
			for(; *optionStr; optionStr++)
			{
				switch(*optionStr)
				{
				case 'g':
					m_global = true;
					break;
				case 'i':
					m_optionFlags |= PCRE_CASELESS;
					break;
				case 'm':
					m_optionFlags |= PCRE_MULTILINE;
					break;
				case 's':
					m_optionFlags |= PCRE_DOTALL;
					break;
				case 'x':
					m_optionFlags |= PCRE_EXTENDED;
					break;
				}
			}
		}


		m_pcreInst = (void*)pcre_compile(utf8Pattern->c_str(), m_optionFlags, &error, &errptr, NULL );
		// FIXME: make errors available to actionscript
	}

	RegExpObject::~RegExpObject()
	{
		(pcre_free)((pcre*)m_pcreInst);
		m_global = false;
		m_lastIndex = 0;
		m_optionFlags = 0;
		m_hasNamedGroups = false;
		m_pcreInst = NULL;
	}

	// this = argv[0]
	// arg1 = argv[1]
	// argN = argv[argc]
	Atom RegExpObject::call(int argc, Atom *argv)
	{
		// this call occurs when a regexp object is invoked directly as a function ala "/a|b/('dcab')"
		AvmCore *core = this->core();
		Atom inAtom = argc ? core->string(argv[1])->atom() : core->kEmptyString->atom();
		return execSimple(core->atomToString(inAtom));
	}

    /**
     * read a string.  No resource table stuff here, caller must take care of it.
     * @param code
     * @return
     */
    Atom RegExpObject::stringFromUTF8(const char *buffer,
									  int len)
    {
        // don't need to create an atom for this now, because
        // each caller will take care of it.
		if (len == 0) {
			return core()->kEmptyString->atom();
		} else {
			return core()->newString(buffer, len)->atom();
		}
    }

	int RegExpObject::search(Stringp subject)
	{
		int matchIndex, matchLen;
		UsesUTF8String utf8Subject(subject);
		if (!exec(subject, utf8Subject, 0, matchIndex, matchLen))
		{
			matchIndex = -1;
		}
		else
		{
			matchIndex = Utf8ToUtf16Index(subject, utf8Subject, matchIndex);
		}
		return matchIndex;
	}

	int RegExpObject::Utf8ToUtf16Index(Stringp utf16String,
									   UTF8String *utf8String,
									   int utf8Index)
	{
		int utf16Length = utf16String->length();
		int utf8Length  = utf8String->length();

		// If the UTF-16 and UTF-8 strings contain the same number of
		// characters, the string is plain ASCII and no conversion is needed.
		if (utf16Length == utf8Length)
		{
			return utf8Index;
		}

		// If the UTF-8 index is out of range, do nothing.
		if (utf8Index < 0 || utf8Index > utf8Length)
		{
			return utf8Index;
		}

		return UnicodeUtils::Utf8ToUtf16((const uint8*)utf8String->c_str(),
										 utf8Index,
										 NULL,
										 0);
	}

	int RegExpObject::numBytesInUtf8Character(const uint8 *in)
	{
		unsigned int c = *in;
		switch(c>>4)
		{
			case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
			default:
				// 0xxx xxxx
				return 1;
			case 12: case 13:
				// 110xxxxx   10xxxxxx
				return 2;
			case 14:
				// 1110xxxx  10xxxxxx  10xxxxxx
				return 3;
			case 15:
				// 11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
				return 4;
		}
	}


	ArrayObject* RegExpObject::split(Stringp subject, uint32 limit)
	{
		AvmCore *core = this->core();
		ArrayObject *out = toplevel()->arrayClass->newArray();
		UsesUTF8String utf8Subject(subject);

		int startIndex=0;
		int matchIndex;
		int matchLen;
		ArrayObject* matchArray;
		unsigned n=0;
		bool isEmptyRE = m_source->length() == 0;
		while ((matchArray = exec(subject,
								  utf8Subject,
								  startIndex,
								  matchIndex,
								  matchLen)) != NULL)
		{
			// [cn 11/22/04] when match is made, but is length 0 we've matched the empty
			//  position between characters.  Although we've "matched", its zero length so just break out.
			if (matchLen == 0 ) {
				matchLen = 0;
				matchIndex = startIndex+numBytesInUtf8Character((uint8*)(utf8Subject->c_str())+startIndex); // +1char  will advance startIndex, extract just one char
				if( !isEmptyRE )
				{
					// don't break if we're processing an empty regex - then we want to split the string into each character
					// so we want the loop to continue
					break;
				}
			}

			//[ed 8/10/04] don't go past end of string. not sure why pcre doesn't return null
			//for a match starting past the end.
			//[cn 12/3/04] because a regular expression which matches an empty position (space between characters)
			//  will match the empty position just past the last character.  This test is correct, though
			//  it needs to come before we do any setProperties to avoid a bogus xtra result.
			if (matchIndex+matchLen > utf8Subject->length()) {
				startIndex = matchIndex+matchLen;
				break;
			} else {
				out->setUintProperty(n++,
									 (core->newString(utf8Subject->c_str()+startIndex,
													  matchIndex-startIndex))->atom());
				if (n >= limit)
					break;
				for (uint32 j=1; j<matchArray->getLength(); j++) {
					out->setUintProperty(n++, matchArray->getUintProperty(j));
					if (n >= limit)
						break;
				}
				// Advance past this match
				startIndex = matchIndex+matchLen;
			}
		}

		// If we found no match, or we did find a match and are still under limit, and there is a remainder left, add it
		if ((unsigned)n < limit && startIndex <= utf8Subject->length()) {
			out->setUintProperty(n++,
								 (core->newString(utf8Subject->c_str()+startIndex,
												  utf8Subject->length()-startIndex))->atom());
		}

		return out;
	}

	int RegExpObject::Utf16ToUtf8Index(Stringp utf16String,
									   UTF8String *utf8String,
									   int utf16Index)
	{
		int utf16Length = utf16String->length();
		int utf8Length  = utf8String->length();

		// If the UTF-16 and UTF-8 strings contain the same number of
		// characters, the string is plain ASCII and no conversion is needed.
		if (utf16Length == utf8Length)
		{
			return utf16Index;
		}

		// If the UTF-16 index is out of range, do nothing.
		if (utf16Index < 0 || utf16Index > utf16Length)
		{
			return utf16Index;
		}

		return UnicodeUtils::Utf16ToUtf8(utf16String->c_str(),
										 utf16Index,
										 NULL,
										 0);
	}

	Atom RegExpObject::execSimple(Stringp subject)
	{
		if (!subject)
		{
			subject = core()->knull;
		}
		UsesUTF8String utf8Subject(subject);
		ArrayObject *result = exec(subject, utf8Subject);
		return result ? result->atom() : nullStringAtom;
	}

	ArrayObject* RegExpObject::exec(Stringp subject, UTF8String *utf8Subject)
	{
		AvmAssert(subject != NULL);
		AvmAssert(utf8Subject != NULL);

		int matchIndex = 0, matchLen = 0;
		int startIndex = (isGlobal() ? Utf16ToUtf8Index(subject,
													utf8Subject,
													m_lastIndex) : 0);

		ArrayObject* result = exec(subject,
								   utf8Subject,
								   startIndex,
								   matchIndex,
								   matchLen);
		if (isGlobal())
		{
			m_lastIndex = Utf8ToUtf16Index(subject,
										 utf8Subject,
										 matchIndex+matchLen);
		}

		return result;
	}

    ArrayObject* RegExpObject::exec(Stringp subject,
									UTF8String *utf8Subject,
									int startIndex,
									int& matchIndex,
									int& matchLen)
	{
		AvmAssert(subject != NULL);
		AvmAssert(utf8Subject != NULL);

		int ovector[OVECTOR_SIZE];
		int results;
		int subjectLength = utf8Subject->length();

		if( startIndex < 0 ||
			startIndex > subjectLength ||
			(results = pcre_exec((pcre*)m_pcreInst,
								NULL,
								utf8Subject->c_str(),
								subjectLength,
								startIndex,
								0,
								ovector,
								OVECTOR_SIZE)) < 0)
		{
			matchIndex = 0;
			matchLen = 0;
			return NULL;
		}

		AvmCore *core = this->core();
		ArrayObject *a = toplevel()->arrayClass->newArray(results);

		a->setAtomProperty(toplevel()->regexpClass()->kindex,
			   core->intToAtom(Utf8ToUtf16Index(subject, utf8Subject, ovector[0])));
		a->setAtomProperty(toplevel()->regexpClass()->kinput,
			   subject->atom());
		a->setLength(results);

		// set array slots
		for (int i=0; i<results; i++) {
			if (ovector[i*2] > -1) {
				int length = ovector[i*2 + 1] - ovector[i*2];
				Atom match = stringFromUTF8(utf8Subject->c_str()+ovector[i*2], length);
				a->setUintProperty(i, match);
			} else {
				a->setUintProperty(i, undefinedAtom);
			}
		}

		// handle named groups
		if (m_hasNamedGroups)
		{
			int entrySize;
			pcre_fullinfo((pcre*)m_pcreInst, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrySize);

			int nameCount;
			pcre_fullinfo((pcre*)m_pcreInst, NULL, PCRE_INFO_NAMECOUNT, &nameCount);

			// this space is freed when (pcre*)m_pcreInst is freed
			char *nameTable;
			pcre_fullinfo((pcre*)m_pcreInst, NULL, PCRE_INFO_NAMETABLE, &nameTable);

			/* nameTable is a series of fixed length entries (entrySize)
			   the first two bytes are the index into the ovector and the result
			   is a null terminated string (the subgroup name) */
			for (int i = 0; i < nameCount; i++)
			{
				int nameIndex, length;
				nameIndex = (nameTable[0] << 8) + nameTable[1];
				length = ovector[nameIndex * 2 + 1] - ovector[ nameIndex * 2 ];

				Atom name = stringFromUTF8((char*)(nameTable+2), strlen(nameTable+2));
				name = core->internString(name)->atom();

				Atom value = stringFromUTF8(utf8Subject->c_str()+ovector[nameIndex*2], length);

				a->setAtomProperty(name, value);

				nameTable += entrySize;
			}
		}

		matchIndex = ovector[0];
		matchLen = ovector[1]-ovector[0];

		return a;
	}

	ArrayObject* RegExpObject::match(Stringp subject)
	{
		UsesUTF8String utf8Subject(subject);
		if (!isGlobal())
		{
			return exec(subject, utf8Subject);
		}
		else
		{
			ArrayObject *a = toplevel()->arrayClass->newArray();

			int oldLastIndex = m_lastIndex;
			m_lastIndex = 0;

			int n = 0;

			ArrayObject* matchArray;
			while (true)
			{
				int last = m_lastIndex;
				int matchIndex = 0, matchLen = 0;
				int startIndex = Utf16ToUtf8Index(subject, utf8Subject,	m_lastIndex);

				matchArray = exec(subject,
								  utf8Subject,
								  startIndex,
								  matchIndex,
								  matchLen);
				m_lastIndex = Utf8ToUtf16Index(subject,
											   utf8Subject,
											   matchIndex+matchLen);

				if ((matchArray == NULL) || (last == m_lastIndex))
					break;
				a->setUintProperty(n++, matchArray->getUintProperty(0));
			}

			if (m_lastIndex == oldLastIndex)
			{
				m_lastIndex++;
			}

			return a;
		}
	}

#define NUM_MATCHES 100

	Atom RegExpObject::replace(Stringp subject,
							   Stringp replacement)
	{
		UsesUTF8String utf8Subject(subject);
		UsesUTF8String utf8Replacement(replacement);

		int ovector[OVECTOR_SIZE];
		int subjectLength = utf8Subject->length();
		int lastIndex=0;

		StringBuffer resultBuffer(core());

		const char *src = utf8Subject->c_str();

		// get start/end index of all matches
		int matchCount;
		while (lastIndex <= subjectLength &&
			   (matchCount = pcre_exec((pcre*)m_pcreInst, NULL, src,
						 subjectLength, lastIndex, 0, ovector, OVECTOR_SIZE)) > 0)
		{
			int captureCount = matchCount-1;

			int matchIndex = ovector[0];
			int matchLen   = ovector[1]-ovector[0];

			// copy in stuff leading up to match
			resultBuffer.write(src+lastIndex, matchIndex-lastIndex);

			const char *ptr = utf8Replacement->c_str();
			while (*ptr) {
				if (*ptr == '$') {
					switch (*(ptr+1)) {
					case '$':
						resultBuffer << '$';
						ptr += 2;
						break;
					case '&':
						resultBuffer.write(src+matchIndex, matchLen);
						ptr += 2;
						break;
					case '`':
						resultBuffer.write(src, matchIndex);
						ptr += 2;
						break;
					case '\'':
						resultBuffer << src+ovector[1];
						ptr += 2;
						break;
					case '0':
					case '1':
					case '2':
					case '3':
					case '4':
					case '5':
					case '6':
					case '7':
					case '8':
					case '9':
						{
							int i;
							if (*(ptr+2) >= '0' && *(ptr+2) <= '9') {
								int j = 10*(ptr[1]-'0')+(ptr[2]-'0');
								if (j > captureCount) {
									// Gobbling up two digits would overflow the
									// capture count, so just use the one digit.
									i = ptr[1]-'0';
								} else {
									i = j;
								}
							} else {
								i = ptr[1]-'0';
							}
							if (i >= 1 && i <= captureCount) {
								resultBuffer.write(src+ovector[i*2],
												   ovector[i*2+1]-ovector[i*2]);
								ptr += (i >= 10) ? 3 : 2;
							} else {
								resultBuffer << *ptr++;
							}
						}
						break;
					default:
						resultBuffer << *ptr++;
						break;
					}
				} else {
					resultBuffer << *ptr++;
				}
			}

			int newLastIndex = ovector[0] + (ovector[1] - ovector[0]);

			// prevents infinite looping in certain cases
			fixReplaceLastIndex(src,
								subjectLength,
								lastIndex,
								newLastIndex,
								resultBuffer);

			lastIndex = newLastIndex;

			if (!isGlobal())
			{
				break;
			}
		}

		// copy in stuff after last match
		if (lastIndex < subjectLength)
		{
			resultBuffer.write(src+lastIndex, subjectLength-lastIndex);
		}

		return stringFromUTF8(resultBuffer.c_str(),
							  resultBuffer.length());
	}

	Atom RegExpObject::replace(Stringp subject,
							   ScriptObject* replaceFunction)
	{
		UsesUTF8String utf8Subject(subject);

		int ovector[OVECTOR_SIZE];
		int subjectLength = utf8Subject->length();
		int lastIndex=0;

		StringBuffer resultBuffer(core());

		const char *src = utf8Subject->c_str();

		// get start/end index of all matches
		int matchCount;
		while (lastIndex < subjectLength &&
			   (matchCount = pcre_exec((pcre*)m_pcreInst, NULL, src,
						 subjectLength, lastIndex, 0, ovector, OVECTOR_SIZE)) > 0)
		{
			int captureCount = matchCount-1;

			int matchIndex = ovector[0];
			int matchLen   = ovector[1]-ovector[0];

			// copy in stuff leading up to match
			resultBuffer.write(src+lastIndex, matchIndex-lastIndex);

			// call the replace function
			Atom argv[NUM_MATCHES+4];
			int argc = captureCount+3;

			argv[0] = undefinedAtom;

			// ECMA 15.5.4.11: Argument 1 is the substring that matched.
			argv[1] = core()->newString(src+matchIndex, matchLen)->atom();

			// ECMA 15.5.4.11: The next m arguments are all of the captures in the
			// MatchResult
			for (int i=1; i<=captureCount; i++)
			{
				argv[i+1] = core()->newString(src+ovector[i*2],
											  ovector[i*2+1]-ovector[i*2])->atom();
			}

			// ECMA 15.5.4.11: Argument m+2 is the offset within string
			// where the match occurred
			argv[captureCount+2] = core()->uintToAtom(matchIndex);

			// ECMA 15.5.4.11: Argument m+3 is string
			argv[captureCount+3] = subject->atom();

			resultBuffer << core()->string(toplevel()->op_call(replaceFunction->atom(),
															   argc, argv));

			int newLastIndex = ovector[0] + (ovector[1] - ovector[0]);

			// prevents infinite looping in certain cases
			fixReplaceLastIndex(src,
								subjectLength,
								lastIndex,
								newLastIndex,
								resultBuffer);

			lastIndex = newLastIndex;

			if (!isGlobal())
				break;
		}

		// copy in stuff after last match
		if (lastIndex < subjectLength)
		{
			resultBuffer.write(src+lastIndex, subjectLength-lastIndex);
		}

		return stringFromUTF8(resultBuffer.c_str(),
							  resultBuffer.length());
	}

	void RegExpObject::fixReplaceLastIndex(const char *src,
										   int subjectLength,
										   int lastIndex,
										   int& newLastIndex,
										   StringBuffer& resultBuffer)
	{
		if (lastIndex == newLastIndex && isGlobal())
		{
			// Advance one character
			if (lastIndex < subjectLength)
			{
				uint32 ch;
				int n = UnicodeUtils::Utf8ToUcs4((const uint8*)src+lastIndex, subjectLength-lastIndex, &ch);
				if (n <= 0)
				{
					// Invalid UTF8 sequence, advance one byte
					n = 1;
				}
				resultBuffer.write(src+lastIndex, n);
				newLastIndex += n;
			}
			else
			{
				newLastIndex++;
			}
		}
	}

	//
	// Accessors
	//
}