Mozilla/mozilla/js/tamarin/core/StringObject.h

/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is [Open Source Virtual Machine.].
 *
 * The Initial Developer of the Original Code is
 * Adobe System Incorporated.
 * Portions created by the Initial Developer are Copyright (C) 2004-2006
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Adobe AS3 Team
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#ifndef __avmplus_String__
#define __avmplus_String__


namespace avmplus
{
#ifdef DEBUGGER
	class GCHashtableScriptObject;
#endif
	/**
	 * A string in UTF-8 encoding.
	 *
	 * UTF8String's are immutable and garbage collected, which makes
	 * it easy to pass them around.
	 */
	class UTF8String : public MMgc::GCObject
	{
	public:
		/**
		 * Constructs a UTF8String.  This should not be called
		 * directly; use the toUTF8String method of String.
		 */
		UTF8String(int _length) { m_length = _length; }

		/**
		 * Operator overload; returns a pointer to the
		 * null-terminated string.
		 */
		operator const char* () const { return m_buffer; }

		/**
		 * Returns a pointer to the null-terminated string.
		 */
		const char *c_str() const { return m_buffer; }

		/**
		 * Returns the length of the string in bytes,
		 * excluding the null terminator.
		 */
		int length() const { return m_length; }

		/**
		 * This is an advanced method which returns a non-const
		 * pointer to the UTF8String's internal buffer.  This
		 * can be used to mutate a string that is known to not
		 * have any other references.  Use with caution.
		 */
		char *lockBuffer() { return m_buffer; }

		/**
		 * Unlocks the buffer previously returned by lockBuffer.
		 * Currently a no-op, but may change in the future,
		 * so call after using lockBuffer.
		 */
		void unlockBuffer() {}

	private:
		int m_length;
		char m_buffer[1];
	};

	/**
	 * A string in UTF-16 encoding.  This is the basic string
	 * class used by AVM+ code.
	 */
	class String : public AvmPlusScriptableObject
	{
	public:
		String(const wchar *str, int len);					// wchar[] -> string
		String(const char *str, int utf8len, int utf16len); // utf8->string
		String(Stringp s1, Stringp s0);		// concat
		String(Stringp s, int pos, int len);// substr
		String(int len);					// preallocated empty

		~String()
		{
#ifdef MMGC_DRC
			setBuf(NULL);
			setPrefixOrOffsetOrNumber(0);
			m_length = 0;
#endif
		}

		/**
		 * Converts this string to a UTF-8 string.  Allocates
		 * a new UTF8 string object containing the result,
		 * and returns it.
		 */
		UTF8String* toUTF8String();

		/**
		 * Returns the Atom equivalent of this String.  This is
		 * done by or'ing the proper type bits into the pointer.
		 */
		Atom atom() const { return AtomConstants::kStringType | (Atom)this; }

		/**
		 * virtual version of atom():
		 */
		virtual Atom toAtom() const { return atom(); }

		/**
		 * Returns the length of the string in characters.
		 * The null terminator is not included.
		 */
		int length() const {
			return m_length & 0x7FFFFFFF;
		}

		/**
		 * Operator overload; returns a pointer to the
		 * null-terminated string.
		 */
		operator const wchar* ()
		{
			// For offset too since our string needs to be null terminated
			if (needsNormalization()) normalize();
			return getData();
		}

		/**
		 * Returns a pointer to the null-terminated string.
		 */
		const wchar* c_str()
		{
			// For offset too since our string needs to be null terminated
			if (needsNormalization()) normalize();
			return getData();
		}

		/**
		 * Returns the index'th character of the string.
		 * @param index zero-based index into the string
		 */
		wchar operator[] (int index);

		/**
		 * This is an advanced method which returns a non-const
		 * pointer to the String's internal buffer.  This
		 * can be used to mutate a string that is known to not
		 * have any other references.  Use with caution.
		 */
		wchar* lockBuffer()
		{
			// For offset too since our string needs to be null terminated
			if (needsNormalization()) normalize();
			return (wchar*) getData();
		}

		/**
		 * Unlocks the buffer previously returned by lockBuffer.
		 * Must call after using lockBuffer to mutate the buffer.
		 */
		void unlockBuffer(int newLen)
		{
			AvmAssert(!isInterned());
			m_length = newLen;
		}
		void unlockBuffer() {}

		/**
		 * Returns a new string object which is a copy of this
		 * string object, with all characters in the string
		 * converted to uppercase.
		 *
		 * Unicode character classes for uppercase and lowercase
		 * are used.  The conversion behavior is compliant with
		 * the String.toUpperCase method.
		 */
		Stringp toUpperCase();

		/**
		 * Returns a new string object which is a copy of this
		 * string object, with all characters in the string
		 * converted to lowercase.
		 *
		 * Unicode character classes for uppercase and lowercase
		 * are used.  The conversion behavior is compliant with
		 * the String.toLowerCase method.
		 */
		Stringp toLowerCase();

		/*@{*/
		/**
		 * Compare the String with toCompare.
		 * @return = 0 if the strings are identical.
		 *         < 0 if this string is less than toCompare
		 *         > 0 if this string is greater than toCompare
		 */
		int Compare(String& toCompare)
		{
			if (hasPrefix()) normalize();
			if (toCompare.hasPrefix()) toCompare.normalize();

			return String::Compare(getData() + getOffset(), length(), toCompare.getData() + toCompare.getOffset(), toCompare.length());
		}

		/*@{*/
		/**
		 * Does String contain wchar?
		 * @return = 0 if the strings are identical.
		 *         < 0 if this string is less than toCompare
		 *         > 0 if this string is greater than toCompare
		 */
		bool Contains(wchar c);

		// compare this string to (other,len)
		bool Equals(const wchar *toCompare, int len)
		{
			AvmAssert(toCompare[len]==0);
			int sLen = length();
			if (len != sLen) return false;
			if (hasPrefix()) normalize();
			return String::Compare(getData() + getOffset(), sLen, toCompare, len)==0;
		}

		// toCompare is not necessarily zero-terminated at toCompare[len]
		bool FastEquals(const wchar *toCompare, int len)
		{
			int sLen = length();
			if (len != sLen) return false;
			// This is only for intern strings which are never offset or prefix
			AvmAssert(needsNormalization() == false);
			const wchar *src = getData();

			// !! could we compare two WORDS at a time?  Our toCompare
			// string is not necessarily DWORD aligned.  (Offset strings, etc.)

			while (sLen)
			{
				sLen--;
				if (src[sLen] != toCompare[sLen])
					return false;
			}

			AvmAssert(sLen == 0);
			return true;
		}

		// compare this string to null-terminated 8bit string
		bool Equals(const char *other8)
		{
			if (hasPrefix()) normalize();
			return !Compare(getData() + getOffset(), other8, length());
		}

		/*@{*/
		/**
		 * Compares s1 and s2.
		 * @return = 0 if the strings are identical.
		 *         < 0 if s1 is less than s2
		 *         > 0 if s1 is greater than s2
		 */
		static int Compare(const wchar *s1,
						   int len1,
						   const wchar *s2,
						   int len2);
		static int Compare(const wchar *s1,
						   const char  *s2,
						   int len);
		/*@}*/

		/*@{*/
		/**
		 * Returns the length of str, in # of characters.
		 */
		static int Length(const wchar *str);
		static int Length(const char *str);
		/*@}*/

		void setInterned(AvmCore *core)
		{
			m_length |= 0x80000000;
			generateIntegerEquivalent (core);
		}

		static bool isSpace(wchar ch)
		{
			return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
		}

		bool isWhitespace();

		int isInterned() const
		{
			return m_length & 0x80000000;
		}

		// handles hex, octal, base 10 integer, float, and "Infinity"/"-Infinity"
		double toNumber()
		{
			// For offset too since convertStringToNumber expects a null terminated string
			if (needsNormalization()) normalize();
			return MathUtils::convertStringToNumber(getData() + getOffset(), length());
		}

		// native functions
		int indexOf(Stringp s, int i=0);
		int indexOfDouble(Stringp s, double i=0);
		int lastIndexOf(Stringp s, int i=0x7fffffff);
		int lastIndexOfDouble(Stringp s, double i=0x7fffffff);
		Stringp charAt(int i=0);
		Stringp charAtDouble(double i=0);
		double charCodeAt(int i); // returns NaN for out-of-bounds
		double charCodeAtDouble(double i); // returns NaN for out-of-bounds
		int localeCompare(Stringp other, Atom *argv, int argc);

		Stringp substring(int i_start, int i_end);
		Stringp substringDouble(double d_start, double d_end);

		Stringp slice(int dStart, int dEnd);
		Stringp sliceDouble(double dStart, double dEnd);

		Stringp substr(int dStart, int dEnd);
		Stringp substrDouble(double dStart, double dEnd);

		// Useful utilities used by the core code.
		static wchar wCharToUpper (wchar ch);
		static wchar wCharToLower (wchar ch);

#ifdef DEBUGGER
		uint32 size() const;
		Stringp getTypeName() const;
#endif

	private:
		int			m_length; // { interned: 1, length:31 }
		class StringBuf : public MMgc::RCObject
		{
		public:
			wchar m_buf[1];
#ifdef MMGC_DRC
			~StringBuf()
			{
				memset(m_buf, 0, MMgc::GC::Size(this)-sizeof(MMgc::RCObject));
			}
#endif
		};
		// no WB b/c manual WB is in setBuf, faster that way
		StringBuf* m_buf;

		// The low two bits control what type of value is stored in m_prefixOrOffsetOrNumber
		// 0x00 nothing is stored (rest of value is 0)
		// 0x01 the 29-bit numeric equivalent of this string is stored (same as kIntegerAtom format)
		// 0x02 a prefix string is stored
		// 0x03 a 30-bit offset is stored
		// manual WB when needed
		uintptr	m_prefixOrOffsetOrNumber;
		#define		STRINGFLAGS		0x03
		#define		NUMBERFLAG		0x01
		#define		PREFIXFLAG		0x02
		#define		OFFSETFLAG		0x03

		Stringp	getPrefix() const
		{
			if ((m_prefixOrOffsetOrNumber & STRINGFLAGS) == PREFIXFLAG)
				return Stringp(m_prefixOrOffsetOrNumber & ~STRINGFLAGS);
			else
				return 0;
		};

		uint32 getOffset() const
		{
			if ((m_prefixOrOffsetOrNumber & STRINGFLAGS) == OFFSETFLAG)
				return urshift(m_prefixOrOffsetOrNumber & ~STRINGFLAGS, 2);
			else
				return 0;
		};

		bool hasPrefix() const { return ((m_prefixOrOffsetOrNumber & STRINGFLAGS) == PREFIXFLAG); };
		bool hasOffset() const { return ((m_prefixOrOffsetOrNumber & STRINGFLAGS) == OFFSETFLAG); };

		bool needsNormalization() const { return ((m_prefixOrOffsetOrNumber & STRINGFLAGS) >= 0x2); };

		void normalize();

		// If our string is a valid positive integer that fits in a kIntegerAtom, this
		// will set our m_prefixOrOffsetOrNumber value to the int atom representation or'ed
		// with NUMBERTYPE.  This is only valid for non-prefix, non-offset interned strings.
		void generateIntegerEquivalent(AvmCore *core);

		void setPrefixOrOffsetOrNumber(uintptr value);

		static const wchar lowerCaseBase[];
		static const wchar upperCaseBase[];
		static const wchar lowerCaseConversion[];
		static const wchar upperCaseConversion[];
		static const unsigned char tolower_map[];
		static const unsigned char toupper_map[];
public:
		// This returns a kIntegerAtom Atom
		// for use in our ScriptObject HashTable implementation.  If we have a valid
		// integer equivalent, it will never be zero since kIntegerType tag != 0
		Atom getIntAtom() const
		{
			if ((m_prefixOrOffsetOrNumber & STRINGFLAGS) == NUMBERFLAG)
				return m_prefixOrOffsetOrNumber & ~STRINGFLAGS | kIntegerType;
			else
				return 0;
		};

		StringBuf* allocBuf(int numChars);
		wchar *getData() const { return m_buf->m_buf; }
		void setBuf(StringBuf *buf) { WBRC(MMgc::GC::GetGC(this), this, &m_buf, buf); }


	};

	// Compare helpers
	inline bool operator==(String& s1, String& s2)
	{
		return s1.length() == s2.length() && s1.Compare(s2) == 0;
	}
	inline bool operator!=(String& s1, String& s2)
	{
		return s1.length() != s2.length() || s1.Compare(s2) != 0;
	}
	inline bool operator<(String& s1, String& s2)
	{
		return s2.Compare(s1) < 0;
	}
	inline bool operator>(String& s1, String& s2)
	{
		return s2.Compare(s1) > 0;
	}
	inline bool operator<=(String& s1, String& s2)
	{
		return s2.Compare(s1) <= 0;
	}
	inline bool operator>=(String& s1, String& s2)
	{
		return s2.Compare(s1) >= 0;
	}
}

#endif /* __avmplus_String__ */