Mozilla/mozilla/js/tamarin/core/UnicodeUtils.cpp

/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is [Open Source Virtual Machine.].
 *
 * The Initial Developer of the Original Code is
 * Adobe System Incorporated.
 * Portions created by the Initial Developer are Copyright (C) 2004-2006
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Adobe AS3 Team
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */


#include "avmplus.h"

namespace avmplus
{
	/**
	 *
	 *  Table of Unicode characters that can be represented in ECMAScript
	 *  ECMA-262 Section 15.1.3
	 *
	 *	Code Point         Value Representation  1st Octet 2nd Octet 3rd Octet 4th Octet
	 *
	 *  0x0000 - 0x007F    00000000 0zzzzzzz     0zzzzzzz
	 *  0x0080 - 0x07FF    00000yyy yyzzzzzz     110yyyyy  10zzzzzz
	 *  0x0800 - 0xD7FF    xxxxyyyy yyzzzzzz     1110xxxx  10yyyyyy  10zzzzzz
	 *
	 *  0xD800 - 0xDBFF    110110vv vvwwwwxx     11110uuu  10uuwwww  10xxyyyy  10zzzzzz
	 *  followed by        followed by
	 *  0xDC00 - 0xDFFF    110111yy yyzzzzzz
	 *
	 *  0xD800 - 0xDBFF
	 *  not followed by                          causes URIError
	 *  0xDC00 - 0xDFFF
	 *
	 *  0xDC00 - 0xDFFF                          causes URIError
	 *
	 *  0xE000 - 0xFFFF    xxxxyyyy yyzzzzzz     1110xxxx 10yyyyyy 10zzzzzz
	 *
	 */

	int UnicodeUtils::Utf16ToUtf8(const wchar *in,
								  int inLen,
								  uint8 *out,
								  int outMax)
	{
		int outLen = 0;
		if (out)
		{
			// Output buffer passed in; actually encode data.
			while (inLen > 0)
			{
				wchar ch = *in;
				inLen--;
				if (ch < 0x80) {
					if (--outMax < 0) {
						return -1;
					}
					*out++ = (uint8)ch;
					outLen++;
				}
				else if (ch < 0x800) {
					if ((outMax -= 2) < 0) {
						return -1;
					}
					*out++ = (uint8)(0xC0 | (ch>>6)&0x1F);
					*out++ = (uint8)(0x80 | (ch&0x3F));
					outLen += 2;
				}
				else if (ch >= 0xD800 && ch <= 0xDBFF) {
					if (--inLen < 0) {
						return -1;
					}
					wchar ch2 = *++in;
					if (ch2 < 0xDC00 || ch2 > 0xDFFF) {
						// This is an invalid UTF-16 surrogate pair sequence
						// Encode the replacement character instead
						ch = 0xFFFD;
						goto Encode3;
					}

					uint32 ucs4 = ((ch-0xD800)<<10) + (ch2-0xDC00) + 0x10000;
					if ((outMax -= 4) < 0) {
						return -1;
					}
					*out++ = (uint8)(0xF0 | (ucs4>>18)&0x07);
					*out++ = (uint8)(0x80 | (ucs4>>12)&0x3F);
					*out++ = (uint8)(0x80 | (ucs4>>6)&0x3F);
					*out++ = (uint8)(0x80 | (ucs4&0x3F));
					outLen += 4;
				}
				else {
					if (ch >= 0xDC00 && ch <= 0xDFFF) {
						// This is an invalid UTF-16 surrogate pair sequence
						// Encode the replacement character instead
						ch = 0xFFFD;
					}
				  Encode3:
					if ((outMax -= 3) < 0) {
						return -1;
					}
					*out++ = (uint8)(0xE0 | (ch>>12)&0x0F);
					*out++ = (uint8)(0x80 | (ch>>6)&0x3F);
					*out++ = (uint8)(0x80 | (ch&0x3F));
					outLen += 3;
				}
				in++;
			}
		}
		else
		{
			// Count output characters without actually encoding.
			while (inLen > 0)
			{
				wchar ch = *in;
				inLen--;
				if (ch < 0x80) {
					outLen++;
				}
				else if (ch < 0x800) {
					outLen += 2;
				}
				else if (ch >= 0xD800 && ch <= 0xDBFF) {
					if (--inLen < 0) {
						return -1;
					}
					wchar ch2 = *++in;
					if (ch2 < 0xDC00 || ch2 > 0xDFFF) {
						// Invalid...
						// We'll encode 0xFFFD for this
						outLen += 3;
					} else {
						outLen += 4;
					}
				}
				else {
					outLen += 3;
				}
				in++;
			}
		}
		return outLen;
	}

	int UnicodeUtils::Utf8ToUtf16(const uint8 *in,
								  int inLen,
								  wchar *out,
								  int outMax)
	{
		int outLen = 0;
		unsigned int outch;
		while (inLen > 0)
		{
			unsigned int c = *in;

			switch (c >> 4)
			{
			case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
			default:
				// 0xxx xxxx
				// OR not a valid utf-8 character
				// Let the converted == false case handle this.
				break;

			case 12: case 13:
				// 110xxxxx   10xxxxxx
				if (inLen < 2) {
					// Invalid
					break;
				}
				if ((in[1]&0xC0) != 0x80) {
					// Invalid
					break;
				}
				outch = (c<<6 & 0x7C0 | in[1] & 0x3F);
				if (outch < 0x80) {
					// Overlong sequence, reject as invalid.
					break;
				}
				in += 2;
				inLen -= 2;
				if (out) {
					if (--outMax < 0) {
						return -1;
					}
					*out++ = (wchar)(outch);
				}
				outLen++;
				continue;

			case 14:
				// 1110xxxx  10xxxxxx  10xxxxxx
				if (inLen < 3) {
					// Invalid
					break;
				}
				if ((in[1]&0xC0) != 0x80 || (in[2]&0xC0) != 0x80) {
					// Invalid
					break;
				}
				outch = (c<<12 & 0xF000 | in[1]<<6 & 0xFC0 | in[2] & 0x3F);
				if (outch < 0x800) {
					// Overlong sequence, reject as invalid.
					break;
				}
				in += 3;
				inLen -= 3;
				if (out) {
					if (--outMax < 0) {
						return -1;
					}
					*out++ = (wchar)(outch);
				}
				outLen++;
				continue;

			case 15:
				// 11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
				if (inLen < 4) {
					// Invalid
					break;
				}
				if ((in[1]&0xC0) != 0x80 ||
					(in[2]&0xC0) != 0x80 ||
					(in[3]&0xC0) != 0x80)
				{
					break;
				}

				outch = (c<<18     & 0x1C0000 |
						in[1]<<12 & 0x3F000 &
						in[2]<<6  & 0xFC0 |
						in[3]     & 0x3F);
				if (outch < 0x10000) {
					// Overlong sequence, reject as invalid.
					break;
				}

				in += 4;
				inLen -= 4;

				// Encode as UTF-16 surrogate sequence
				if (out) {
					if ((outMax -= 2) < 0) {
						return -1;
					}
					*out++ = (wchar) (((outch-0x10000)>>10) & 0x3FF) + 0xD800;
					*out++ = (wchar) ((outch-0x10000) & 0x3FF) + 0xDC00;
				}
				outLen += 2;
				continue;
			}

			// ! converted
			if (out) {
				if (--outMax < 0) {
					return -1;
				}
				*out++ = (wchar)c;
			}
			inLen--;
			in++;
			outLen++;
		}
		return outLen;
	}

	int UnicodeUtils::Utf8Count(const uint8 *in,
								  int inLen)
	{
		int outLen = 0;
		unsigned int outch;
		while (inLen > 0)
		{
			unsigned int c = *in;
			const uint8 *inbase = in;

			while (c <= 0x7f)
			{
				if (--inLen == 0)
					return outLen+(1+in-inbase);
				c = *(++in);
			}
			outLen += in-inbase;

			switch (c >> 4)
			{
			default:
			case 8: case 9: case 10: case 11:
				// 10xxxxxx
				// not a valid utf-8 character
				// Let the converted == false case handle this.
				break;

			case 12: case 13:
				// 110xxxxx   10xxxxxx
				if (inLen < 2) {
					// Invalid
					break;
				}
				if ((in[1]&0xC0) != 0x80) {
					// Invalid
					break;
				}
				outch = (c<<6 & 0x7C0 | in[1] & 0x3F);
				if (outch < 0x80) {
					// Overlong sequence, reject as invalid.
					break;
				}
				in += 2;
				inLen -= 2;
				outLen++;
				continue;

			case 14:
				// 1110xxxx  10xxxxxx  10xxxxxx
				if (inLen < 3) {
					// Invalid
					break;
				}
				if ((in[1]&0xC0) != 0x80 || (in[2]&0xC0) != 0x80) {
					// Invalid
					break;
				}
				outch = (c<<12 & 0xF000 | in[1]<<6 & 0xFC0 | in[2] & 0x3F);
				if (outch < 0x800) {
					// Overlong sequence, reject as invalid.
					break;
				}
				in += 3;
				inLen -= 3;
				outLen++;
				continue;

			case 15:
				// 11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
				if (inLen < 4) {
					// Invalid
					break;
				}
				if ((in[1]&0xC0) != 0x80 ||
					(in[2]&0xC0) != 0x80 ||
					(in[3]&0xC0) != 0x80)
				{
					break;
				}

				outch =				  (c<<18     & 0x1C0000 |
									   in[1]<<12 & 0x3F000 &
									   in[2]<<6  & 0xFC0 |
									   in[3]     & 0x3F);
				if (outch < 0x10000) {
					// Overlong sequence, reject as invalid.
					break;
				}

				in += 4;
				inLen -= 4;

				// Encode as UTF-16 surrogate sequence
				outLen += 2;
				continue;
			}

			// not converted if we get here
			inLen--;
			in++;
			outLen++;
		}
		return outLen;
	}

	int UnicodeUtils::Utf8ToUcs4(const uint8 *chars,
								 int len,
								 uint32 *out)
	{
		// U-00000000 - U-0000007F: 	0xxxxxxx
		// U-00000080 - U-000007FF: 	110xxxxx 10xxxxxx
		// U-00000800 - U-0000FFFF: 	1110xxxx 10xxxxxx 10xxxxxx
		// U-00010000 - U-001FFFFF: 	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		// U-00200000 - U-03FFFFFF: 	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
		// U-04000000 - U-7FFFFFFF: 	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

		// The minUCS4 table enforces the security rule that an
		// overlong UTF-8 sequence is forbidden, if a shorter
		// sequence could encode the same character.
		static uint32 minUCS4[] = {
			0x00000000,
			0x00000080,
			0x00000800,
			0x00010000,
			0x00200000,
			0x04000000
		};
		int n = 0;
		uint32 b;
		if (len < 1) {
			return 0;
		}
		switch (chars[0]>>4) {
		case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
			n = 1;
			b = chars[0];
			break;
		case 12: case 13:
			n = 2;
			b = chars[0]&0x1F;
			break;
		case 14:
			n = 3;
			b = chars[0]&0x0F;
			break;
		case 15:
			switch (chars[0]&0x0C) {
			case 0x00:
			case 0x04:
				n = 4;
				b = chars[0]&0x07;
				break;
			case 0x08:
				n = 5;
				b = chars[0]&0x03;
				break;
			case 0x0C:
				n = 6;
				b = chars[0]&0x01;
				break;
			}
			// fall through intentional

		default: // invalid character, should not get here
			return 0;
		}
		if (len < n) {
			return 0;
		}
		for (int i=1; i<n; i++) {
			if ((chars[i]&0xC0) != 0x80) {
				return 0;
			}
			b = (b<<6) | chars[i]&0x3F;
		}
		if (b < minUCS4[n-1]) {
			return 0;
		}
		*out = b;
		return n;
	}

	int UnicodeUtils::Ucs4ToUtf8(uint32 value,
								 uint8 *chars)
	{
		// U-00000000 - U-0000007F: 	0xxxxxxx
		// U-00000080 - U-000007FF: 	110xxxxx 10xxxxxx
		// U-00000800 - U-0000FFFF: 	1110xxxx 10xxxxxx 10xxxxxx
		// U-00010000 - U-001FFFFF: 	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		// U-00200000 - U-03FFFFFF: 	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
		// U-04000000 - U-7FFFFFFF: 	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
		if (value < 0x80) {
			*chars = (uint8)value;
			return 1;
		}
		if (value < 0x800) {
			chars[0] = (uint8)(0xC0 | (value>>6)&0x1F);
			chars[1] = (uint8)(0x80 | (value&0x3F));
			return 2;
		}
		if (value < 0x10000) {
			chars[0] = (uint8)(0xE0 | (value>>12)&0x0F);
			chars[1] = (uint8)(0x80 | (value>>6)&0x3F);
			chars[2] = (uint8)(0x80 | (value&0x3F));
			return 3;
		}
		if (value < 0x200000) {
			chars[0] = (uint8)(0xF0 | (value>>18)&0x07);
			chars[1] = (uint8)(0x80 | (value>>12)&0x3F);
			chars[2] = (uint8)(0x80 | (value>>6)&0x3F);
			chars[3] = (uint8)(0x80 | (value&0x3F));
			return 4;
		}
		if (value < 0x4000000) {
			chars[0] = (uint8)(0xF8 | (value>>24)&0x03);
			chars[1] = (uint8)(0x80 | (value>>18)&0x3F);
			chars[2] = (uint8)(0x80 | (value>>12)&0x3F);
			chars[3] = (uint8)(0x80 | (value>>6)&0x3F);
			chars[4] = (uint8)(0x80 | (value&0x3F));
			return 5;
		}
		if (value < 0x80000000) {
			chars[0] = (uint8)(0xFC | (value>>30)&0x01);
			chars[1] = (uint8)(0x80 | (value>>24)&0x3F);
			chars[2] = (uint8)(0x80 | (value>>18)&0x3F);
			chars[3] = (uint8)(0x80 | (value>>12)&0x3F);
			chars[4] = (uint8)(0x80 | (value>>6)&0x3F);
			chars[5] = (uint8)(0x80 | (value&0x3F));
			return 6;
		}
		return 0;
	}
}