diff --git a/mozilla/ef/Runtime/System/JavaString.cpp b/mozilla/ef/Runtime/System/JavaString.cpp index 85f2fa3e510..8178461e2de 100644 --- a/mozilla/ef/Runtime/System/JavaString.cpp +++ b/mozilla/ef/Runtime/System/JavaString.cpp @@ -26,23 +26,73 @@ static inline JavaArray *newCharArray(Uint32 length) return (new (mem) JavaArray(Array::obtain(tkChar), length)); } +/* Count the number of bytes it would take to encode the given Unicode + * string using UTF-8. Add in the extra byte for the terminating NUL. + */ +static int +countUtf8Chars(const uint16 *ucs2, int ucs2len) +{ + int utf8len = 1; // Need one character for terminating NUL -/* Return the UTF representation of this string. This routine allocates + for (int i = ucs2len-1; i >= 0; i--) { + uint16 u = ucs2[i]; + if (u < 0x80) + utf8len += 1; + else if (u < 0x800) + utf8len += 2; + else + utf8len += 3; + } + return utf8len; +} + +/* Convert a Unicode (UCS-2) string to UTF-8 encoding. The length of + * the destination string, in bytes, is given by the utf8len argument. + * A NUL character is appended to the destination string, if possible. + * Returns: the actual length of the resulting string, in bytes. + */ +static int +convertUnicodeToUtf8(char *utf8, const uint16* ucs2, int utf8len) +{ + char* start_utf8 = utf8; + char* lastchar = utf8 + utf8len - 1; + + while (utf8 < lastchar) { + uint16 u = *ucs2++; + if (u < 0x80) { + *utf8++ = (char)u; + } else if (u < 0x800) { + if (utf8 >= (lastchar - 1)) + break; + *utf8++ = 0xc0 | ((u >> 6) & 0x1f); + *utf8++ = 0x80 | (u & 0x3f); + } else { + if (utf8 >= (lastchar - 2)) + break; + *utf8++ = 0xe0 | ((u >> 12) & 0x0f); + *utf8++ = 0x80 | ((u >> 6) & 0x3f); + *utf8++ = 0x80 | (u & 0x3f); + } + } + if (utf8 <= lastchar) + *utf8 = 0; + + return utf8 - start_utf8; +} + +/* Return the UTF8 representation of this string. This routine allocates * enough memory for the conversion; this memory can be freed using * JavaString::freeUtf() */ char *JavaString::convertUtf() { - /* XXX Fixme For now, we just copy the string over byte by byte... */ - const int16 *chars = getStr(); - char *copy = new char[count+1]; + const uint16 *chars = getStr(); + int utf8len = countUtf8Chars(chars, count); + char *utf8 = new char[utf8len]; - int32 i; - for (i = 0; i < count; i++) - copy[i] = (char) chars[i]; + convertUnicodeToUtf8(utf8, chars, utf8len); - copy[i] = 0; - return copy; + return utf8; } void JavaString::freeUtf(char *str) @@ -50,29 +100,96 @@ void JavaString::freeUtf(char *str) delete [] str; } +/* Count the number of Unicode characters in a NUL-terminated + * UTF8 string. Don't count the final NUL character. + */ +static int +countUnicodeChars(const char *utf8) +{ + signed char c; + int length = 0; + + // Unicode characters are encoded as 1, 2, or 3 bytes in a UCS-2 string + while (c = *utf8) { + length++; + + if (c >= 0) { + // Characters in the range of 0..0x7f are encoded using one byte + // b0xxxxxxx + utf8++; + } else if ((c & 0xe0) == 0xc0) { + // Characters in the range 0x80..0x7ff are encoded using two bytes + // b110xxxxx b10yyyyyy + utf8 += 2; + } else { + // Characters in the range 0x800..0xffff are encoded using three bytes + // b1110xxxx b10yyyyyy b10zzzzzz + PR_ASSERT((c & 0xf0) == 0xe0); + utf8 += 3; + } + } + return length; +} + +/* Convert a UTF-8 encoded string to Unicode (UCS-2) representation. The + * length of the destination string, in 16-bit characters, is given by the + * ucs2 argument. The result is *not* NUL-terminated. + * Returns: the actual length of the resulting string, in characters. + */ +static int +convertUTF8ToUnicode(uint16 *ucs2, const char *utf8, int ucs2len) +{ + signed char c; + int length = 0; + + // Unicode characters are encoded as 1, 2, or 3 bytes in a UCS-2 string + while ((c = *utf8) != 0) { + length++; + if (length > ucs2len) + return ucs2len; + + if (c >= 0) { + // Characters in the range of 0..0x7f are encoded using one byte + // b0xxxxxxx + *ucs2 = c; + utf8++; + } else if ((c & 0xe0) == 0xc0) { + // Characters in the range 0x80..0x7ff are encoded using two bytes + // b110xxxxx b10yyyyyy + *ucs2 = ((c & 0x1f) << 6) | (utf8[1] & 0x3f); + utf8 += 2; + } else { + // Characters in the range 0x800..0xffff are encoded using three bytes + // b1110xxxx b10yyyyyy b10zzzzzz + PR_ASSERT((c & 0xf0) == 0xe0); + *ucs2 = ((c & 0x0f) << 12) | ((utf8[1] & 0x3f) << 6) | (utf8[2] & 0x3f); + utf8 += 3; + } + ucs2++; + } + return length; +} + + /* Create a new JavaString from a char array that represents the string in UTF-8 * format. */ JavaString::JavaString(const char *str) : JavaObject(*strType) { - count = PL_strlen(str); + count = countUnicodeChars(str); offset = 0; - /* Let's keep the string zero-terminated anyway */ - value = (JavaArray *) newCharArray(count+1); - int16 *chars = const_cast(getStr()); + value = (JavaArray *) newCharArray(count); + uint16 *chars = const_cast(getStr()); - for (int32 i = 0; i < count; i++) - chars[i] = str[i]; - - chars[count] = 0; + convertUTF8ToUnicode(chars, str, count); } /* print a textual representation of this string */ void JavaString::dump() { - const int16 *chars = getStr(); + const uint16 *chars = getStr(); for (int16 i = 0; i < count; i++) putchar(chars[i]);