From 48dbeb8ec08dbfe64275fe6ed8d29d74b4112880 Mon Sep 17 00:00:00 2001 From: "bzbarsky%mit.edu" Date: Tue, 15 Nov 2005 18:17:22 +0000 Subject: [PATCH] Move some basic macros for working with UTF16 from intl to XPCOM; use them in some places including the HTML parser. Part of fix for bug 316394; the CSS parser changes are still coming. r=smontagu, r=mrbkap on parser changes, sr=dbaron git-svn-id: svn://10.0.0.236/trunk@184651 18797224-902f-48f8-a5cc-f745e15eee43 --- .../intl/unicharutil/util/nsUnicharUtils.h | 15 ++---- .../parser/htmlparser/src/nsHTMLTokens.cpp | 7 +-- mozilla/xpcom/string/public/nsCharTraits.h | 31 +++++++++++ mozilla/xpcom/string/public/nsReadableUtils.h | 3 ++ mozilla/xpcom/string/public/nsUTF8Utils.h | 52 ++++++++----------- mozilla/xpcom/string/src/nsReadableUtils.cpp | 16 ++++++ 6 files changed, 79 insertions(+), 45 deletions(-) diff --git a/mozilla/intl/unicharutil/util/nsUnicharUtils.h b/mozilla/intl/unicharutil/util/nsUnicharUtils.h index f021f1d953e..2407ed6a4b9 100644 --- a/mozilla/intl/unicharutil/util/nsUnicharUtils.h +++ b/mozilla/intl/unicharutil/util/nsUnicharUtils.h @@ -41,8 +41,13 @@ #ifndef nsAString_h___ #include "nsAString.h" #endif + #include "nsReadableUtils.h" +#ifndef nsCharTraits_h___ +#include "nsCharTraits.h" +#endif + void ToLowerCase( nsAString& ); void ToUpperCase( nsAString& ); @@ -90,16 +95,6 @@ inline PRBool IsLowerCase(PRUnichar c) { return ToUpperCase(c) != c; } -#define IS_HIGH_SURROGATE(u) ((PRUnichar)(u) >= (PRUnichar)0xd800 && (PRUnichar)(u) <= (PRUnichar)0xdbff) -#define IS_LOW_SURROGATE(u) ((PRUnichar)(u) >= (PRUnichar)0xdc00 && (PRUnichar)(u) <= (PRUnichar)0xdfff) - -#define SURROGATE_TO_UCS4(h, l) ((((PRUint32)(h)-(PRUint32)0xd800) << 10) + \ - (PRUint32)(l) - (PRUint32)(0xdc00) + 0x10000) - -#define H_SURROGATE(s) ((PRUnichar)(((PRUint32)s - (PRUint32)0x10000) >> 10) + (PRUnichar)0xd800) -#define L_SURROGATE(s) ((PRUnichar)(((PRUint32)s - (PRUint32)0x10000) & 0x3ff) + (PRUnichar)0xdc00) -#define 
IS_IN_BMP(ucs) ((PRUint32)ucs < 0x10000) - /* (0x3131u <= (u) && (u) <= 0x318eu) => Hangul Compatibility Jamo */ /* (0xac00u <= (u) && (u) <= 0xd7a3u) => Hangul Syllables */ #define IS_CJ_CHAR(u) \ diff --git a/mozilla/parser/htmlparser/src/nsHTMLTokens.cpp b/mozilla/parser/htmlparser/src/nsHTMLTokens.cpp index 0bebf79ce98..3c502d258a5 100644 --- a/mozilla/parser/htmlparser/src/nsHTMLTokens.cpp +++ b/mozilla/parser/htmlparser/src/nsHTMLTokens.cpp @@ -2363,12 +2363,7 @@ static void AppendNCR(nsSubstring& aString, PRInt32 aNCRValue) } #endif - if (IS_IN_BMP(aNCRValue)) - aString.Append(PRUnichar(aNCRValue)); - else { - aString.Append(PRUnichar(H_SURROGATE(aNCRValue))); - aString.Append(PRUnichar(L_SURROGATE(aNCRValue))); - } + AppendUCS4ToUTF16(ENSURE_VALID_CHAR(aNCRValue), aString); } /* diff --git a/mozilla/xpcom/string/public/nsCharTraits.h b/mozilla/xpcom/string/public/nsCharTraits.h index e7713c9520f..84222311a5e 100644 --- a/mozilla/xpcom/string/public/nsCharTraits.h +++ b/mozilla/xpcom/string/public/nsCharTraits.h @@ -73,6 +73,37 @@ typedef PRBool nsCharTraits_bool; #endif +// Some macros for working with PRUnichar +#define PLANE1_BASE PRUint32(0x00010000) +// High surrogates are in the range 0xD800 -- 0xDBFF +#define IS_HIGH_SURROGATE(u) ((PRUnichar(u) & 0xFC00) == 0xD800) +// Low surrogates are in the range 0xDC00 -- 0xDFFF +#define IS_LOW_SURROGATE(u) ((PRUnichar(u) & 0xFC00) == 0xDC00) +// Faster than testing IS_HIGH_SURROGATE || IS_LOW_SURROGATE +#define IS_SURROGATE(u) ((PRUnichar(u) & 0xF800) == 0xD800) + +// Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF + +// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00) +// I wonder whether we could somehow assert that H is a high surrogate +// and L is a low surrogate +#define SURROGATE_TO_UCS4(h, l) (((PRUint32(h) & 0x03FF) << 10) + \ + (PRUint32(l) & 0x03FF) + PLANE1_BASE) + +// Extract surrogates from a UCS4 char +// See unicode specification 3.7 for following math. 
+#define H_SURROGATE(c) PRUnichar(PRUnichar((PRUint32(c) - PLANE1_BASE) >> 10) | \ + PRUnichar(0xD800)) +#define L_SURROGATE(c) PRUnichar((PRUnichar((PRUint32(c) - PLANE1_BASE) & 0x03FF) | \ + PRUnichar(0xDC00))) + +#define IS_IN_BMP(ucs) (PRUint32(ucs) < PLANE1_BASE) +#define UCS2_REPLACEMENT_CHAR PRUnichar(0xFFFD) + +#define UCS_END PRUint32(0x00110000) +#define IS_VALID_CHAR(c) ((PRUint32(c) < UCS_END) && !IS_SURROGATE(c)) +#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR) + template struct nsCharTraits {}; NS_SPECIALIZE_TEMPLATE diff --git a/mozilla/xpcom/string/public/nsReadableUtils.h b/mozilla/xpcom/string/public/nsReadableUtils.h index 799e846566c..e41e8d29dc6 100755 --- a/mozilla/xpcom/string/public/nsReadableUtils.h +++ b/mozilla/xpcom/string/public/nsReadableUtils.h @@ -376,4 +376,7 @@ NS_COM PRInt32 CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String, const nsASingleFragmentString& aUTF16String); +NS_COM void +AppendUCS4ToUTF16(const PRUint32 aSource, nsAString& aDest); + #endif // !defined(nsReadableUtils_h___) diff --git a/mozilla/xpcom/string/public/nsUTF8Utils.h b/mozilla/xpcom/string/public/nsUTF8Utils.h index 6511389f332..a3b1db79d4c 100644 --- a/mozilla/xpcom/string/public/nsUTF8Utils.h +++ b/mozilla/xpcom/string/public/nsUTF8Utils.h @@ -38,6 +38,8 @@ #ifndef nsUTF8Utils_h_ #define nsUTF8Utils_h_ +#include "nsCharTraits.h" + class UTF8traits { public: @@ -50,9 +52,6 @@ class UTF8traits static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; } }; -#define PLANE1_BASE 0x00010000 -#define UCS2_REPLACEMENT_CHAR 0xfffd - #ifdef __GNUC__ #define NS_ALWAYS_INLINE __attribute__((always_inline)) #else @@ -285,14 +284,14 @@ public: PRUnichar c = *p++; - if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF + if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF { if (err) *err = PR_FALSE; *buffer = p; return c; } - else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + else if (IS_HIGH_SURROGATE(c)) // 
U+D800 - U+DBFF { if (*buffer == end) { @@ -304,16 +303,15 @@ public: } // D800- DBFF - High Surrogate - // N = (H- D800) *400 + 10000 + ... - PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10); + PRUnichar h = c; c = *p++; - if (0xDC00 == (0xFC00 & c)) + if (IS_LOW_SURROGATE(c)) { // DC00- DFFF - Low Surrogate - // N += ( L - DC00 ) - ucs4 |= (0x03FF & c); + // N = (H - D800) *400 + 10000 + (L - DC00) + PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c); if (err) *err = PR_FALSE; *buffer = p; @@ -351,13 +349,13 @@ public: PRUnichar c = *iter++; - if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF + if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF { if (err) *err = PR_FALSE; return c; } - else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + else if (IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF { if (iter == end) { @@ -368,16 +366,15 @@ public: } // D800- DBFF - High Surrogate - // N = (H- D800) *400 + 10000 + ... - PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10); + PRUnichar h = c; c = *iter++; - if (0xDC00 == (0xFC00 & c)) + if (IS_LOW_SURROGATE(c)) { // DC00- DFFF - Low Surrogate - // N += ( L - DC00 ) - ucs4 |= (0x03FF & c); + // N = (H - D800) *400 + 10000 + ( L - DC00 ) + PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c); if (err) *err = PR_FALSE; return ucs4; @@ -461,13 +458,11 @@ class ConvertUTF8toUTF16 } else if ( ucs4 >= PLANE1_BASE ) { - if ( ucs4 >= 0x00110000 ) + if ( ucs4 >= UCS_END ) *out++ = UCS2_REPLACEMENT_CHAR; else { - // surrogate, see unicode specification 3.7 for following math. 
- ucs4 -= PLANE1_BASE; - *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u; - *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u; + *out++ = (value_type)H_SURROGATE(ucs4); + *out++ = (value_type)L_SURROGATE(ucs4); } } else @@ -593,17 +588,16 @@ class ConvertUTF16toUTF8 *out++ = 0xC0 | (char)(c >> 6); *out++ = 0x80 | (char)(0x003F & c); } - else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF + else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF { *out++ = 0xE0 | (char)(c >> 12); *out++ = 0x80 | (char)(0x003F & (c >> 6)); *out++ = 0x80 | (char)(0x003F & c ); } - else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + else if (IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF { // D800- DBFF - High Surrogate - // N = (H- D800) *400 + 10000 + ... - PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10); + value_type h = c; ++p; if (p == end) @@ -614,11 +608,11 @@ class ConvertUTF16toUTF8 } c = *p; - if (0xDC00 == (0xFC00 & c)) + if (IS_LOW_SURROGATE(c)) { // DC00- DFFF - Low Surrogate - // N += ( L - DC00 ) - ucs4 |= (0x03FF & c); + // N = (H - D800) *400 + 10000 + ( L - DC00 ) + PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c); // 0001 0000-001F FFFF *out++ = 0xF0 | (char)(ucs4 >> 18); diff --git a/mozilla/xpcom/string/src/nsReadableUtils.cpp b/mozilla/xpcom/string/src/nsReadableUtils.cpp index c10def95e75..b10881c6520 100755 --- a/mozilla/xpcom/string/src/nsReadableUtils.cpp +++ b/mozilla/xpcom/string/src/nsReadableUtils.cpp @@ -1166,3 +1166,19 @@ CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String, return 0; } + +NS_COM +void +AppendUCS4ToUTF16(PRUint32 aSource, nsAString& aDest) + { + NS_ASSERTION(IS_VALID_CHAR(aSource), "Invalid UCS4 char"); + if (IS_IN_BMP(aSource)) + { + aDest.Append(PRUnichar(aSource)); + } + else + { + aDest.Append(H_SURROGATE(aSource)); + aDest.Append(L_SURROGATE(aSource)); + } + }