From 48dbeb8ec08dbfe64275fe6ed8d29d74b4112880 Mon Sep 17 00:00:00 2001
From: "bzbarsky%mit.edu"
 <bzbarsky%mit.edu@18797224-902f-48f8-a5cc-f745e15eee43>
Date: Tue, 15 Nov 2005 18:17:22 +0000
Subject: [PATCH] Move some basic macros for working with UTF16 from intl to
 XPCOM; use them in some places including the HTML parser.  Part of fix for
 bug 316394; the CSS parser changes are still coming.  r=smontagu, r=mrbkap on
 parser changes, sr=dbaron

git-svn-id: svn://10.0.0.236/trunk@184651 18797224-902f-48f8-a5cc-f745e15eee43
---
 .../intl/unicharutil/util/nsUnicharUtils.h    | 15 ++----
 .../parser/htmlparser/src/nsHTMLTokens.cpp    |  7 +--
 mozilla/xpcom/string/public/nsCharTraits.h    | 31 +++++++++++
 mozilla/xpcom/string/public/nsReadableUtils.h |  3 ++
 mozilla/xpcom/string/public/nsUTF8Utils.h     | 52 ++++++++-----------
 mozilla/xpcom/string/src/nsReadableUtils.cpp  | 16 ++++++
 6 files changed, 79 insertions(+), 45 deletions(-)

diff --git a/mozilla/intl/unicharutil/util/nsUnicharUtils.h b/mozilla/intl/unicharutil/util/nsUnicharUtils.h
index f021f1d953e..2407ed6a4b9 100644
--- a/mozilla/intl/unicharutil/util/nsUnicharUtils.h
+++ b/mozilla/intl/unicharutil/util/nsUnicharUtils.h
@@ -41,8 +41,13 @@
 #ifndef nsAString_h___
 #include "nsAString.h"
 #endif
+
 #include "nsReadableUtils.h"
 
+#ifndef nsCharTraits_h___
+#include "nsCharTraits.h"
+#endif
+
 void ToLowerCase( nsAString& );
 void ToUpperCase( nsAString& );
 
@@ -90,16 +95,6 @@ inline PRBool IsLowerCase(PRUnichar c) {
     return ToUpperCase(c) != c;
 }
 
-#define IS_HIGH_SURROGATE(u)  ((PRUnichar)(u) >= (PRUnichar)0xd800 && (PRUnichar)(u) <= (PRUnichar)0xdbff)
-#define IS_LOW_SURROGATE(u)  ((PRUnichar)(u) >= (PRUnichar)0xdc00 && (PRUnichar)(u) <= (PRUnichar)0xdfff)
-
-#define SURROGATE_TO_UCS4(h, l)  ((((PRUint32)(h)-(PRUint32)0xd800) << 10) +  \
-                                    (PRUint32)(l) - (PRUint32)(0xdc00) + 0x10000)
-
-#define H_SURROGATE(s) ((PRUnichar)(((PRUint32)s - (PRUint32)0x10000) >> 10) + (PRUnichar)0xd800)
-#define L_SURROGATE(s) ((PRUnichar)(((PRUint32)s - (PRUint32)0x10000) & 0x3ff) + (PRUnichar)0xdc00)
-#define IS_IN_BMP(ucs) ((PRUint32)ucs < 0x10000)
-
 /* (0x3131u <= (u) && (u) <= 0x318eu) => Hangul Compatibility Jamo */
 /* (0xac00u <= (u) && (u) <= 0xd7a3u) => Hangul Syllables          */
 #define IS_CJ_CHAR(u) \
diff --git a/mozilla/parser/htmlparser/src/nsHTMLTokens.cpp b/mozilla/parser/htmlparser/src/nsHTMLTokens.cpp
index 0bebf79ce98..3c502d258a5 100644
--- a/mozilla/parser/htmlparser/src/nsHTMLTokens.cpp
+++ b/mozilla/parser/htmlparser/src/nsHTMLTokens.cpp
@@ -2363,12 +2363,7 @@ static void AppendNCR(nsSubstring& aString, PRInt32 aNCRValue)
   }
 #endif
 
-  if (IS_IN_BMP(aNCRValue))
-    aString.Append(PRUnichar(aNCRValue));
-  else {
-    aString.Append(PRUnichar(H_SURROGATE(aNCRValue)));
-    aString.Append(PRUnichar(L_SURROGATE(aNCRValue)));
-  }
+  AppendUCS4ToUTF16(ENSURE_VALID_CHAR(aNCRValue), aString);
 }
 
 /*
diff --git a/mozilla/xpcom/string/public/nsCharTraits.h b/mozilla/xpcom/string/public/nsCharTraits.h
index e7713c9520f..84222311a5e 100644
--- a/mozilla/xpcom/string/public/nsCharTraits.h
+++ b/mozilla/xpcom/string/public/nsCharTraits.h
@@ -73,6 +73,37 @@
   typedef PRBool nsCharTraits_bool;
 #endif
 
+// Some macros for working with PRUnichar (UTF-16 code units) and UCS4 values
+#define PLANE1_BASE          PRUint32(0x00010000)
+// High surrogates are in the range 0xD800 -- 0xDBFF
+#define IS_HIGH_SURROGATE(u) ((PRUint32(u) & 0xFFFFFC00) == 0xD800)
+// Low surrogates are in the range 0xDC00 -- 0xDFFF
+#define IS_LOW_SURROGATE(u)  ((PRUint32(u) & 0xFFFFFC00) == 0xDC00)
+// Faster than testing IS_HIGH_SURROGATE || IS_LOW_SURROGATE
+#define IS_SURROGATE(u)      ((PRUint32(u) & 0xFFFFF800) == 0xD800)
+// The tests above compare all 32 bits, so a UCS4 value such as 0x1D800 is not
+// mistaken for a surrogate.  Not surrogates: 0x0000 -- 0xD7FF, 0xE000 and up.
+
+// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
+// I wonder whether we could somehow assert that H is a high surrogate
+// and L is a low surrogate
+#define SURROGATE_TO_UCS4(h, l) (((PRUint32(h) & 0x03FF) << 10) + \
+                                 (PRUint32(l) & 0x03FF) + PLANE1_BASE)
+
+// Extract surrogates from a UCS4 char; (c) must not be in the BMP.
+// See unicode specification 3.7 for following math.
+#define H_SURROGATE(c) PRUnichar(PRUnichar((PRUint32(c) - PLANE1_BASE) >> 10) | \
+                                 PRUnichar(0xD800))
+#define L_SURROGATE(c) PRUnichar((PRUnichar((PRUint32(c) - PLANE1_BASE) & 0x03FF) | \
+                                  PRUnichar(0xDC00)))
+
+#define IS_IN_BMP(ucs) (PRUint32(ucs) < PLANE1_BASE)
+#define UCS2_REPLACEMENT_CHAR PRUnichar(0xFFFD)
+// UCS_END is one past U+10FFFF.  The macros below evaluate (c) more than once.
+#define UCS_END PRUint32(0x00110000)
+#define IS_VALID_CHAR(c) ((PRUint32(c) < UCS_END) && !IS_SURROGATE(c))
+#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
+
 template <class CharT> struct nsCharTraits {};
 
 NS_SPECIALIZE_TEMPLATE
diff --git a/mozilla/xpcom/string/public/nsReadableUtils.h b/mozilla/xpcom/string/public/nsReadableUtils.h
index 799e846566c..e41e8d29dc6 100755
--- a/mozilla/xpcom/string/public/nsReadableUtils.h
+++ b/mozilla/xpcom/string/public/nsReadableUtils.h
@@ -376,4 +376,7 @@ NS_COM PRInt32
 CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
                    const nsASingleFragmentString& aUTF16String);
 
+// Appends aSource to aDest as UTF-16: a surrogate pair if outside the BMP.
+NS_COM void
+AppendUCS4ToUTF16(PRUint32 aSource, nsAString& aDest);
 #endif // !defined(nsReadableUtils_h___)
diff --git a/mozilla/xpcom/string/public/nsUTF8Utils.h b/mozilla/xpcom/string/public/nsUTF8Utils.h
index 6511389f332..a3b1db79d4c 100644
--- a/mozilla/xpcom/string/public/nsUTF8Utils.h
+++ b/mozilla/xpcom/string/public/nsUTF8Utils.h
@@ -38,6 +38,8 @@
 #ifndef nsUTF8Utils_h_
 #define nsUTF8Utils_h_
 
+#include "nsCharTraits.h"
+
 class UTF8traits
   {
     public:
@@ -50,9 +52,6 @@ class UTF8traits
       static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
   };
 
-#define PLANE1_BASE           0x00010000  
-#define UCS2_REPLACEMENT_CHAR 0xfffd     
-
 #ifdef __GNUC__
 #define NS_ALWAYS_INLINE __attribute__((always_inline))
 #else
@@ -285,14 +284,14 @@ public:
 
     PRUnichar c = *p++;
 
-    if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
+    if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
       {
         if (err)
           *err = PR_FALSE;
         *buffer = p;
         return c;
       }
-    else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+    else if (IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
       {
         if (*buffer == end)
           {
@@ -304,16 +303,15 @@ public:
           }
 
         // D800- DBFF - High Surrogate
-        // N = (H- D800) *400 + 10000 + ...
-        PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+        PRUnichar h = c;
 
         c = *p++;
 
-        if (0xDC00 == (0xFC00 & c))
+        if (IS_LOW_SURROGATE(c))
           {
             // DC00- DFFF - Low Surrogate
-            // N += ( L - DC00 )
-            ucs4 |= (0x03FF & c);
+            // N = (H - D800) *400 + 10000 + (L - DC00)
+            PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
             if (err)
               *err = PR_FALSE;
             *buffer = p;
@@ -351,13 +349,13 @@ public:
 
     PRUnichar c = *iter++;
 
-    if (0xD800 != (0xF800 & c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
+    if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
       {
         if (err)
           *err = PR_FALSE;
         return c;
       }
-    else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+    else if (IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
       {
         if (iter == end)
           {
@@ -368,16 +366,15 @@ public:
           }
 
         // D800- DBFF - High Surrogate
-        // N = (H- D800) *400 + 10000 + ...
-        PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+        PRUnichar h = c;
 
         c = *iter++;
 
-        if (0xDC00 == (0xFC00 & c))
+        if (IS_LOW_SURROGATE(c))
           {
             // DC00- DFFF - Low Surrogate
-            // N += ( L - DC00 )
-            ucs4 |= (0x03FF & c);
+            // N = (H - D800) *400 + 10000 + ( L - DC00 )
+            PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
             if (err)
               *err = PR_FALSE;
             return ucs4;
@@ -461,13 +458,11 @@ class ConvertUTF8toUTF16
               }
             else if ( ucs4 >= PLANE1_BASE )
               {
-                if ( ucs4 >= 0x00110000 )
+                if ( ucs4 >= UCS_END )
                   *out++ = UCS2_REPLACEMENT_CHAR;
                 else {
-                  // surrogate, see unicode specification 3.7 for following math.
-                  ucs4 -= PLANE1_BASE;
-                  *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
-                  *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
+                  *out++ = (value_type)H_SURROGATE(ucs4);
+                  *out++ = (value_type)L_SURROGATE(ucs4);
                 }
               }
             else
@@ -593,17 +588,16 @@ class ConvertUTF16toUTF8
                 *out++ = 0xC0 | (char)(c >> 6);
                 *out++ = 0x80 | (char)(0x003F & c);
               }
-            else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
+            else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
               {
                 *out++ = 0xE0 | (char)(c >> 12);
                 *out++ = 0x80 | (char)(0x003F & (c >> 6));
                 *out++ = 0x80 | (char)(0x003F & c );
               }
-            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+            else if (IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
               {
                 // D800- DBFF - High Surrogate
-                // N = (H- D800) *400 + 10000 + ...
-                PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+                value_type h = c;
 
                 ++p;
                 if (p == end)
@@ -614,11 +608,11 @@ class ConvertUTF16toUTF8
                   }
                 c = *p;
 
-                if (0xDC00 == (0xFC00 & c))
+                if (IS_LOW_SURROGATE(c))
                   {
                     // DC00- DFFF - Low Surrogate
-                    // N += ( L - DC00 )
-                    ucs4 |= (0x03FF & c);
+                    // N = (H - D800) *400 + 10000 + ( L - DC00 )
+                    PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
 
                     // 0001 0000-001F FFFF
                     *out++ = 0xF0 | (char)(ucs4 >> 18);
diff --git a/mozilla/xpcom/string/src/nsReadableUtils.cpp b/mozilla/xpcom/string/src/nsReadableUtils.cpp
index c10def95e75..b10881c6520 100755
--- a/mozilla/xpcom/string/src/nsReadableUtils.cpp
+++ b/mozilla/xpcom/string/src/nsReadableUtils.cpp
@@ -1166,3 +1166,19 @@ CompareUTF8toUTF16(const nsASingleFragmentCString& aUTF8String,
 
     return 0;
   }
+
+NS_COM
+void
+AppendUCS4ToUTF16(PRUint32 aSource, nsAString& aDest)
+  {
+    NS_ASSERTION(IS_VALID_CHAR(aSource), "Invalid UCS4 char");
+    if (!IS_IN_BMP(aSource))
+      {
+        // Outside the BMP: emit the high surrogate, then the low one.
+        aDest.Append(H_SURROGATE(aSource));
+        aDest.Append(L_SURROGATE(aSource));
+        return;
+      }
+    // Within the BMP: a single UTF-16 code unit is appended.
+    aDest.Append(PRUnichar(aSource));
+  }