Auto charset detection support.

git-svn-id: svn://10.0.0.236/trunk@33459 18797224-902f-48f8-a5cc-f745e15eee43
1999-06-03 01:08:25 +00:00 · 1999-06-03 01:08:25 +00:00 · dac458ca9f
commit dac458ca9f
parent d30d8feb8f
4 changed files with 240 additions and 23 deletions
--- a/mozilla/mailnews/mime/src/comi18n.cpp
+++ b/mozilla/mailnews/mime/src/comi18n.cpp
@ -754,7 +754,7 @@ char * utf8_mime_encode_mail_address(char *charset, const char *src, int maxLine
          return NULL;
        }
        // utf-8 to mail charset conversion (or iso-8859-1 in case of us-ascii).
-        if (MIME_ConvertCharset("utf-8", !PL_strcasecmp(charset, "us-ascii") ? "iso-8859-1" : charset, 
+        if (MIME_ConvertCharset(PR_FALSE, "utf-8", !PL_strcasecmp(charset, "us-ascii") ? "iso-8859-1" : charset, 
                                (const char*) begin, (const PRInt32) len, &buf1, (PRInt32 *) &iBufLen)) {
          PR_FREEIF(srcbuf);
          PR_FREEIF(retbuf);
@ -1254,6 +1254,185 @@ static PRInt32 INTL_ConvertFromUnicode(const char* to_charset, const void* uniBu
  return NS_SUCCEEDED(res) ? 0 : -1;
 }
 ////////////////////////////////////////////////////////////////////////////////
+#define USE_NONXPCOM_AUTODETECTION
+// TODO: This part should be entirely replaced by the XPCOM version of charset detection.
+#ifdef USE_NONXPCOM_AUTODETECTION
+class nsCharsetDetect {  // Interim, non-XPCOM charset detector; only compiled when USE_NONXPCOM_AUTODETECTION is defined.
+public:
+  PRBool AutoCharsetDetectionAvailable(const nsString& aCharset);  // PR_TRUE only if aCharset is Shift_JIS, EUC-JP or ISO-2022-JP (case-insensitive).
+  nsresult AutoCharsetDetectBuffer(const char* aBuffer, const PRInt32 aLen, PRBool& bDetected,
+                                   const nsString& aCharsetIn, nsString& aCharsetDetected);  // Guesses the charset of aLen bytes at aBuffer, using aCharsetIn as the hint; bDetected tells whether aCharsetDetected was filled in.
+};
+					/* values for EUC shift chars	*/
+#define SS2		0x8E		/* Single Shift 2: isEUCJP expects one half-width kana byte after it	*/
+#define SS3		0x8F		/* Single Shift 3: isEUCJP expects two JIS212 bytes after it	*/
+#define IsRoman(c)			((c) < 0x80)	/* true for 7-bit bytes, 0x00..0x7F */
+#define IsSJIS2ndByte(c)	(((c) > 0x3F) && ((c) < 0xFD))	/* true for 0x40..0xFC */
+#define IsLoSJIS2ndByte(c)	(((c) > 0x3F) && ((c) < 0xA1))	/* true for 0x40..0xA0 */
+#define IsHiSJIS2ndByte(c)	(((c) > 0xA0) && ((c) < 0xFD))	/* true for 0xA1..0xFC */
+#define IsEUCJPKana(b1)		(((b1) > 0xA0) && ((b1) < 0xE0))	/* true for 0xA1..0xDF */
+#define IsEUCJPKanji(b1or2)	(((b1or2) > 0xA0) && ((b1or2) < 0xFF))	/* true for 0xA1..0xFE */
+
+#define	YES		1	/* isSJIS/isEUCJP result: a sequence unique to that encoding was seen */
+#define NO		0	/* isSJIS/isEUCJP result: an illegal byte sequence was seen */
+#define	MAYBE	-1	/* isSJIS/isEUCJP result: buffer ended with no illegal values found */
+
+/*
+ * Scans a byte buffer and reports whether it can be Shift_JIS text.
+ *
+ * cp  - bytes to examine
+ * len - number of bytes at cp; nothing is examined when len <= 0
+ *
+ * Returns:
+ *   YES   - a double-byte sequence was found that is valid Shift_JIS and
+ *           cannot be EUC-JP (lead byte 0x81-0x9F other than SS2/SS3, or an
+ *           SS2/SS3 lead byte whose trail byte is not above 0xA0)
+ *   NO    - a byte sequence that is illegal in Shift_JIS was found
+ *   MAYBE - the buffer ended (possibly in the middle of a double-byte
+ *           character) without either of the above being found
+ */
+static int
+isSJIS(const unsigned char *cp, PRInt32 len)
+{
+	while (len > 0) {
+		if (IsRoman(*cp)) {
+			cp++, len--;
+		} else if (*cp == 0x80) {		/* illegal SJIS 1st byte			*/
+			return NO;
+		} else if (*cp < 0xA0) {		/* byte 1 of 2byte SJIS 1st range	*/
+			if (len > 1) {
+				if (IsSJIS2ndByte(cp[1])) {
+					/* Unambiguous unless this could also be an EUC-JP	*/
+					/* single-shift sequence (SS2/SS3 + byte > 0xA0).	*/
+					if ((*cp != SS2 && *cp != SS3) || (cp[1] <= 0xA0))
+						return YES;
+					cp += 2, len -= 2;	/* valid 2 byte SJIS				*/
+				} else {
+					return NO;			/* invalid SJIS	2nd byte			*/
+				}
+			} else
+				break;					/* buffer ended w/1of2 byte SJIS	*/
+		} else if (*cp == 0xA0) {		/* not a defined SJIS byte			*/
+#if ALLOW_NBSP
+			cp++, len--; /* allow nbsp */
+#else
+			/* Must not fall through: neither cp nor len would change	*/
+			/* and the loop would never terminate.						*/
+			return NO;
+#endif
+		} else if (*cp < 0xE0) {		/* SJIS half-width kana				*/
+			cp++, len--;
+		} else if (*cp < 0xF0) {		/* byte 1 of 2byte SJIS	 2nd range	*/
+			if (len > 1) {
+				if (IsSJIS2ndByte(cp[1])) {
+					cp += 2, len -= 2;	/* valid 2 byte SJIS				*/
+				} else {
+					return NO;			/* invalid SJIS						*/
+				}
+			} else
+				break;					/* buffer ended w/1of2 byte SJIS	*/
+		} else {
+			return NO;					/* invalid SJIS 1st byte			*/
+		}
+	}
+	return MAYBE;						/* No illegal SJIS values found		*/
+}
+
+/*
+ * Scans a byte buffer and reports whether it can be EUC-JP text.
+ *
+ * cp  - bytes to examine
+ * len - number of bytes at cp; nothing is examined when len <= 0
+ *
+ * Returns:
+ *   YES   - a valid two-byte sequence with lead byte 0xF0-0xFE was found
+ *           (isSJIS rejects those lead bytes, so the text is not Shift_JIS)
+ *   NO    - a byte sequence that is illegal in EUC-JP was found
+ *   MAYBE - the buffer ended (possibly in the middle of a multi-byte
+ *           character) without either of the above being found
+ */
+static int
+isEUCJP(const unsigned char *cp, PRInt32 len)
+{
+	while (len > 0) {
+		if (IsRoman(*cp)) {				/* Roman							*/
+			cp++, len--;
+		} else if (*cp == SS2) {		/* EUCJP JIS201 half-width kana		*/
+			if (len > 1) {
+				if (IsEUCJPKana(cp[1]))
+					cp += 2, len -= 2;	/* valid half-width kana			*/
+				else
+					return NO;			/* invalid 2of2 byte EUC			*/
+			} else
+				break;					/* buffer ended w/1of2 byte EUC		*/
+		} else if (*cp == SS3) {		/* EUCJP JIS212						*/
+			if (len > 1) {
+				if (IsEUCJPKanji(cp[1])) {
+					if (len > 2) {
+						if (IsEUCJPKanji(cp[2]))
+							/* Skip SS3 plus both character bytes; advancing	*/
+							/* by 2 would re-read byte 3 as a new lead byte.	*/
+							cp += 3, len -= 3;	/* valid 3 byte EUCJP		*/
+						else
+							return NO;	/* invalid 3of3 byte EUCJP			*/
+					} else
+						break;			/* buffer ended w/2of3 byte EUCJP	*/
+				} else
+					return NO;			/* invalid 2of3 byte EUCJP			*/
+			} else
+				break;					/* buffer ended w/1of3 byte EUCJP	*/
+		} else if (*cp < 0xA0) {		/* 0x80-0x9F other than SS2/SS3		*/
+			/* Not an EUCJP lead byte. Accepting it here would let SJIS	*/
+			/* pairs such as 0x82 0xA2 pass as EUCJP.					*/
+			return NO;
+		} else if (*cp == 0xA0) {		/* illegal EUCJP byte				*/
+#if ALLOW_NBSP
+			cp++, len--; /* allow nbsp */
+#else
+			return NO;
+#endif
+		} else if (*cp < 0xF0) {		/* EUCJP JIS208 (overlaps SJIS)		*/
+			if (len > 1) {
+				if (IsEUCJPKanji(cp[1]))
+					cp += 2, len -= 2;	/* valid 2 byte EUCJP				*/
+				else
+					return NO;			/* invalid 2of2 byte EUCJP			*/
+			} else
+				break;					/* buffer ended w/1of2 byte EUCJP	*/
+		} else if (*cp < 0xFF) {		/* EUCJP JIS208 only:				*/
+			if (len > 1) {
+				if (IsEUCJPKanji(cp[1]))
+					return YES;			/* valid 2 byte EUCJP, invalid SJIS	*/
+				else
+					return NO;			/* invalid 2of2 byte EUCJP			*/
+			} else
+				break;					/* buffer ended w/1of2 byte EUCJP	*/
+		} else {
+			return NO;					/* invalid EUCJP 1st byte: 0xFF		*/
+		}
+	}
+	return MAYBE;						/* No illegal EUCJP values found	*/
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Tells whether auto detection can be attempted for the given charset hint.
+// Returns PR_TRUE when aCharset names (case-insensitively) one of the three
+// Japanese charsets below, PR_FALSE for anything else.
+PRBool nsCharsetDetect::AutoCharsetDetectionAvailable(const nsString& aCharset)
+{
+  if (aCharset.EqualsIgnoreCase("Shift_JIS")) {
+    return PR_TRUE;
+  }
+  if (aCharset.EqualsIgnoreCase("EUC-JP")) {
+    return PR_TRUE;
+  }
+  if (aCharset.EqualsIgnoreCase("ISO-2022-JP")) {
+    return PR_TRUE;
+  }
+  return PR_FALSE;
+}
+
+// Guesses the charset of a byte buffer.
+//
+// aBuffer          [IN]  bytes to examine
+// aLen             [IN]  number of bytes at aBuffer
+// bDetected        [OUT] PR_FALSE if detection is not available for aCharsetIn
+//                        (aCharsetDetected is then left untouched), else PR_TRUE
+// aCharsetIn       [IN]  charset hint; see AutoCharsetDetectionAvailable
+// aCharsetDetected [OUT] when bDetected is PR_TRUE, one of:
+//                        "ISO-8859-1"  - no ESC and no 8-bit byte in the buffer, or
+//                                        8-bit data that is neither EUC-JP nor Shift_JIS
+//                        "ISO-2022-JP" - an ESC precedes any 8-bit byte
+//                        "EUC-JP"      - isEUCJP reports YES or MAYBE
+//                        "Shift_JIS"   - isEUCJP reports NO, isSJIS reports YES or MAYBE
+//
+// Always returns NS_OK.
+nsresult nsCharsetDetect::AutoCharsetDetectBuffer(const char* aBuffer, const PRInt32 aLen, PRBool& bDetected,
+                                                  const nsString& aCharsetIn, nsString& aCharsetDetected)
+{
+  if (!AutoCharsetDetectionAvailable(aCharsetIn)) {
+    bDetected = PR_FALSE;
+    return NS_OK;
+  }
+
+  // Stop at the first byte that is ESC or has the high bit set. The outcome is
+  // kept in local flags so the result never depends on whatever the caller
+  // left in aCharsetDetected.
+  PRBool foundEscape = PR_FALSE;
+  PRBool foundEightBit = PR_FALSE;
+  for (PRInt32 i = 0; i < aLen; i++) {
+    const unsigned char ch = (unsigned char) aBuffer[i];
+    if (ch == 0x1B) {
+      foundEscape = PR_TRUE;
+      break;
+    }
+    if (ch > 127) {
+      foundEightBit = PR_TRUE;
+      break;
+    }
+  }
+
+  if (foundEscape) {
+    aCharsetDetected.SetString("ISO-2022-JP");
+  }
+  else if (!foundEightBit) {
+    // 7 bit only
+    aCharsetDetected.SetString("ISO-8859-1");
+  }
+  else {
+    // use old japanese auto detect code; EUC-JP is tried first
+    const unsigned char* bytes = (const unsigned char *) aBuffer;
+    const int euc = isEUCJP(bytes, aLen);
+    if (YES == euc || MAYBE == euc) {
+      aCharsetDetected.SetString("EUC-JP");
+    }
+    else {
+      const int sjis = isSJIS(bytes, aLen);
+      if (YES == sjis || MAYBE == sjis) {
+        aCharsetDetected.SetString("Shift_JIS");
+      }
+      else {
+        aCharsetDetected.SetString("ISO-8859-1");
+      }
+    }
+  }
+  bDetected = PR_TRUE;
+
+  return NS_OK;
+}
+#endif//USE_NONXPCOM_AUTODETECTION
+
 class MimeCharsetConverterClass {
 public:
  MimeCharsetConverterClass();
@ -1272,7 +1451,7 @@ public:
 protected:
  nsIUnicodeDecoder * GetUnicodeDecoder() {return (mAutoDetect && NULL != mDecoderDetected) ? mDecoderDetected : mDecoder;}
  nsIUnicodeEncoder * GetUnicodeEncoder() {return mEncoder;}
-  PRBool NeedCharsetConversion(const char* from_charset, const char* to_charset);
+  PRBool NeedCharsetConversion(const nsString& from_charset, const nsString& to_charset);

 private:
  nsIUnicodeDecoder *mDecoder;          // decoder (convert to unicode)  
@ -1282,6 +1461,8 @@ private:
                                        // (-1 for no limit)
  PRInt32 mNumChars;                    // accumulated number of characters converted in bytes
  PRBool mAutoDetect;                   // true if apply auto detection
+  nsString mInputCharset;               // input charset for auto detection hint as well as need conversion check
+  nsString mOutputCharset;              // output charset for need conversion check
 };

 MimeCharsetConverterClass::MimeCharsetConverterClass()
@ -1306,8 +1487,20 @@ PRInt32 MimeCharsetConverterClass::Initialize(const char* from_charset, const ch
 {
  nsresult res;

+  NS_ASSERTION(NULL == mEncoder, "No reinitialization allowed.");
+
+  mInputCharset.SetString(from_charset);     // remember input charset for a hint
+  mOutputCharset.SetString(to_charset);      // remember output charset
+  mAutoDetect = autoDetect;
+  mMaxNumCharsDetect = maxNumCharsDetect;
+
+  // Check if auto detection is available for the input charset
+  nsCharsetDetect aCharsetDetect;
+  if (mAutoDetect && !aCharsetDetect.AutoCharsetDetectionAvailable(from_charset)) {
+    mAutoDetect = PR_FALSE;
+  }
  // No need to do the conversion then do not create converters. 
-  if (!autoDetect && !NeedCharsetConversion(from_charset, to_charset)) {
+  if (!mAutoDetect && !NeedCharsetConversion(mInputCharset, mOutputCharset)) {
    return 0;
  }

@ -1320,7 +1513,7 @@ PRInt32 MimeCharsetConverterClass::Initialize(const char* from_charset, const ch
    // create a decoder (conv to unicode), ok if failed if we do auto detection
    aCharset.SetString(from_charset);
    res = ccm->GetUnicodeDecoder(&aCharset, &mDecoder);
-    if (NS_FAILED(res) && !autoDetect) {
+    if (NS_FAILED(res) && !mAutoDetect) {
      return -1;
    }
    // create an encoder (conv from unicode)
@ -1334,14 +1527,13 @@ PRInt32 MimeCharsetConverterClass::Initialize(const char* from_charset, const ch
    return -1;
  }

-  mAutoDetect = autoDetect;
-  mMaxNumCharsDetect = maxNumCharsDetect;
-
  return 0;
 }

 PRInt32 MimeCharsetConverterClass::Convert(const char* inBuffer, const PRInt32 inLength, char** outBuffer, PRInt32* outLength)
 {
+  nsresult res;
+
  // Encoder is not available, duplicate the input.
  if (NULL == mEncoder) {
    *outBuffer = (char *) PR_Malloc(inLength+1);
@ -1359,7 +1551,34 @@ PRInt32 MimeCharsetConverterClass::Convert(const char* inBuffer, const PRInt32 i

  // try auto detection for this string
  if (mAutoDetect && (mMaxNumCharsDetect == -1 || mMaxNumCharsDetect > mNumChars)) {
-    // Call NeedCharsetConversion for detected charset, avoid create decoder.
+    nsCharsetDetect aCharsetDetect;   //TODO: replace this by XPCOM interface
+    PRBool bDetected;
+    nsString aCharsetDetected;
+    res = aCharsetDetect.AutoCharsetDetectBuffer(inBuffer, inLength, bDetected, mInputCharset, aCharsetDetected);
+    if (NS_SUCCEEDED(res) && bDetected) {
+      // Check if need a conversion.
+      if (!NeedCharsetConversion(aCharsetDetected, mOutputCharset)) {
+        *outBuffer = (char *) PR_Malloc(inLength+1);
+        if (NULL != *outBuffer) {
+          nsCRT::memcpy(*outBuffer, inBuffer, inLength);
+          *outLength = inLength;
+          (*outBuffer)[inLength] = '\0';
+          return 0;
+        }
+        return -1;
+      }
+      else {
+        NS_WITH_SERVICE(nsICharsetConverterManager, ccm, kCharsetConverterManagerCID, &res); 
+        if (NS_SUCCEEDED(res) && (nsnull != ccm)) {
+          NS_IF_RELEASE(mDecoderDetected);
+          mDecoderDetected = nsnull;
+          res = ccm->GetUnicodeDecoder(&aCharsetDetected, &mDecoderDetected);
+          if (NS_SUCCEEDED(res)) {
+            decoder = mDecoderDetected;   // use detected charset instead
+          }
+        }
+      }
+    }
  }

  // update the total so far
@ -1378,7 +1597,6 @@ PRInt32 MimeCharsetConverterClass::Convert(const char* inBuffer, const PRInt32 i
  }

  // do the conversion
-  nsresult res;
  PRUnichar *unichars;
  PRInt32 unicharLength;
  PRInt32 srcLen = inLength;
@ -1417,17 +1635,15 @@ PRInt32 MimeCharsetConverterClass::Convert(const char* inBuffer, const PRInt32 i
  return NS_SUCCEEDED(res) ? 0 : -1;
 }

-PRBool MimeCharsetConverterClass::NeedCharsetConversion(const char* from_charset, const char* to_charset)
+PRBool MimeCharsetConverterClass::NeedCharsetConversion(const nsString& from_charset, const nsString& to_charset)
 {
-  if (nsnull == from_charset || nsnull == to_charset || 
-      '\0' == *from_charset || '\0' == *to_charset) 
+  if (from_charset.Length() == 0 || to_charset.Length() == 0) 
    return PR_FALSE;
-  else if (!PL_strcasecmp(from_charset, to_charset)) {
+  else if (from_charset.EqualsIgnoreCase(to_charset)) {
    return PR_FALSE;
  }
-  else if ((!PL_strcasecmp(from_charset, "us-ascii") && !PL_strcasecmp(to_charset, "utf-8")) ||
-      (!PL_strcasecmp(from_charset, "utf-8") && !PL_strcasecmp(to_charset, "us-ascii")))
-  {
+  else if ((from_charset.EqualsIgnoreCase("us-ascii") && to_charset.EqualsIgnoreCase("utf-8")) ||
+      (from_charset.EqualsIgnoreCase("utf-8") && to_charset.EqualsIgnoreCase("us-ascii"))) {
    return PR_FALSE;
  }
  return PR_TRUE;
@ -1461,16 +1677,16 @@ PRInt32 MIME_ConvertString(const char* from_charset, const char* to_charset,
                           const char* inCstring, char** outCstring)
 {
  PRInt32 outLength;
-  return MIME_ConvertCharset(from_charset, to_charset, inCstring, PL_strlen(inCstring), outCstring, &outLength);
+  return MIME_ConvertCharset(PR_FALSE, from_charset, to_charset, inCstring, PL_strlen(inCstring), outCstring, &outLength);
 }

-PRInt32 MIME_ConvertCharset(const char* from_charset, const char* to_charset,
+PRInt32 MIME_ConvertCharset(const PRBool autoDetection, const char* from_charset, const char* to_charset,
                            const char* inBuffer, const PRInt32 inLength, char** outBuffer, PRInt32* outLength)
 {
  MimeCharsetConverterClass aMimeCharsetConverterClass;
  PRInt32 res;

-  res = aMimeCharsetConverterClass.Initialize(from_charset, to_charset, PR_FALSE, -1);
+  res = aMimeCharsetConverterClass.Initialize(from_charset, to_charset, autoDetection, -1);

  if (res != -1) {
    res = aMimeCharsetConverterClass.Convert(inBuffer, inLength, outBuffer, outLength);
--- a/mozilla/mailnews/mime/src/comi18n.h
+++ b/mozilla/mailnews/mime/src/comi18n.h
@ -83,6 +83,7 @@ PRInt32 MIME_ConvertString(const char* from_charset, const char* to_charset,
 * since this will not save a state info (i.e. converter instance will be created/destroyed for every call).
 * The caller should instanticate converters by XPCOM for that purpose.
 *
+ * @param autoDetection [IN] True if apply auto charset detection.
 * @param from_charset[IN] A charset name in C string.
 * @param to_charset  [IN] A charset name in C string.
 * @param inBuffer    [IN] Input buffer to convert.
@ -91,7 +92,7 @@ PRInt32 MIME_ConvertString(const char* from_charset, const char* to_charset,
 * @param outLength   [OUT] Converted buffer length is set.
 * @return            0 is success, otherwise error.
 */
-PRInt32 MIME_ConvertCharset(const char* from_charset, const char* to_charset,
+PRInt32 MIME_ConvertCharset(const PRBool autoDetection, const char* from_charset, const char* to_charset,
                            const char* inBuffer, const PRInt32 inLength, char** outBuffer, PRInt32* outLength);

 /**
--- a/mozilla/mailnews/mime/src/mimemoz2.cpp
+++ b/mozilla/mailnews/mime/src/mimemoz2.cpp
@ -154,7 +154,7 @@ mime_convert_charset (const char *input_line, PRInt32 input_length,
  // Now do conversion to UTF-8 for output
  char  *convertedString = NULL;
  PRInt32 convertedStringLen;
-  PRInt32 res = MIME_ConvertCharset(input_charset, "UTF-8", input_line, input_length, 
+  PRInt32 res = MIME_ConvertCharset(PR_TRUE, input_charset, "UTF-8", input_line, input_length, 
                                    &convertedString, &convertedStringLen);
  if (res != 0)
  {
--- a/mozilla/mailnews/mime/src/nsMsgHeaderParser.cpp
+++ b/mozilla/mailnews/mime/src/nsMsgHeaderParser.cpp
@ -99,7 +99,7 @@ nsresult nsMsgHeaderParser::ParseHeaderAddresses (const char *charset, const cha
      s += len;
    }
    // convert array of strings
-    if (MIME_ConvertCharset("UTF-8", CHARSET(charset), *names, len_all, &outStrings, &outStrLen) == 0) {
+    if (MIME_ConvertCharset(PR_FALSE, "UTF-8", CHARSET(charset), *names, len_all, &outStrings, &outStrLen) == 0) {
      PR_Free(*names);
      *names = outStrings;
    }
@ -113,7 +113,7 @@ nsresult nsMsgHeaderParser::ParseHeaderAddresses (const char *charset, const cha
      s += len;
    }
    // convert array of strings
-    if (MIME_ConvertCharset("UTF-8", CHARSET(charset), *addresses, len_all, &outStrings, &outStrLen) == 0) {
+    if (MIME_ConvertCharset(PR_FALSE, "UTF-8", CHARSET(charset), *addresses, len_all, &outStrings, &outStrLen) == 0) {
      PR_Free(*addresses);
      *addresses = outStrings;
    }