From 83734eae5030fa271f7d36dd8d71a0eef5f37c55 Mon Sep 17 00:00:00 2001 From: "ftang%netscape.com" Date: Mon, 30 Aug 1999 22:25:17 +0000 Subject: [PATCH] 1. detect BOM, 2. Implement XML 1.0 Appendex F, 3. move hack from nsParser.cpp to nsScanner.cpp so we won't damage UCS2 data git-svn-id: svn://10.0.0.236/trunk@45206 18797224-902f-48f8-a5cc-f745e15eee43 --- mozilla/htmlparser/src/nsParser.cpp | 127 +++++++++++++++++++- mozilla/htmlparser/src/nsScanner.cpp | 10 ++ mozilla/parser/htmlparser/src/nsParser.cpp | 127 +++++++++++++++++++- mozilla/parser/htmlparser/src/nsScanner.cpp | 10 ++ 4 files changed, 272 insertions(+), 2 deletions(-) diff --git a/mozilla/htmlparser/src/nsParser.cpp b/mozilla/htmlparser/src/nsParser.cpp index 957dbd77b99..f8dd177eacd 100644 --- a/mozilla/htmlparser/src/nsParser.cpp +++ b/mozilla/htmlparser/src/nsParser.cpp @@ -16,7 +16,8 @@ * Reserved. */ - +#define DEBUG_XMLENCODING +#define XMLENCODING_PEEKBYTES 64 #include "nsParser.h" #include "nsIContentSink.h" @@ -1086,6 +1087,109 @@ nsresult nsParser::OnStartRequest(nsIURI* aURL, const char *aSourceType) return NS_OK; } + +#define UCS2_BE "X-ISO-10646-UCS-2-BE" +#define UCS2_LE "X-ISO-10646-UCS-2-LE" +#define UCS4_BE "X-ISO-10646-UCS-4-BE" +#define UCS4_LE "X-ISO-10646-UCS-4-LE" +#define UCS4_2143 "X-ISO-10646-UCS-4-2143" +#define UCS4_3412 "X-ISO-10646-UCS-4-3412" + +static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, + nsString& oCharset, nsCharsetSource& oCharsetSource) +{ + oCharsetSource= kCharsetFromAutoDetection; + oCharset = ""; + // see http://www.w3.org/TR/1998/REC-xml-19980210#sec-oCharseting + // for details + switch(aBytes[0]) + { + case 0x00: + if(0x00==aBytes[1]) { + // 00 00 + if((0x00==aBytes[2]) && (0x3C==aBytes[3])) { + // 00 00 00 3C UCS-4, big-endian machine (1234 order) + oCharset = UCS4_BE; + } else if((0x3C==aBytes[2]) && (0x00==aBytes[3])) { + // 00 00 3C 00 UCS-4, unusual octet order (2143) + oCharset = UCS4_2143; + } + } else if(0x3C==aBytes[1]) { + // 00 3C + if((0x00==aBytes[2]) && (0x00==aBytes[3])) { + // 00 3C 00 00 UCS-4, unusual octet order (3412) + oCharset = UCS4_3412; + } else if((0x3C==aBytes[2]) && (0x3F==aBytes[3])) { + // 00 3C 00 3F UTF-16, big-endian, no Byte Order Mark + oCharset = UCS2_BE; // should change to UTF-16BE + } + } + break; + case 0x3C: + if(0x00==aBytes[1]) { + // 3C 00 + if((0x00==aBytes[2]) && (0x00==aBytes[3])) { + // 3C 00 00 00 UCS-4, little-endian machine (4321 order) + oCharset = UCS4_LE; + } else if((0x3F==aBytes[2]) && (0x00==aBytes[3])) { + // 3C 00 3F 00 UTF-16, little-endian, no Byte Order Mark + oCharset = UCS2_LE; // should change to UTF-16LE + } + } else if((0x3C==aBytes[0]) && (0x3F==aBytes[1]) && + (0x78==aBytes[2]) && (0x6D==aBytes[3]) && + (0 == PL_strncmp(" XMLENCODING_PEEKBYTES)? + XMLENCODING_PEEKBYTES: + aLen)); + PRInt32 xmlDeclEnd = firstXbytes.Find("?>", PR_FALSE, 13); + // 27 == strlen("0) { + firstXbytes.Mid(oCharset,(encStart+1), count); + oCharsetSource= kCharsetFromMetaTag; + } + } + } + } + } + } + break; + case 0xFE: + if(0xFF==aBytes[1]) { + // FE FF + // UTF-16, big-endian + oCharset = UCS2_BE; // should change to UTF-16BE + } + break; + case 0xFF: + if(0xFE==aBytes[1]) { + // FF FE + // UTF-16, little-endian + oCharset = UCS2_LE; // should change to UTF-16LE + } + break; + // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) { + // We do not care EBCIDIC here.... + // } + // break; + } // switch + return oCharset.Length() > 0; +} + + /** * * @@ -1126,13 +1230,33 @@ nsresult nsParser::OnDataAvailable(nsIURI* aURL, nsIInputStream *pIStream, PRUin int theStartPos=0; nsresult result=NS_OK; + PRBool needCheckFirst4Bytes = + ((0 == sourceOffset) && (mCharsetSource0) && (aLength>theTotalRead) && (NS_OK==result)) { result = pIStream->Read(mParserContext->mTransferBuffer, aLength, &theNumRead); if((result == NS_OK) && (theNumRead>0)) { + if(needCheckFirst4Bytes && (theNumRead >= 4)) { + nsCharsetSource guessSource; + nsAutoString guess(""); + + needCheckFirst4Bytes = PR_FALSE; + if(detectByteOrderMark((const unsigned char*)mParserContext->mTransferBuffer, + theNumRead, guess, guessSource)) + { +#ifdef DEBUG_XMLENCODING + printf("xmlencoding detect- %s\n", guess.ToNewCString()); +#endif + this->SetDocumentCharset(guess, guessSource); + mParserContext->mScanner->SetDocumentCharset(guess, guessSource); + } + } theTotalRead+=theNumRead; if(mParserFilter) mParserFilter->RawBuffer(mParserContext->mTransferBuffer, &theNumRead); +#if 0 + // The following Hack have moved to nsScanner.cpp + // Remove that Hack if you feel this hack is not necessary // XXX Hack --- NULL character(s) is(are) seen in the middle of the buffer!!! // For now, I'm conditioning the raw buffer by removing the unwanted null chars. // Problem could be NECKO related @@ -1141,6 +1265,7 @@ nsresult nsParser::OnDataAvailable(nsIURI* aURL, nsIInputStream *pIStream, PRUin if(mParserContext->mTransferBuffer[i]==kNullCh) mParserContext->mTransferBuffer[i]=kSpace; } +#endif #ifdef NS_DEBUG int index=0; diff --git a/mozilla/htmlparser/src/nsScanner.cpp b/mozilla/htmlparser/src/nsScanner.cpp index b22f867ed3e..52c8dba59d0 100644 --- a/mozilla/htmlparser/src/nsScanner.cpp +++ b/mozilla/htmlparser/src/nsScanner.cpp @@ -257,6 +257,16 @@ PRBool nsScanner::Append(const char* aBuffer, PRUint32 aLen){ PRInt32 unicharLength = unicharBufLen; res = mUnicodeDecoder->Convert(unichars, 0, &unicharLength,aBuffer, 0, &srcLength ); unichars[unicharLength]=0; //add this since the unicode converters can't be trusted to do so. + + + // Move the nsParser.cpp 00 -> space hack to here so + // it won't break UCS2 file + // Hack Start + for(PRInt32 i=0;i XMLENCODING_PEEKBYTES)? + XMLENCODING_PEEKBYTES: + aLen)); + PRInt32 xmlDeclEnd = firstXbytes.Find("?>", PR_FALSE, 13); + // 27 == strlen("0) { + firstXbytes.Mid(oCharset,(encStart+1), count); + oCharsetSource= kCharsetFromMetaTag; + } + } + } + } + } + } + break; + case 0xFE: + if(0xFF==aBytes[1]) { + // FE FF + // UTF-16, big-endian + oCharset = UCS2_BE; // should change to UTF-16BE + } + break; + case 0xFF: + if(0xFE==aBytes[1]) { + // FF FE + // UTF-16, little-endian + oCharset = UCS2_LE; // should change to UTF-16LE + } + break; + // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) { + // We do not care EBCIDIC here.... + // } + // break; + } // switch + return oCharset.Length() > 0; +} + + /** * * @@ -1126,13 +1230,33 @@ nsresult nsParser::OnDataAvailable(nsIURI* aURL, nsIInputStream *pIStream, PRUin int theStartPos=0; nsresult result=NS_OK; + PRBool needCheckFirst4Bytes = + ((0 == sourceOffset) && (mCharsetSource0) && (aLength>theTotalRead) && (NS_OK==result)) { result = pIStream->Read(mParserContext->mTransferBuffer, aLength, &theNumRead); if((result == NS_OK) && (theNumRead>0)) { + if(needCheckFirst4Bytes && (theNumRead >= 4)) { + nsCharsetSource guessSource; + nsAutoString guess(""); + + needCheckFirst4Bytes = PR_FALSE; + if(detectByteOrderMark((const unsigned char*)mParserContext->mTransferBuffer, + theNumRead, guess, guessSource)) + { +#ifdef DEBUG_XMLENCODING + printf("xmlencoding detect- %s\n", guess.ToNewCString()); +#endif + this->SetDocumentCharset(guess, guessSource); + mParserContext->mScanner->SetDocumentCharset(guess, guessSource); + } + } theTotalRead+=theNumRead; if(mParserFilter) mParserFilter->RawBuffer(mParserContext->mTransferBuffer, &theNumRead); +#if 0 + // The following Hack have moved to nsScanner.cpp + // Remove that Hack if you feel this hack is not necessary // XXX Hack --- NULL character(s) is(are) seen in the middle of the buffer!!! // For now, I'm conditioning the raw buffer by removing the unwanted null chars. // Problem could be NECKO related @@ -1141,6 +1265,7 @@ nsresult nsParser::OnDataAvailable(nsIURI* aURL, nsIInputStream *pIStream, PRUin if(mParserContext->mTransferBuffer[i]==kNullCh) mParserContext->mTransferBuffer[i]=kSpace; } +#endif #ifdef NS_DEBUG int index=0; diff --git a/mozilla/parser/htmlparser/src/nsScanner.cpp b/mozilla/parser/htmlparser/src/nsScanner.cpp index b22f867ed3e..52c8dba59d0 100644 --- a/mozilla/parser/htmlparser/src/nsScanner.cpp +++ b/mozilla/parser/htmlparser/src/nsScanner.cpp @@ -257,6 +257,16 @@ PRBool nsScanner::Append(const char* aBuffer, PRUint32 aLen){ PRInt32 unicharLength = unicharBufLen; res = mUnicodeDecoder->Convert(unichars, 0, &unicharLength,aBuffer, 0, &srcLength ); unichars[unicharLength]=0; //add this since the unicode converters can't be trusted to do so. + + + // Move the nsParser.cpp 00 -> space hack to here so + // it won't break UCS2 file + // Hack Start + for(PRInt32 i=0;i