diff --git a/mozilla/htmlparser/src/nsHTMLContentSinkStream.cpp b/mozilla/htmlparser/src/nsHTMLContentSinkStream.cpp index c7743085442..ac20b9d799b 100644 --- a/mozilla/htmlparser/src/nsHTMLContentSinkStream.cpp +++ b/mozilla/htmlparser/src/nsHTMLContentSinkStream.cpp @@ -36,6 +36,14 @@ #include "nsIParser.h" #include "nsHTMLEntities.h" + + +#include "nsIUnicodeEncoder.h" +#include "nsICharsetAlias.h" +#include "nsIServiceManager.h" +#include "nsICharsetConverterManager.h" + + static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID); static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID); static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID); @@ -343,6 +351,58 @@ NS_New_HTML_ContentSinkStream(nsIHTMLContentSink** aInstancePtrResult, return it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult); } + + +/** + * Inits the encoder instance variable for the sink based on the charset + * + * @update gpk 4/21/99 + * @param aCharset + * @return NS_xxx error result + */ +nsresult nsHTMLContentSinkStream::InitEncoder(const nsString& aCharset) +{ + + nsresult res = NS_OK; + + nsICharsetAlias* calias = nsnull; + res = nsServiceManager::GetService(kCharsetAliasCID, + kICharsetAliasIID, + (nsISupports**)&calias); + + NS_ASSERTION( nsnull != calias, "cannot find charet alias"); + nsAutoString charsetName = aCharset; + if( NS_SUCCEEDED(res) && (nsnull != calias)) + { + res = calias->GetPreferred(aCharset, charsetName); + nsServiceManager::ReleaseService(kCharsetAliasCID, calias); + + if(NS_FAILED(res)) + { + // failed - unknown alias , fallback to ISO-8859-1 + charsetName = "ISO-8859-1"; + } + + nsICharsetConverterManager * ccm = nsnull; + res = nsServiceManager::GetService(kCharsetConverterManagerCID, + kICharsetConverterManagerIID, + (nsISupports**)&ccm); + if(NS_SUCCEEDED(res) && (nsnull != ccm)) + { + nsIUnicodeEncoder * encoder = nsnull; + res = ccm->GetUnicodeEncoder(&charsetName, &encoder); + if(NS_SUCCEEDED(res) && (nsnull != encoder)) + { + NS_IF_RELEASE(mUnicodeEncoder); + mUnicodeEncoder = encoder; + } + nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); + } + } + return res; +} + + /** * Construct a content sink stream. * @update gess7/7/98 @@ -361,6 +421,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead mDoHeader = aDoHeader; mBuffer = nsnull; mBufferSize = 0; + mUnicodeEncoder = nsnull; } /** @@ -381,6 +442,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoForm mDoHeader = aDoHeader; mBuffer = nsnull; mBufferSize = 0; + mUnicodeEncoder = nsnull; } @@ -443,9 +505,16 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) const char* entity = nsnull; PRUint32 offset = 0; PRUint32 addedLength = 0; + nsAutoString data; + + + if (mUnicodeEncoder == nsnull) + InitEncoder(""); if (length > 0) { + // Step 1. Convert anything that maps to character entity to + // the entity value EnsureBufferSize(length); for (PRInt32 i = 0; i < length; i++) { @@ -454,29 +523,31 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) entity = UnicodeToEntity(ch); if (entity) { - PRUint32 size = strlen(entity); - addedLength += size; - EnsureBufferSize(length+addedLength+1); - mBuffer[offset++] = '&'; - mBuffer[offset] = 0; - strcat(mBuffer,entity); - - PRUint32 temp = offset + size; - while (offset < temp) - { - mBuffer[offset] = tolower(mBuffer[offset]); - offset++; - } - mBuffer[offset++] = ';'; - mBuffer[offset] = 0; + nsAutoString temp(entity); + + temp.ToLowerCase(); + data.Append('&'); + data.Append(temp); + data.Append(';'); } - else if (ch < 128) + else { - mBuffer[offset++] = (unsigned char)ch; - mBuffer[offset] = 0; + data.Append(ch); } } - } + + // Step 2. Run the result through the converter + length = data.Length(); + EnsureBufferSize(length); + PRInt32 bufferLength = mBufferSize; + + mUnicodeEncoder->Reset(); + nsresult result = mUnicodeEncoder->Convert(data, &length, mBuffer, &bufferLength); + mBuffer[bufferLength] = 0; + PRInt32 temp = bufferLength; + if (NS_SUCCEEDED(result)) + result = mUnicodeEncoder->Finish(mBuffer,&temp); + } } @@ -487,6 +558,7 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) * @return */ nsHTMLContentSinkStream::~nsHTMLContentSinkStream() { + NS_IF_RELEASE(mUnicodeEncoder); mOutput=0; //we don't own the stream we're given; just forget it. } @@ -526,8 +598,8 @@ void nsHTMLContentSinkStream::WriteAttributes(const nsIParserNode& aNode,ostream key.ToUpperCase(); - - key.ToCString(mBuffer,sizeof(gBuffer)-1); + EnsureBufferSize(key.Length()); + key.ToCString(mBuffer,mBufferSize); aStream << " " << mBuffer << char(kEqual); mColPos += 1 + strlen(mBuffer) + 1; @@ -993,7 +1065,14 @@ nsHTMLContentSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream){ AddStartTag(aNode,aStream); mHTMLTagStack[--mHTMLStackPos] = eHTMLTag_unknown; } - if (type == eHTMLTag_text) + else if (type == eHTMLTag_entity) + { + const nsString& entity = aNode.GetText(); + UnicodeToHTMLString(entity); + aStream << '&' << mBuffer << ';'; + mColPos += entity.Length() + 2; + } + else if (type == eHTMLTag_text) { const nsString& text = aNode.GetText(); if ((mDoFormat == PR_FALSE) || preformatted == PR_TRUE) @@ -1136,9 +1215,25 @@ nsHTMLContentSinkStream::AddComment(const nsIParserNode& aNode){ */ NS_IMETHODIMP nsHTMLContentSinkStream::OpenContainer(const nsIParserNode& aNode){ - if(mOutput) { - AddStartTag(aNode,*mOutput); -// eHTMLTags tag = (eHTMLTags)aNode.GetNodeType(); + if(mOutput) + { + const nsString& name = aNode.GetText(); + if (name.Equals("XIF_DOC_INFO")) + { + PRInt32 count=aNode.GetAttributeCount(); + for(PRInt32 i=0;iGetPreferred(aCharset, charsetName); + nsServiceManager::ReleaseService(kCharsetAliasCID, calias); + + if(NS_FAILED(res)) + { + // failed - unknown alias , fallback to ISO-8859-1 + charsetName = "ISO-8859-1"; + } + + nsICharsetConverterManager * ccm = nsnull; + res = nsServiceManager::GetService(kCharsetConverterManagerCID, + kICharsetConverterManagerIID, + (nsISupports**)&ccm); + if(NS_SUCCEEDED(res) && (nsnull != ccm)) + { + nsIUnicodeEncoder * encoder = nsnull; + res = ccm->GetUnicodeEncoder(&charsetName, &encoder); + if(NS_SUCCEEDED(res) && (nsnull != encoder)) + { + NS_IF_RELEASE(mUnicodeEncoder); + mUnicodeEncoder = encoder; + } + nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); + } + } + return res; +} + @@ -117,6 +172,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() { mDoOutput = PR_FALSE; mBufferSize = 0; mBuffer = nsnull; + mUnicodeEncoder = nsnull; } /** @@ -133,6 +189,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { mDoOutput = PR_FALSE; mBufferSize = 0; mBuffer = nsnull; + mUnicodeEncoder = nsnull; } @@ -145,6 +202,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream() { mOutput=0; //we don't own the stream we're given; just forget it. delete [] mBuffer; + NS_IF_RELEASE(mUnicodeEncoder); } @@ -409,42 +467,40 @@ void nsHTMLToTXTSinkStream::EnsureBufferSize(PRInt32 aNewSize) } + void nsHTMLToTXTSinkStream::UnicodeToTXTString(const nsString& aSrc) { + + #define CH_NBSP 160 -#define CH_QUOT 34 -#define CH_AMP 38 -#define CH_LT 60 -#define CH_GT 62 PRInt32 length = aSrc.Length(); - PRUnichar ch; - const char* entity = nsnull; - PRUint32 offset = 0; - PRUint32 addedLength = 0; + nsresult result; + PRInt32 bufferLength; + + if (mUnicodeEncoder == nsnull) + InitEncoder(""); if (length > 0) { EnsureBufferSize(length); - for (PRInt32 i = 0; i < length; i++) - { - ch = aSrc.CharAt(i); - switch (ch) - { - case CH_QUOT: ch = '"'; break; - case CH_AMP: ch = '&'; break; - case CH_GT: ch = '>'; break; - case CH_LT: ch = '<'; break; - case CH_NBSP: ch = ' '; break; - } + bufferLength = mBufferSize; + + mUnicodeEncoder->Reset(); + result = mUnicodeEncoder->Convert(aSrc, &length, mBuffer, &bufferLength); + mBuffer[bufferLength] = 0; + PRInt32 temp = bufferLength; + if (NS_SUCCEEDED(result)) + result = mUnicodeEncoder->Finish(mBuffer,&temp); - if (ch < 128) - { - mBuffer[offset++] = (unsigned char)ch; - mBuffer[offset] = 0; - } + + for (PRInt32 i = 0; i < bufferLength; i++) + { + if (mBuffer[i] == char(CH_NBSP)) + mBuffer[i] = ' '; } } + } @@ -483,6 +539,18 @@ nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream) mStrBuffer.Append(mBuffer); mColPos += text.Length(); } + else if (type == eHTMLTag_entity) + { + const nsString& text = aNode.GetText(); + UnicodeToTXTString(text); + PRInt32 entity = NS_EntityToUnicode(mBuffer); + if (entity < 256) + { + char ch = (char)entity; + aStream << ch; + mColPos++; + } + } else if (type == eHTMLTag_whitespace) { if (PR_TRUE) @@ -551,6 +619,18 @@ NS_IMETHODIMP nsHTMLToTXTSinkStream::OpenContainer(const nsIParserNode& aNode){ eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); const nsString& name = aNode.GetText(); + if (name.Equals("XIF_DOC_INFO")) + { + PRInt32 count=aNode.GetAttributeCount(); + for(PRInt32 i=0;i"); - if (offset != -1) - aBuffer.Cut(0,offset); aContentType= kXIFTextContentType; result=ePrimaryDetect; } } + + nsString charset ="ISO-8859-1"; + PRInt32 offset; + offset = aBuffer.Find(kXIFDocInfo); + if(kNotFound!=offset) + { + offset = aBuffer.Find(kXIFCharset); + if (kNotFound!=offset) + { + PRInt32 start = aBuffer.Find('"',offset); + PRInt32 end = aBuffer.Find('"',start+1); + + if ((start != kNotFound) && (end != kNotFound)) + { + charset = ""; + for (PRInt32 i = start+1; i < end; i++) + { + PRUnichar ch = aBuffer[i]; + charset.Append(ch); + } + } + } + } + mCharset = charset; + return result; } @@ -638,6 +667,11 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) { result = OpenContainer(node); break; + case eXIFTag_entity: + StartTopOfStack(); + ProcessEntityTag(node); + break; + case eXIFTag_content: StartTopOfStack(); mInContent = PR_TRUE; @@ -647,6 +681,10 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) { ProcessEncodeTag(node); break; + case eXIFTag_document_info: + ProcessDocumentInfoTag(node); + break; + case eXIFTag_attr: AddAttribute(node); @@ -1355,8 +1393,8 @@ void nsXIFDTD::BeginStartTag(const nsIParserNode& aNode) if (type == eXIFTag_container) PushHTMLTag(tag,tagName); - CToken* token = new CStartToken(tagName); - nsCParserNode* node = new nsCParserNode(token); +// CToken* token = new CStartToken(tagName); +// nsCParserNode* node = new nsCParserNode(token); PushNodeAndToken(tagName); break; } @@ -1629,6 +1667,38 @@ void nsXIFDTD::ProcessEncodeTag(const nsIParserNode& aNode) } +void nsXIFDTD::ProcessEntityTag(const nsIParserNode& aNode) +{ + nsString value; + + if (GetAttribute(aNode,nsString("value"),value)) + { + CEntityToken* entity = new CEntityToken(value); + nsCParserNode node((CToken*)entity); + mSink->AddLeaf(node); + } +} + + +void nsXIFDTD::ProcessDocumentInfoTag(const nsIParserNode& aNode) +{ + nsString value; + nsString key("charset"); + + if (GetAttribute(aNode,key,value)) + { + PushNodeAndToken(nsString("XIF_DOC_INFO")); + CAttributeToken* attribute = new CAttributeToken(key,value); + nsIParserNode* top = PeekNode(); + if (top != nsnull) + ((nsCParserNode*)top)->AddAttribute(attribute); + + } +} + + + + /*** CSS Methods ****/ void nsXIFDTD::BeginCSSStyleSheet(const nsIParserNode& aNode) diff --git a/mozilla/htmlparser/src/nsXIFDTD.h b/mozilla/htmlparser/src/nsXIFDTD.h index d10224cbf3f..ce76bdf1a03 100644 --- a/mozilla/htmlparser/src/nsXIFDTD.h +++ b/mozilla/htmlparser/src/nsXIFDTD.h @@ -71,7 +71,9 @@ enum eXIFTags eXIFTag_css_stylesheet, eXIFTag_doctype, - eXIFTag_encode, + eXIFTag_document_info, + eXIFTag_encode, + eXIFTag_entity, eXIFTag_import, eXIFTag_leaf, eXIFTag_link, @@ -490,6 +492,8 @@ private: private: void ProcessEncodeTag(const nsIParserNode& aNode); + void ProcessEntityTag(const nsIParserNode& aNode); + void ProcessDocumentInfoTag(const nsIParserNode& aNode); void BeginCSSStyleSheet(const nsIParserNode& aNode); void EndCSSStyleSheet(const nsIParserNode& aNode); @@ -556,6 +560,7 @@ protected: PRBool mLowerCaseTags; PRBool mLowerCaseAttributes; nsITokenizer* mTokenizer; + nsString mCharset; }; diff --git a/mozilla/parser/htmlparser/src/nsHTMLContentSinkStream.cpp b/mozilla/parser/htmlparser/src/nsHTMLContentSinkStream.cpp index c7743085442..ac20b9d799b 100644 --- a/mozilla/parser/htmlparser/src/nsHTMLContentSinkStream.cpp +++ b/mozilla/parser/htmlparser/src/nsHTMLContentSinkStream.cpp @@ -36,6 +36,14 @@ #include "nsIParser.h" #include "nsHTMLEntities.h" + + +#include "nsIUnicodeEncoder.h" +#include "nsICharsetAlias.h" +#include "nsIServiceManager.h" +#include "nsICharsetConverterManager.h" + + static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID); static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID); static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID); @@ -343,6 +351,58 @@ NS_New_HTML_ContentSinkStream(nsIHTMLContentSink** aInstancePtrResult, return it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult); } + + +/** + * Inits the encoder instance variable for the sink based on the charset + * + * @update gpk 4/21/99 + * @param aCharset + * @return NS_xxx error result + */ +nsresult nsHTMLContentSinkStream::InitEncoder(const nsString& aCharset) +{ + + nsresult res = NS_OK; + + nsICharsetAlias* calias = nsnull; + res = nsServiceManager::GetService(kCharsetAliasCID, + kICharsetAliasIID, + (nsISupports**)&calias); + + NS_ASSERTION( nsnull != calias, "cannot find charet alias"); + nsAutoString charsetName = aCharset; + if( NS_SUCCEEDED(res) && (nsnull != calias)) + { + res = calias->GetPreferred(aCharset, charsetName); + nsServiceManager::ReleaseService(kCharsetAliasCID, calias); + + if(NS_FAILED(res)) + { + // failed - unknown alias , fallback to ISO-8859-1 + charsetName = "ISO-8859-1"; + } + + nsICharsetConverterManager * ccm = nsnull; + res = nsServiceManager::GetService(kCharsetConverterManagerCID, + kICharsetConverterManagerIID, + (nsISupports**)&ccm); + if(NS_SUCCEEDED(res) && (nsnull != ccm)) + { + nsIUnicodeEncoder * encoder = nsnull; + res = ccm->GetUnicodeEncoder(&charsetName, &encoder); + if(NS_SUCCEEDED(res) && (nsnull != encoder)) + { + NS_IF_RELEASE(mUnicodeEncoder); + mUnicodeEncoder = encoder; + } + nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); + } + } + return res; +} + + /** * Construct a content sink stream. * @update gess7/7/98 @@ -361,6 +421,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead mDoHeader = aDoHeader; mBuffer = nsnull; mBufferSize = 0; + mUnicodeEncoder = nsnull; } /** @@ -381,6 +442,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoForm mDoHeader = aDoHeader; mBuffer = nsnull; mBufferSize = 0; + mUnicodeEncoder = nsnull; } @@ -443,9 +505,16 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) const char* entity = nsnull; PRUint32 offset = 0; PRUint32 addedLength = 0; + nsAutoString data; + + + if (mUnicodeEncoder == nsnull) + InitEncoder(""); if (length > 0) { + // Step 1. Convert anything that maps to character entity to + // the entity value EnsureBufferSize(length); for (PRInt32 i = 0; i < length; i++) { @@ -454,29 +523,31 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) entity = UnicodeToEntity(ch); if (entity) { - PRUint32 size = strlen(entity); - addedLength += size; - EnsureBufferSize(length+addedLength+1); - mBuffer[offset++] = '&'; - mBuffer[offset] = 0; - strcat(mBuffer,entity); - - PRUint32 temp = offset + size; - while (offset < temp) - { - mBuffer[offset] = tolower(mBuffer[offset]); - offset++; - } - mBuffer[offset++] = ';'; - mBuffer[offset] = 0; + nsAutoString temp(entity); + + temp.ToLowerCase(); + data.Append('&'); + data.Append(temp); + data.Append(';'); } - else if (ch < 128) + else { - mBuffer[offset++] = (unsigned char)ch; - mBuffer[offset] = 0; + data.Append(ch); } } - } + + // Step 2. Run the result through the converter + length = data.Length(); + EnsureBufferSize(length); + PRInt32 bufferLength = mBufferSize; + + mUnicodeEncoder->Reset(); + nsresult result = mUnicodeEncoder->Convert(data, &length, mBuffer, &bufferLength); + mBuffer[bufferLength] = 0; + PRInt32 temp = bufferLength; + if (NS_SUCCEEDED(result)) + result = mUnicodeEncoder->Finish(mBuffer,&temp); + } } @@ -487,6 +558,7 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) * @return */ nsHTMLContentSinkStream::~nsHTMLContentSinkStream() { + NS_IF_RELEASE(mUnicodeEncoder); mOutput=0; //we don't own the stream we're given; just forget it. } @@ -526,8 +598,8 @@ void nsHTMLContentSinkStream::WriteAttributes(const nsIParserNode& aNode,ostream key.ToUpperCase(); - - key.ToCString(mBuffer,sizeof(gBuffer)-1); + EnsureBufferSize(key.Length()); + key.ToCString(mBuffer,mBufferSize); aStream << " " << mBuffer << char(kEqual); mColPos += 1 + strlen(mBuffer) + 1; @@ -993,7 +1065,14 @@ nsHTMLContentSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream){ AddStartTag(aNode,aStream); mHTMLTagStack[--mHTMLStackPos] = eHTMLTag_unknown; } - if (type == eHTMLTag_text) + else if (type == eHTMLTag_entity) + { + const nsString& entity = aNode.GetText(); + UnicodeToHTMLString(entity); + aStream << '&' << mBuffer << ';'; + mColPos += entity.Length() + 2; + } + else if (type == eHTMLTag_text) { const nsString& text = aNode.GetText(); if ((mDoFormat == PR_FALSE) || preformatted == PR_TRUE) @@ -1136,9 +1215,25 @@ nsHTMLContentSinkStream::AddComment(const nsIParserNode& aNode){ */ NS_IMETHODIMP nsHTMLContentSinkStream::OpenContainer(const nsIParserNode& aNode){ - if(mOutput) { - AddStartTag(aNode,*mOutput); -// eHTMLTags tag = (eHTMLTags)aNode.GetNodeType(); + if(mOutput) + { + const nsString& name = aNode.GetText(); + if (name.Equals("XIF_DOC_INFO")) + { + PRInt32 count=aNode.GetAttributeCount(); + for(PRInt32 i=0;iGetPreferred(aCharset, charsetName); + nsServiceManager::ReleaseService(kCharsetAliasCID, calias); + + if(NS_FAILED(res)) + { + // failed - unknown alias , fallback to ISO-8859-1 + charsetName = "ISO-8859-1"; + } + + nsICharsetConverterManager * ccm = nsnull; + res = nsServiceManager::GetService(kCharsetConverterManagerCID, + kICharsetConverterManagerIID, + (nsISupports**)&ccm); + if(NS_SUCCEEDED(res) && (nsnull != ccm)) + { + nsIUnicodeEncoder * encoder = nsnull; + res = ccm->GetUnicodeEncoder(&charsetName, &encoder); + if(NS_SUCCEEDED(res) && (nsnull != encoder)) + { + NS_IF_RELEASE(mUnicodeEncoder); + mUnicodeEncoder = encoder; + } + nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); + } + } + return res; +} + @@ -117,6 +172,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() { mDoOutput = PR_FALSE; mBufferSize = 0; mBuffer = nsnull; + mUnicodeEncoder = nsnull; } /** @@ -133,6 +189,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { mDoOutput = PR_FALSE; mBufferSize = 0; mBuffer = nsnull; + mUnicodeEncoder = nsnull; } @@ -145,6 +202,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream() { mOutput=0; //we don't own the stream we're given; just forget it. delete [] mBuffer; + NS_IF_RELEASE(mUnicodeEncoder); } @@ -409,42 +467,40 @@ void nsHTMLToTXTSinkStream::EnsureBufferSize(PRInt32 aNewSize) } + void nsHTMLToTXTSinkStream::UnicodeToTXTString(const nsString& aSrc) { + + #define CH_NBSP 160 -#define CH_QUOT 34 -#define CH_AMP 38 -#define CH_LT 60 -#define CH_GT 62 PRInt32 length = aSrc.Length(); - PRUnichar ch; - const char* entity = nsnull; - PRUint32 offset = 0; - PRUint32 addedLength = 0; + nsresult result; + PRInt32 bufferLength; + + if (mUnicodeEncoder == nsnull) + InitEncoder(""); if (length > 0) { EnsureBufferSize(length); - for (PRInt32 i = 0; i < length; i++) - { - ch = aSrc.CharAt(i); - switch (ch) - { - case CH_QUOT: ch = '"'; break; - case CH_AMP: ch = '&'; break; - case CH_GT: ch = '>'; break; - case CH_LT: ch = '<'; break; - case CH_NBSP: ch = ' '; break; - } + bufferLength = mBufferSize; + + mUnicodeEncoder->Reset(); + result = mUnicodeEncoder->Convert(aSrc, &length, mBuffer, &bufferLength); + mBuffer[bufferLength] = 0; + PRInt32 temp = bufferLength; + if (NS_SUCCEEDED(result)) + result = mUnicodeEncoder->Finish(mBuffer,&temp); - if (ch < 128) - { - mBuffer[offset++] = (unsigned char)ch; - mBuffer[offset] = 0; - } + + for (PRInt32 i = 0; i < bufferLength; i++) + { + if (mBuffer[i] == char(CH_NBSP)) + mBuffer[i] = ' '; } } + } @@ -483,6 +539,18 @@ nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream) mStrBuffer.Append(mBuffer); mColPos += text.Length(); } + else if (type == eHTMLTag_entity) + { + const nsString& text = aNode.GetText(); + UnicodeToTXTString(text); + PRInt32 entity = NS_EntityToUnicode(mBuffer); + if (entity < 256) + { + char ch = (char)entity; + aStream << ch; + mColPos++; + } + } else if (type == eHTMLTag_whitespace) { if (PR_TRUE) @@ -551,6 +619,18 @@ NS_IMETHODIMP nsHTMLToTXTSinkStream::OpenContainer(const nsIParserNode& aNode){ eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); const nsString& name = aNode.GetText(); + if (name.Equals("XIF_DOC_INFO")) + { + PRInt32 count=aNode.GetAttributeCount(); + for(PRInt32 i=0;i"); - if (offset != -1) - aBuffer.Cut(0,offset); aContentType= kXIFTextContentType; result=ePrimaryDetect; } } + + nsString charset ="ISO-8859-1"; + PRInt32 offset; + offset = aBuffer.Find(kXIFDocInfo); + if(kNotFound!=offset) + { + offset = aBuffer.Find(kXIFCharset); + if (kNotFound!=offset) + { + PRInt32 start = aBuffer.Find('"',offset); + PRInt32 end = aBuffer.Find('"',start+1); + + if ((start != kNotFound) && (end != kNotFound)) + { + charset = ""; + for (PRInt32 i = start+1; i < end; i++) + { + PRUnichar ch = aBuffer[i]; + charset.Append(ch); + } + } + } + } + mCharset = charset; + return result; } @@ -638,6 +667,11 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) { result = OpenContainer(node); break; + case eXIFTag_entity: + StartTopOfStack(); + ProcessEntityTag(node); + break; + case eXIFTag_content: StartTopOfStack(); mInContent = PR_TRUE; @@ -647,6 +681,10 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) { ProcessEncodeTag(node); break; + case eXIFTag_document_info: + ProcessDocumentInfoTag(node); + break; + case eXIFTag_attr: AddAttribute(node); @@ -1355,8 +1393,8 @@ void nsXIFDTD::BeginStartTag(const nsIParserNode& aNode) if (type == eXIFTag_container) PushHTMLTag(tag,tagName); - CToken* token = new CStartToken(tagName); - nsCParserNode* node = new nsCParserNode(token); +// CToken* token = new CStartToken(tagName); +// nsCParserNode* node = new nsCParserNode(token); PushNodeAndToken(tagName); break; } @@ -1629,6 +1667,38 @@ void nsXIFDTD::ProcessEncodeTag(const nsIParserNode& aNode) } +void nsXIFDTD::ProcessEntityTag(const nsIParserNode& aNode) +{ + nsString value; + + if (GetAttribute(aNode,nsString("value"),value)) + { + CEntityToken* entity = new CEntityToken(value); + nsCParserNode node((CToken*)entity); + mSink->AddLeaf(node); + } +} + + +void nsXIFDTD::ProcessDocumentInfoTag(const nsIParserNode& aNode) +{ + nsString value; + nsString key("charset"); + + if (GetAttribute(aNode,key,value)) + { + PushNodeAndToken(nsString("XIF_DOC_INFO")); + CAttributeToken* attribute = new CAttributeToken(key,value); + nsIParserNode* top = PeekNode(); + if (top != nsnull) + ((nsCParserNode*)top)->AddAttribute(attribute); + + } +} + + + + /*** CSS Methods ****/ void nsXIFDTD::BeginCSSStyleSheet(const nsIParserNode& aNode) diff --git a/mozilla/parser/htmlparser/src/nsXIFDTD.h b/mozilla/parser/htmlparser/src/nsXIFDTD.h index d10224cbf3f..ce76bdf1a03 100644 --- a/mozilla/parser/htmlparser/src/nsXIFDTD.h +++ b/mozilla/parser/htmlparser/src/nsXIFDTD.h @@ -71,7 +71,9 @@ enum eXIFTags eXIFTag_css_stylesheet, eXIFTag_doctype, - eXIFTag_encode, + eXIFTag_document_info, + eXIFTag_encode, + eXIFTag_entity, eXIFTag_import, eXIFTag_leaf, eXIFTag_link, @@ -490,6 +492,8 @@ private: private: void ProcessEncodeTag(const nsIParserNode& aNode); + void ProcessEntityTag(const nsIParserNode& aNode); + void ProcessDocumentInfoTag(const nsIParserNode& aNode); void BeginCSSStyleSheet(const nsIParserNode& aNode); void EndCSSStyleSheet(const nsIParserNode& aNode); @@ -556,6 +560,7 @@ protected: PRBool mLowerCaseTags; PRBool mLowerCaseAttributes; nsITokenizer* mTokenizer; + nsString mCharset; };