From 1a6701bd859b2ecf6abb87dba6f6f6a6777856d3 Mon Sep 17 00:00:00 2001 From: "kostello%netscape.com" Date: Mon, 26 Apr 1999 06:16:49 +0000 Subject: [PATCH] Added support to use the specified document charset when encoding a document to HTML to Text. The charset information is first encoded in XIF and then that information is used when interpreting the unicode for output. Added support to output character entity information which should address bug 4709 git-svn-id: svn://10.0.0.236/trunk@29162 18797224-902f-48f8-a5cc-f745e15eee43 --- .../src/nsHTMLContentSinkStream.cpp | 145 +++++++++++++++--- .../htmlparser/src/nsHTMLContentSinkStream.h | 5 + .../htmlparser/src/nsHTMLToTXTSinkStream.cpp | 128 +++++++++++++--- .../htmlparser/src/nsHTMLToTXTSinkStream.h | 7 +- mozilla/htmlparser/src/nsXIFDTD.cpp | 82 +++++++++- mozilla/htmlparser/src/nsXIFDTD.h | 7 +- .../src/nsHTMLContentSinkStream.cpp | 145 +++++++++++++++--- .../htmlparser/src/nsHTMLContentSinkStream.h | 5 + .../htmlparser/src/nsHTMLToTXTSinkStream.cpp | 128 +++++++++++++--- .../htmlparser/src/nsHTMLToTXTSinkStream.h | 7 +- mozilla/parser/htmlparser/src/nsXIFDTD.cpp | 82 +++++++++- mozilla/parser/htmlparser/src/nsXIFDTD.h | 7 +- 12 files changed, 634 insertions(+), 114 deletions(-) diff --git a/mozilla/htmlparser/src/nsHTMLContentSinkStream.cpp b/mozilla/htmlparser/src/nsHTMLContentSinkStream.cpp index c7743085442..ac20b9d799b 100644 --- a/mozilla/htmlparser/src/nsHTMLContentSinkStream.cpp +++ b/mozilla/htmlparser/src/nsHTMLContentSinkStream.cpp @@ -36,6 +36,14 @@ #include "nsIParser.h" #include "nsHTMLEntities.h" + + +#include "nsIUnicodeEncoder.h" +#include "nsICharsetAlias.h" +#include "nsIServiceManager.h" +#include "nsICharsetConverterManager.h" + + static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID); static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID); static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID); @@ -343,6 +351,58 @@ NS_New_HTML_ContentSinkStream(nsIHTMLContentSink** 
aInstancePtrResult, return it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult); } + + +/** + * Inits the encoder instance variable for the sink based on the charset + * + * @update gpk 4/21/99 + * @param aCharset + * @return NS_xxx error result + */ +nsresult nsHTMLContentSinkStream::InitEncoder(const nsString& aCharset) +{ + + nsresult res = NS_OK; + + nsICharsetAlias* calias = nsnull; + res = nsServiceManager::GetService(kCharsetAliasCID, + kICharsetAliasIID, + (nsISupports**)&calias); + + NS_ASSERTION( nsnull != calias, "cannot find charet alias"); + nsAutoString charsetName = aCharset; + if( NS_SUCCEEDED(res) && (nsnull != calias)) + { + res = calias->GetPreferred(aCharset, charsetName); + nsServiceManager::ReleaseService(kCharsetAliasCID, calias); + + if(NS_FAILED(res)) + { + // failed - unknown alias , fallback to ISO-8859-1 + charsetName = "ISO-8859-1"; + } + + nsICharsetConverterManager * ccm = nsnull; + res = nsServiceManager::GetService(kCharsetConverterManagerCID, + kICharsetConverterManagerIID, + (nsISupports**)&ccm); + if(NS_SUCCEEDED(res) && (nsnull != ccm)) + { + nsIUnicodeEncoder * encoder = nsnull; + res = ccm->GetUnicodeEncoder(&charsetName, &encoder); + if(NS_SUCCEEDED(res) && (nsnull != encoder)) + { + NS_IF_RELEASE(mUnicodeEncoder); + mUnicodeEncoder = encoder; + } + nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); + } + } + return res; +} + + /** * Construct a content sink stream. 
* @update gess7/7/98 @@ -361,6 +421,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead mDoHeader = aDoHeader; mBuffer = nsnull; mBufferSize = 0; + mUnicodeEncoder = nsnull; } /** @@ -381,6 +442,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoForm mDoHeader = aDoHeader; mBuffer = nsnull; mBufferSize = 0; + mUnicodeEncoder = nsnull; } @@ -443,9 +505,16 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) const char* entity = nsnull; PRUint32 offset = 0; PRUint32 addedLength = 0; + nsAutoString data; + + + if (mUnicodeEncoder == nsnull) + InitEncoder(""); if (length > 0) { + // Step 1. Convert anything that maps to character entity to + // the entity value EnsureBufferSize(length); for (PRInt32 i = 0; i < length; i++) { @@ -454,29 +523,31 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) entity = UnicodeToEntity(ch); if (entity) { - PRUint32 size = strlen(entity); - addedLength += size; - EnsureBufferSize(length+addedLength+1); - mBuffer[offset++] = '&'; - mBuffer[offset] = 0; - strcat(mBuffer,entity); - - PRUint32 temp = offset + size; - while (offset < temp) - { - mBuffer[offset] = tolower(mBuffer[offset]); - offset++; - } - mBuffer[offset++] = ';'; - mBuffer[offset] = 0; + nsAutoString temp(entity); + + temp.ToLowerCase(); + data.Append('&'); + data.Append(temp); + data.Append(';'); } - else if (ch < 128) + else { - mBuffer[offset++] = (unsigned char)ch; - mBuffer[offset] = 0; + data.Append(ch); } } - } + + // Step 2. 
Run the result through the converter + length = data.Length(); + EnsureBufferSize(length); + PRInt32 bufferLength = mBufferSize; + + mUnicodeEncoder->Reset(); + nsresult result = mUnicodeEncoder->Convert(data, &length, mBuffer, &bufferLength); + mBuffer[bufferLength] = 0; + PRInt32 temp = bufferLength; + if (NS_SUCCEEDED(result)) + result = mUnicodeEncoder->Finish(mBuffer,&temp); + } } @@ -487,6 +558,7 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) * @return */ nsHTMLContentSinkStream::~nsHTMLContentSinkStream() { + NS_IF_RELEASE(mUnicodeEncoder); mOutput=0; //we don't own the stream we're given; just forget it. } @@ -526,8 +598,8 @@ void nsHTMLContentSinkStream::WriteAttributes(const nsIParserNode& aNode,ostream key.ToUpperCase(); - - key.ToCString(mBuffer,sizeof(gBuffer)-1); + EnsureBufferSize(key.Length()); + key.ToCString(mBuffer,mBufferSize); aStream << " " << mBuffer << char(kEqual); mColPos += 1 + strlen(mBuffer) + 1; @@ -993,7 +1065,14 @@ nsHTMLContentSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream){ AddStartTag(aNode,aStream); mHTMLTagStack[--mHTMLStackPos] = eHTMLTag_unknown; } - if (type == eHTMLTag_text) + else if (type == eHTMLTag_entity) + { + const nsString& entity = aNode.GetText(); + UnicodeToHTMLString(entity); + aStream << '&' << mBuffer << ';'; + mColPos += entity.Length() + 2; + } + else if (type == eHTMLTag_text) { const nsString& text = aNode.GetText(); if ((mDoFormat == PR_FALSE) || preformatted == PR_TRUE) @@ -1136,9 +1215,25 @@ nsHTMLContentSinkStream::AddComment(const nsIParserNode& aNode){ */ NS_IMETHODIMP nsHTMLContentSinkStream::OpenContainer(const nsIParserNode& aNode){ - if(mOutput) { - AddStartTag(aNode,*mOutput); -// eHTMLTags tag = (eHTMLTags)aNode.GetNodeType(); + if(mOutput) + { + const nsString& name = aNode.GetText(); + if (name.Equals("XIF_DOC_INFO")) + { + PRInt32 count=aNode.GetAttributeCount(); + for(PRInt32 i=0;iGetPreferred(aCharset, charsetName); + 
nsServiceManager::ReleaseService(kCharsetAliasCID, calias); + + if(NS_FAILED(res)) + { + // failed - unknown alias , fallback to ISO-8859-1 + charsetName = "ISO-8859-1"; + } + + nsICharsetConverterManager * ccm = nsnull; + res = nsServiceManager::GetService(kCharsetConverterManagerCID, + kICharsetConverterManagerIID, + (nsISupports**)&ccm); + if(NS_SUCCEEDED(res) && (nsnull != ccm)) + { + nsIUnicodeEncoder * encoder = nsnull; + res = ccm->GetUnicodeEncoder(&charsetName, &encoder); + if(NS_SUCCEEDED(res) && (nsnull != encoder)) + { + NS_IF_RELEASE(mUnicodeEncoder); + mUnicodeEncoder = encoder; + } + nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); + } + } + return res; +} + @@ -117,6 +172,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() { mDoOutput = PR_FALSE; mBufferSize = 0; mBuffer = nsnull; + mUnicodeEncoder = nsnull; } /** @@ -133,6 +189,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { mDoOutput = PR_FALSE; mBufferSize = 0; mBuffer = nsnull; + mUnicodeEncoder = nsnull; } @@ -145,6 +202,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream() { mOutput=0; //we don't own the stream we're given; just forget it. 
delete [] mBuffer; + NS_IF_RELEASE(mUnicodeEncoder); } @@ -409,42 +467,40 @@ void nsHTMLToTXTSinkStream::EnsureBufferSize(PRInt32 aNewSize) } + void nsHTMLToTXTSinkStream::UnicodeToTXTString(const nsString& aSrc) { + + #define CH_NBSP 160 -#define CH_QUOT 34 -#define CH_AMP 38 -#define CH_LT 60 -#define CH_GT 62 PRInt32 length = aSrc.Length(); - PRUnichar ch; - const char* entity = nsnull; - PRUint32 offset = 0; - PRUint32 addedLength = 0; + nsresult result; + PRInt32 bufferLength; + + if (mUnicodeEncoder == nsnull) + InitEncoder(""); if (length > 0) { EnsureBufferSize(length); - for (PRInt32 i = 0; i < length; i++) - { - ch = aSrc.CharAt(i); - switch (ch) - { - case CH_QUOT: ch = '"'; break; - case CH_AMP: ch = '&'; break; - case CH_GT: ch = '>'; break; - case CH_LT: ch = '<'; break; - case CH_NBSP: ch = ' '; break; - } + bufferLength = mBufferSize; + + mUnicodeEncoder->Reset(); + result = mUnicodeEncoder->Convert(aSrc, &length, mBuffer, &bufferLength); + mBuffer[bufferLength] = 0; + PRInt32 temp = bufferLength; + if (NS_SUCCEEDED(result)) + result = mUnicodeEncoder->Finish(mBuffer,&temp); - if (ch < 128) - { - mBuffer[offset++] = (unsigned char)ch; - mBuffer[offset] = 0; - } + + for (PRInt32 i = 0; i < bufferLength; i++) + { + if (mBuffer[i] == char(CH_NBSP)) + mBuffer[i] = ' '; } } + } @@ -483,6 +539,18 @@ nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream) mStrBuffer.Append(mBuffer); mColPos += text.Length(); } + else if (type == eHTMLTag_entity) + { + const nsString& text = aNode.GetText(); + UnicodeToTXTString(text); + PRInt32 entity = NS_EntityToUnicode(mBuffer); + if (entity < 256) + { + char ch = (char)entity; + aStream << ch; + mColPos++; + } + } else if (type == eHTMLTag_whitespace) { if (PR_TRUE) @@ -551,6 +619,18 @@ NS_IMETHODIMP nsHTMLToTXTSinkStream::OpenContainer(const nsIParserNode& aNode){ eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); const nsString& name = aNode.GetText(); + if (name.Equals("XIF_DOC_INFO")) + { + 
PRInt32 count=aNode.GetAttributeCount(); + for(PRInt32 i=0;i"); - if (offset != -1) - aBuffer.Cut(0,offset); aContentType= kXIFTextContentType; result=ePrimaryDetect; } } + + nsString charset ="ISO-8859-1"; + PRInt32 offset; + offset = aBuffer.Find(kXIFDocInfo); + if(kNotFound!=offset) + { + offset = aBuffer.Find(kXIFCharset); + if (kNotFound!=offset) + { + PRInt32 start = aBuffer.Find('"',offset); + PRInt32 end = aBuffer.Find('"',start+1); + + if ((start != kNotFound) && (end != kNotFound)) + { + charset = ""; + for (PRInt32 i = start+1; i < end; i++) + { + PRUnichar ch = aBuffer[i]; + charset.Append(ch); + } + } + } + } + mCharset = charset; + return result; } @@ -638,6 +667,11 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) { result = OpenContainer(node); break; + case eXIFTag_entity: + StartTopOfStack(); + ProcessEntityTag(node); + break; + case eXIFTag_content: StartTopOfStack(); mInContent = PR_TRUE; @@ -647,6 +681,10 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) { ProcessEncodeTag(node); break; + case eXIFTag_document_info: + ProcessDocumentInfoTag(node); + break; + case eXIFTag_attr: AddAttribute(node); @@ -1355,8 +1393,8 @@ void nsXIFDTD::BeginStartTag(const nsIParserNode& aNode) if (type == eXIFTag_container) PushHTMLTag(tag,tagName); - CToken* token = new CStartToken(tagName); - nsCParserNode* node = new nsCParserNode(token); +// CToken* token = new CStartToken(tagName); +// nsCParserNode* node = new nsCParserNode(token); PushNodeAndToken(tagName); break; } @@ -1629,6 +1667,38 @@ void nsXIFDTD::ProcessEncodeTag(const nsIParserNode& aNode) } +void nsXIFDTD::ProcessEntityTag(const nsIParserNode& aNode) +{ + nsString value; + + if (GetAttribute(aNode,nsString("value"),value)) + { + CEntityToken* entity = new CEntityToken(value); + nsCParserNode node((CToken*)entity); + mSink->AddLeaf(node); + } +} + + +void nsXIFDTD::ProcessDocumentInfoTag(const nsIParserNode& aNode) +{ + nsString value; + nsString key("charset"); + + if 
(GetAttribute(aNode,key,value)) + { + PushNodeAndToken(nsString("XIF_DOC_INFO")); + CAttributeToken* attribute = new CAttributeToken(key,value); + nsIParserNode* top = PeekNode(); + if (top != nsnull) + ((nsCParserNode*)top)->AddAttribute(attribute); + + } +} + + + + /*** CSS Methods ****/ void nsXIFDTD::BeginCSSStyleSheet(const nsIParserNode& aNode) diff --git a/mozilla/htmlparser/src/nsXIFDTD.h b/mozilla/htmlparser/src/nsXIFDTD.h index d10224cbf3f..ce76bdf1a03 100644 --- a/mozilla/htmlparser/src/nsXIFDTD.h +++ b/mozilla/htmlparser/src/nsXIFDTD.h @@ -71,7 +71,9 @@ enum eXIFTags eXIFTag_css_stylesheet, eXIFTag_doctype, - eXIFTag_encode, + eXIFTag_document_info, + eXIFTag_encode, + eXIFTag_entity, eXIFTag_import, eXIFTag_leaf, eXIFTag_link, @@ -490,6 +492,8 @@ private: private: void ProcessEncodeTag(const nsIParserNode& aNode); + void ProcessEntityTag(const nsIParserNode& aNode); + void ProcessDocumentInfoTag(const nsIParserNode& aNode); void BeginCSSStyleSheet(const nsIParserNode& aNode); void EndCSSStyleSheet(const nsIParserNode& aNode); @@ -556,6 +560,7 @@ protected: PRBool mLowerCaseTags; PRBool mLowerCaseAttributes; nsITokenizer* mTokenizer; + nsString mCharset; }; diff --git a/mozilla/parser/htmlparser/src/nsHTMLContentSinkStream.cpp b/mozilla/parser/htmlparser/src/nsHTMLContentSinkStream.cpp index c7743085442..ac20b9d799b 100644 --- a/mozilla/parser/htmlparser/src/nsHTMLContentSinkStream.cpp +++ b/mozilla/parser/htmlparser/src/nsHTMLContentSinkStream.cpp @@ -36,6 +36,14 @@ #include "nsIParser.h" #include "nsHTMLEntities.h" + + +#include "nsIUnicodeEncoder.h" +#include "nsICharsetAlias.h" +#include "nsIServiceManager.h" +#include "nsICharsetConverterManager.h" + + static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID); static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID); static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID); @@ -343,6 +351,58 @@ NS_New_HTML_ContentSinkStream(nsIHTMLContentSink** aInstancePtrResult, return 
it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult); } + + +/** + * Inits the encoder instance variable for the sink based on the charset + * + * @update gpk 4/21/99 + * @param aCharset + * @return NS_xxx error result + */ +nsresult nsHTMLContentSinkStream::InitEncoder(const nsString& aCharset) +{ + + nsresult res = NS_OK; + + nsICharsetAlias* calias = nsnull; + res = nsServiceManager::GetService(kCharsetAliasCID, + kICharsetAliasIID, + (nsISupports**)&calias); + + NS_ASSERTION( nsnull != calias, "cannot find charet alias"); + nsAutoString charsetName = aCharset; + if( NS_SUCCEEDED(res) && (nsnull != calias)) + { + res = calias->GetPreferred(aCharset, charsetName); + nsServiceManager::ReleaseService(kCharsetAliasCID, calias); + + if(NS_FAILED(res)) + { + // failed - unknown alias , fallback to ISO-8859-1 + charsetName = "ISO-8859-1"; + } + + nsICharsetConverterManager * ccm = nsnull; + res = nsServiceManager::GetService(kCharsetConverterManagerCID, + kICharsetConverterManagerIID, + (nsISupports**)&ccm); + if(NS_SUCCEEDED(res) && (nsnull != ccm)) + { + nsIUnicodeEncoder * encoder = nsnull; + res = ccm->GetUnicodeEncoder(&charsetName, &encoder); + if(NS_SUCCEEDED(res) && (nsnull != encoder)) + { + NS_IF_RELEASE(mUnicodeEncoder); + mUnicodeEncoder = encoder; + } + nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); + } + } + return res; +} + + /** * Construct a content sink stream. 
* @update gess7/7/98 @@ -361,6 +421,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead mDoHeader = aDoHeader; mBuffer = nsnull; mBufferSize = 0; + mUnicodeEncoder = nsnull; } /** @@ -381,6 +442,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoForm mDoHeader = aDoHeader; mBuffer = nsnull; mBufferSize = 0; + mUnicodeEncoder = nsnull; } @@ -443,9 +505,16 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) const char* entity = nsnull; PRUint32 offset = 0; PRUint32 addedLength = 0; + nsAutoString data; + + + if (mUnicodeEncoder == nsnull) + InitEncoder(""); if (length > 0) { + // Step 1. Convert anything that maps to character entity to + // the entity value EnsureBufferSize(length); for (PRInt32 i = 0; i < length; i++) { @@ -454,29 +523,31 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) entity = UnicodeToEntity(ch); if (entity) { - PRUint32 size = strlen(entity); - addedLength += size; - EnsureBufferSize(length+addedLength+1); - mBuffer[offset++] = '&'; - mBuffer[offset] = 0; - strcat(mBuffer,entity); - - PRUint32 temp = offset + size; - while (offset < temp) - { - mBuffer[offset] = tolower(mBuffer[offset]); - offset++; - } - mBuffer[offset++] = ';'; - mBuffer[offset] = 0; + nsAutoString temp(entity); + + temp.ToLowerCase(); + data.Append('&'); + data.Append(temp); + data.Append(';'); } - else if (ch < 128) + else { - mBuffer[offset++] = (unsigned char)ch; - mBuffer[offset] = 0; + data.Append(ch); } } - } + + // Step 2. 
Run the result through the converter + length = data.Length(); + EnsureBufferSize(length); + PRInt32 bufferLength = mBufferSize; + + mUnicodeEncoder->Reset(); + nsresult result = mUnicodeEncoder->Convert(data, &length, mBuffer, &bufferLength); + mBuffer[bufferLength] = 0; + PRInt32 temp = bufferLength; + if (NS_SUCCEEDED(result)) + result = mUnicodeEncoder->Finish(mBuffer,&temp); + } } @@ -487,6 +558,7 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc) * @return */ nsHTMLContentSinkStream::~nsHTMLContentSinkStream() { + NS_IF_RELEASE(mUnicodeEncoder); mOutput=0; //we don't own the stream we're given; just forget it. } @@ -526,8 +598,8 @@ void nsHTMLContentSinkStream::WriteAttributes(const nsIParserNode& aNode,ostream key.ToUpperCase(); - - key.ToCString(mBuffer,sizeof(gBuffer)-1); + EnsureBufferSize(key.Length()); + key.ToCString(mBuffer,mBufferSize); aStream << " " << mBuffer << char(kEqual); mColPos += 1 + strlen(mBuffer) + 1; @@ -993,7 +1065,14 @@ nsHTMLContentSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream){ AddStartTag(aNode,aStream); mHTMLTagStack[--mHTMLStackPos] = eHTMLTag_unknown; } - if (type == eHTMLTag_text) + else if (type == eHTMLTag_entity) + { + const nsString& entity = aNode.GetText(); + UnicodeToHTMLString(entity); + aStream << '&' << mBuffer << ';'; + mColPos += entity.Length() + 2; + } + else if (type == eHTMLTag_text) { const nsString& text = aNode.GetText(); if ((mDoFormat == PR_FALSE) || preformatted == PR_TRUE) @@ -1136,9 +1215,25 @@ nsHTMLContentSinkStream::AddComment(const nsIParserNode& aNode){ */ NS_IMETHODIMP nsHTMLContentSinkStream::OpenContainer(const nsIParserNode& aNode){ - if(mOutput) { - AddStartTag(aNode,*mOutput); -// eHTMLTags tag = (eHTMLTags)aNode.GetNodeType(); + if(mOutput) + { + const nsString& name = aNode.GetText(); + if (name.Equals("XIF_DOC_INFO")) + { + PRInt32 count=aNode.GetAttributeCount(); + for(PRInt32 i=0;iGetPreferred(aCharset, charsetName); + 
nsServiceManager::ReleaseService(kCharsetAliasCID, calias); + + if(NS_FAILED(res)) + { + // failed - unknown alias , fallback to ISO-8859-1 + charsetName = "ISO-8859-1"; + } + + nsICharsetConverterManager * ccm = nsnull; + res = nsServiceManager::GetService(kCharsetConverterManagerCID, + kICharsetConverterManagerIID, + (nsISupports**)&ccm); + if(NS_SUCCEEDED(res) && (nsnull != ccm)) + { + nsIUnicodeEncoder * encoder = nsnull; + res = ccm->GetUnicodeEncoder(&charsetName, &encoder); + if(NS_SUCCEEDED(res) && (nsnull != encoder)) + { + NS_IF_RELEASE(mUnicodeEncoder); + mUnicodeEncoder = encoder; + } + nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); + } + } + return res; +} + @@ -117,6 +172,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() { mDoOutput = PR_FALSE; mBufferSize = 0; mBuffer = nsnull; + mUnicodeEncoder = nsnull; } /** @@ -133,6 +189,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { mDoOutput = PR_FALSE; mBufferSize = 0; mBuffer = nsnull; + mUnicodeEncoder = nsnull; } @@ -145,6 +202,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) { nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream() { mOutput=0; //we don't own the stream we're given; just forget it. 
delete [] mBuffer; + NS_IF_RELEASE(mUnicodeEncoder); } @@ -409,42 +467,40 @@ void nsHTMLToTXTSinkStream::EnsureBufferSize(PRInt32 aNewSize) } + void nsHTMLToTXTSinkStream::UnicodeToTXTString(const nsString& aSrc) { + + #define CH_NBSP 160 -#define CH_QUOT 34 -#define CH_AMP 38 -#define CH_LT 60 -#define CH_GT 62 PRInt32 length = aSrc.Length(); - PRUnichar ch; - const char* entity = nsnull; - PRUint32 offset = 0; - PRUint32 addedLength = 0; + nsresult result; + PRInt32 bufferLength; + + if (mUnicodeEncoder == nsnull) + InitEncoder(""); if (length > 0) { EnsureBufferSize(length); - for (PRInt32 i = 0; i < length; i++) - { - ch = aSrc.CharAt(i); - switch (ch) - { - case CH_QUOT: ch = '"'; break; - case CH_AMP: ch = '&'; break; - case CH_GT: ch = '>'; break; - case CH_LT: ch = '<'; break; - case CH_NBSP: ch = ' '; break; - } + bufferLength = mBufferSize; + + mUnicodeEncoder->Reset(); + result = mUnicodeEncoder->Convert(aSrc, &length, mBuffer, &bufferLength); + mBuffer[bufferLength] = 0; + PRInt32 temp = bufferLength; + if (NS_SUCCEEDED(result)) + result = mUnicodeEncoder->Finish(mBuffer,&temp); - if (ch < 128) - { - mBuffer[offset++] = (unsigned char)ch; - mBuffer[offset] = 0; - } + + for (PRInt32 i = 0; i < bufferLength; i++) + { + if (mBuffer[i] == char(CH_NBSP)) + mBuffer[i] = ' '; } } + } @@ -483,6 +539,18 @@ nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream) mStrBuffer.Append(mBuffer); mColPos += text.Length(); } + else if (type == eHTMLTag_entity) + { + const nsString& text = aNode.GetText(); + UnicodeToTXTString(text); + PRInt32 entity = NS_EntityToUnicode(mBuffer); + if (entity < 256) + { + char ch = (char)entity; + aStream << ch; + mColPos++; + } + } else if (type == eHTMLTag_whitespace) { if (PR_TRUE) @@ -551,6 +619,18 @@ NS_IMETHODIMP nsHTMLToTXTSinkStream::OpenContainer(const nsIParserNode& aNode){ eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); const nsString& name = aNode.GetText(); + if (name.Equals("XIF_DOC_INFO")) + { + 
PRInt32 count=aNode.GetAttributeCount(); + for(PRInt32 i=0;i"); - if (offset != -1) - aBuffer.Cut(0,offset); aContentType= kXIFTextContentType; result=ePrimaryDetect; } } + + nsString charset ="ISO-8859-1"; + PRInt32 offset; + offset = aBuffer.Find(kXIFDocInfo); + if(kNotFound!=offset) + { + offset = aBuffer.Find(kXIFCharset); + if (kNotFound!=offset) + { + PRInt32 start = aBuffer.Find('"',offset); + PRInt32 end = aBuffer.Find('"',start+1); + + if ((start != kNotFound) && (end != kNotFound)) + { + charset = ""; + for (PRInt32 i = start+1; i < end; i++) + { + PRUnichar ch = aBuffer[i]; + charset.Append(ch); + } + } + } + } + mCharset = charset; + return result; } @@ -638,6 +667,11 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) { result = OpenContainer(node); break; + case eXIFTag_entity: + StartTopOfStack(); + ProcessEntityTag(node); + break; + case eXIFTag_content: StartTopOfStack(); mInContent = PR_TRUE; @@ -647,6 +681,10 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) { ProcessEncodeTag(node); break; + case eXIFTag_document_info: + ProcessDocumentInfoTag(node); + break; + case eXIFTag_attr: AddAttribute(node); @@ -1355,8 +1393,8 @@ void nsXIFDTD::BeginStartTag(const nsIParserNode& aNode) if (type == eXIFTag_container) PushHTMLTag(tag,tagName); - CToken* token = new CStartToken(tagName); - nsCParserNode* node = new nsCParserNode(token); +// CToken* token = new CStartToken(tagName); +// nsCParserNode* node = new nsCParserNode(token); PushNodeAndToken(tagName); break; } @@ -1629,6 +1667,38 @@ void nsXIFDTD::ProcessEncodeTag(const nsIParserNode& aNode) } +void nsXIFDTD::ProcessEntityTag(const nsIParserNode& aNode) +{ + nsString value; + + if (GetAttribute(aNode,nsString("value"),value)) + { + CEntityToken* entity = new CEntityToken(value); + nsCParserNode node((CToken*)entity); + mSink->AddLeaf(node); + } +} + + +void nsXIFDTD::ProcessDocumentInfoTag(const nsIParserNode& aNode) +{ + nsString value; + nsString key("charset"); + + if 
(GetAttribute(aNode,key,value)) + { + PushNodeAndToken(nsString("XIF_DOC_INFO")); + CAttributeToken* attribute = new CAttributeToken(key,value); + nsIParserNode* top = PeekNode(); + if (top != nsnull) + ((nsCParserNode*)top)->AddAttribute(attribute); + + } +} + + + + /*** CSS Methods ****/ void nsXIFDTD::BeginCSSStyleSheet(const nsIParserNode& aNode) diff --git a/mozilla/parser/htmlparser/src/nsXIFDTD.h b/mozilla/parser/htmlparser/src/nsXIFDTD.h index d10224cbf3f..ce76bdf1a03 100644 --- a/mozilla/parser/htmlparser/src/nsXIFDTD.h +++ b/mozilla/parser/htmlparser/src/nsXIFDTD.h @@ -71,7 +71,9 @@ enum eXIFTags eXIFTag_css_stylesheet, eXIFTag_doctype, - eXIFTag_encode, + eXIFTag_document_info, + eXIFTag_encode, + eXIFTag_entity, eXIFTag_import, eXIFTag_leaf, eXIFTag_link, @@ -490,6 +492,8 @@ private: private: void ProcessEncodeTag(const nsIParserNode& aNode); + void ProcessEntityTag(const nsIParserNode& aNode); + void ProcessDocumentInfoTag(const nsIParserNode& aNode); void BeginCSSStyleSheet(const nsIParserNode& aNode); void EndCSSStyleSheet(const nsIParserNode& aNode); @@ -556,6 +560,7 @@ protected: PRBool mLowerCaseTags; PRBool mLowerCaseAttributes; nsITokenizer* mTokenizer; + nsString mCharset; };