Added support to use the specified document charset when encoding a

document to HTML to Text. The charset information is first encoded
in XIF and then that information is used when interpreting the Unicode
for output.

Added support to output character entity information, which should address
bug 4709.


git-svn-id: svn://10.0.0.236/trunk@29162 18797224-902f-48f8-a5cc-f745e15eee43
This commit is contained in:
kostello%netscape.com
1999-04-26 06:16:49 +00:00
parent 17d6c2ea7a
commit 1a6701bd85
12 changed files with 634 additions and 114 deletions

View File

@@ -36,6 +36,14 @@
#include "nsIParser.h"
#include "nsHTMLEntities.h"
#include "nsIUnicodeEncoder.h"
#include "nsICharsetAlias.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID);
static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID);
@@ -343,6 +351,58 @@ NS_New_HTML_ContentSinkStream(nsIHTMLContentSink** aInstancePtrResult,
return it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult);
}
/**
 * Sets up mUnicodeEncoder for the given charset.
 *
 * The charset name is first mapped to its preferred name through the
 * charset alias service; if that lookup fails, "ISO-8859-1" is used
 * instead. An encoder for the resulting name is then requested from the
 * charset converter manager. Only when a non-null encoder is obtained is
 * the currently held encoder released and replaced; otherwise
 * mUnicodeEncoder is left as it was.
 *
 * @update  gpk 4/21/99
 * @param   aCharset  charset name (possibly an alias) to encode with
 * @return  result of the last service/encoder call that was attempted
 */
nsresult nsHTMLContentSinkStream::InitEncoder(const nsString& aCharset)
{
  nsresult rv = NS_OK;

  // Step 1: obtain the alias service.
  nsICharsetAlias* aliasService = nsnull;
  rv = nsServiceManager::GetService(kCharsetAliasCID,
                                    kICharsetAliasIID,
                                    (nsISupports**)&aliasService);
  NS_ASSERTION( nsnull != aliasService, "cannot find charet alias");

  nsAutoString charsetName = aCharset;
  if (NS_FAILED(rv) || (nsnull == aliasService))
  {
    // Without the alias service nothing further is attempted.
    return rv;
  }

  // Step 2: resolve the preferred name; the service is no longer needed
  // after this call, whatever its outcome.
  rv = aliasService->GetPreferred(aCharset, charsetName);
  nsServiceManager::ReleaseService(kCharsetAliasCID, aliasService);
  if (NS_FAILED(rv))
  {
    // Unknown alias - fall back to ISO-8859-1.
    charsetName = "ISO-8859-1";
  }

  // Step 3: obtain the converter manager.
  nsICharsetConverterManager* converterMgr = nsnull;
  rv = nsServiceManager::GetService(kCharsetConverterManagerCID,
                                    kICharsetConverterManagerIID,
                                    (nsISupports**)&converterMgr);
  if (NS_FAILED(rv) || (nsnull == converterMgr))
  {
    return rv;
  }

  // Step 4: ask for an encoder and, if one was produced, swap it in for
  // whatever encoder this sink was holding before.
  nsIUnicodeEncoder* newEncoder = nsnull;
  rv = converterMgr->GetUnicodeEncoder(&charsetName, &newEncoder);
  if (NS_SUCCEEDED(rv) && (nsnull != newEncoder))
  {
    NS_IF_RELEASE(mUnicodeEncoder);
    mUnicodeEncoder = newEncoder;
  }
  nsServiceManager::ReleaseService(kCharsetConverterManagerCID, converterMgr);

  return rv;
}
/**
* Construct a content sink stream.
* @update gess7/7/98
@@ -361,6 +421,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead
mDoHeader = aDoHeader;
mBuffer = nsnull;
mBufferSize = 0;
mUnicodeEncoder = nsnull;
}
/**
@@ -381,6 +442,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoForm
mDoHeader = aDoHeader;
mBuffer = nsnull;
mBufferSize = 0;
mUnicodeEncoder = nsnull;
}
@@ -443,9 +505,16 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
const char* entity = nsnull;
PRUint32 offset = 0;
PRUint32 addedLength = 0;
nsAutoString data;
if (mUnicodeEncoder == nsnull)
InitEncoder("");
if (length > 0)
{
// Step 1. Convert anything that maps to character entity to
// the entity value
EnsureBufferSize(length);
for (PRInt32 i = 0; i < length; i++)
{
@@ -454,29 +523,31 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
entity = UnicodeToEntity(ch);
if (entity)
{
PRUint32 size = strlen(entity);
addedLength += size;
EnsureBufferSize(length+addedLength+1);
mBuffer[offset++] = '&';
mBuffer[offset] = 0;
strcat(mBuffer,entity);
PRUint32 temp = offset + size;
while (offset < temp)
{
mBuffer[offset] = tolower(mBuffer[offset]);
offset++;
}
mBuffer[offset++] = ';';
mBuffer[offset] = 0;
nsAutoString temp(entity);
temp.ToLowerCase();
data.Append('&');
data.Append(temp);
data.Append(';');
}
else if (ch < 128)
else
{
mBuffer[offset++] = (unsigned char)ch;
mBuffer[offset] = 0;
data.Append(ch);
}
}
}
// Step 2. Run the result through the converter
length = data.Length();
EnsureBufferSize(length);
PRInt32 bufferLength = mBufferSize;
mUnicodeEncoder->Reset();
nsresult result = mUnicodeEncoder->Convert(data, &length, mBuffer, &bufferLength);
mBuffer[bufferLength] = 0;
PRInt32 temp = bufferLength;
if (NS_SUCCEEDED(result))
result = mUnicodeEncoder->Finish(mBuffer,&temp);
}
}
@@ -487,6 +558,7 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
* @return
*/
nsHTMLContentSinkStream::~nsHTMLContentSinkStream()
{
  // Let go of the encoder held in mUnicodeEncoder, if one was set.
  NS_IF_RELEASE(mUnicodeEncoder);

  // The output stream was handed to us and is not ours to destroy;
  // simply drop the pointer.
  mOutput = 0;
}
@@ -526,8 +598,8 @@ void nsHTMLContentSinkStream::WriteAttributes(const nsIParserNode& aNode,ostream
key.ToUpperCase();
key.ToCString(mBuffer,sizeof(gBuffer)-1);
EnsureBufferSize(key.Length());
key.ToCString(mBuffer,mBufferSize);
aStream << " " << mBuffer << char(kEqual);
mColPos += 1 + strlen(mBuffer) + 1;
@@ -993,7 +1065,14 @@ nsHTMLContentSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream){
AddStartTag(aNode,aStream);
mHTMLTagStack[--mHTMLStackPos] = eHTMLTag_unknown;
}
if (type == eHTMLTag_text)
else if (type == eHTMLTag_entity)
{
const nsString& entity = aNode.GetText();
UnicodeToHTMLString(entity);
aStream << '&' << mBuffer << ';';
mColPos += entity.Length() + 2;
}
else if (type == eHTMLTag_text)
{
const nsString& text = aNode.GetText();
if ((mDoFormat == PR_FALSE) || preformatted == PR_TRUE)
@@ -1136,9 +1215,25 @@ nsHTMLContentSinkStream::AddComment(const nsIParserNode& aNode){
*/
NS_IMETHODIMP
nsHTMLContentSinkStream::OpenContainer(const nsIParserNode& aNode){
if(mOutput) {
AddStartTag(aNode,*mOutput);
// eHTMLTags tag = (eHTMLTags)aNode.GetNodeType();
if(mOutput)
{
const nsString& name = aNode.GetText();
if (name.Equals("XIF_DOC_INFO"))
{
PRInt32 count=aNode.GetAttributeCount();
for(PRInt32 i=0;i<count;i++)
{
const nsString& key=aNode.GetKeyAt(i);
const nsString& value=aNode.GetValueAt(i);
if (key.Equals("charset"))
InitEncoder(value);
}
}
else
{
AddStartTag(aNode,*mOutput);
}
}
return NS_OK;
}