/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- * * The contents of this file are subject to the Netscape Public * License Version 1.1 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.mozilla.org/NPL/ * * Software distributed under the License is distributed on an "AS * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or * implied. See the License for the specific language governing * rights and limitations under the License. * * The Original Code is Mozilla Communicator client code. * * The Initial Developer of the Original Code is Netscape Communications * Corporation. Portions created by Netscape are * Copyright (C) 1998 Netscape Communications Corporation. All * Rights Reserved. * * Contributor(s): * Greg Kostello (original structure) * Akkana Peck * Daniel Bratell * Ben Bucksch * Pierre Phaneuf */ /** * MODULE NOTES: * * This file declares the concrete TXT ContentSink class. * This class is used during the parsing process as the * primary interface between the parser and the content * model. */ #include "nsHTMLToTXTSinkStream.h" #include "nsHTMLTokens.h" #include "nsString.h" #include "nsIParser.h" #include "nsHTMLEntities.h" #include "nsXIFDTD.h" #include "prprf.h" // For PR_snprintf() #include "nsIDocumentEncoder.h" // for output flags #include "nsIUnicodeEncoder.h" #include "nsICharsetAlias.h" #include "nsIServiceManager.h" #include "nsICharsetConverterManager.h" #include "nsIOutputStream.h" #include "nsFileStream.h" static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID); const PRInt32 gTabSize=4; const PRInt32 gOLNumberWidth = 3; const PRInt32 gIndentSizeList = (gTabSize > gOLNumberWidth+3) ? gTabSize: gOLNumberWidth+3; // Indention of non-first lines of ul and ol static PRBool IsInline(eHTMLTags aTag); static PRBool IsBlockLevel(eHTMLTags aTag); /** * Inits the encoder instance variable for the sink based on the charset * * @update gpk 4/21/99 * @param aCharset * @return NS_xxx error result */ nsresult nsHTMLToTXTSinkStream::InitEncoder(const nsString& aCharset) { nsresult res = NS_OK; // If the converter is ucs2, then do not use a converter if (aCharset.EqualsWithConversion("ucs2")) { NS_IF_RELEASE(mUnicodeEncoder); return res; } nsICharsetAlias* calias = nsnull; res = nsServiceManager::GetService(kCharsetAliasCID, kICharsetAliasIID, (nsISupports**)&calias); NS_ASSERTION( nsnull != calias, "cannot find charset alias"); nsAutoString charsetName = aCharset; if( NS_SUCCEEDED(res) && (nsnull != calias)) { res = calias->GetPreferred(aCharset, charsetName); nsServiceManager::ReleaseService(kCharsetAliasCID, calias); if(NS_FAILED(res)) { // failed - unknown alias , fallback to ISO-8859-1 charsetName.AssignWithConversion("ISO-8859-1"); } nsICharsetConverterManager * ccm = nsnull; res = nsServiceManager::GetService(kCharsetConverterManagerCID, NS_GET_IID(nsICharsetConverterManager), (nsISupports**)&ccm); if(NS_SUCCEEDED(res) && (nsnull != ccm)) { nsIUnicodeEncoder * encoder = nsnull; res = ccm->GetUnicodeEncoder(&charsetName, &encoder); if(NS_SUCCEEDED(res) && (nsnull != encoder)) { NS_IF_RELEASE(mUnicodeEncoder); mUnicodeEncoder = encoder; } nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm); } } return res; } /** * This method gets called as part of our COM-like interfaces. * Its purpose is to create an interface to parser object * of some type. * * @update gpk02/03/99 * @param nsIID id of object to discover * @param aInstancePtr ptr to newly discovered interface * @return NS_xxx result code */ nsresult nsHTMLToTXTSinkStream::QueryInterface(const nsIID& aIID, void** aInstancePtr) { if (NULL == aInstancePtr) { return NS_ERROR_NULL_POINTER; } if(aIID.Equals(NS_GET_IID(nsISupports))) { *aInstancePtr = (nsIContentSink*)(this); } else if(aIID.Equals(NS_GET_IID(nsIContentSink))) { *aInstancePtr = (nsIContentSink*)(this); } else if(aIID.Equals(NS_GET_IID(nsIHTMLContentSink))) { *aInstancePtr = (nsIHTMLContentSink*)(this); } else if(aIID.Equals(NS_GET_IID(nsIHTMLToTXTSinkStream))) { *aInstancePtr = (nsIHTMLToTXTSinkStream*)(this); } else { *aInstancePtr=0; return NS_NOINTERFACE; } NS_ADDREF_THIS(); return NS_OK; } NS_IMPL_ADDREF(nsHTMLToTXTSinkStream) NS_IMPL_RELEASE(nsHTMLToTXTSinkStream) // Someday may want to make this non-const: static const PRUint32 TagStackSize = 500; static const PRUint32 OLStackSize = 100; /** * Construct a content sink stream. * @update gpk02/03/99 * @param * @return */ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() { NS_INIT_REFCNT(); mColPos = 0; mIndent = 0; mCiteQuoteLevel = 0; mDoFragment = PR_FALSE; mBufferSize = 0; mBufferLength = 0; mBuffer = nsnull; mUnicodeEncoder = nsnull; mWrapColumn = 72; // XXX magic number, we expect someone to reset this // Flow mEmptyLines=1; // The start of the document is an "empty line" in itself, mInWhitespace = PR_TRUE; mPreFormatted = PR_FALSE; mCacheLine = PR_FALSE; // initialize the tag stack to zero: mTagStack = new nsHTMLTag[TagStackSize]; mTagStackIndex = 0; // initialize the OL stack, where numbers for ordered lists are kept: mOLStack = new PRInt32[OLStackSize]; mOLStackIndex = 0; } /** * * @update gpk02/03/99 * @param * @return */ nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream() { NS_WARN_IF_FALSE(mCurrentLine.Length() == 0, "Buffer not flushed! Probably illegal input to class."); if(mBuffer) delete[] mBuffer; delete[] mTagStack; delete[] mOLStack; NS_IF_RELEASE(mUnicodeEncoder); } /** * * @update gpk04/30/99 * @param * @return */ NS_IMETHODIMP nsHTMLToTXTSinkStream::Initialize(nsIOutputStream* aOutStream, nsString* aOutString, PRUint32 aFlags) { mStream = aOutStream; mString = aOutString; mFlags = aFlags; return NS_OK; } /** * * @update gpk04/30/99 * @param * @return */ NS_IMETHODIMP nsHTMLToTXTSinkStream::SetCharsetOverride(const nsString* aCharset) { if (aCharset) { mCharsetOverride = *aCharset; InitEncoder(mCharsetOverride); } return NS_OK; } /** * This method gets called by the parser when it encounters * a title tag and wants to set the document title in the sink. * * @update gpk02/03/99 * @param nsString reference to new title value * @return PR_TRUE if successful. */ NS_IMETHODIMP nsHTMLToTXTSinkStream::SetTitle(const nsString& aValue) { return NS_OK; } /** * All these HTML-specific methods may be called, or may not, * depending on whether the parser is parsing XIF or HTML. * So we can't depend on them; instead, we have Open/CloseContainer * do all the specialized work, and the html-specific Open/Close * methods must call the more general methods. * Since there are so many of them, make a macro: */ #define USE_GENERAL_OPEN_METHOD(opentag) \ NS_IMETHODIMP \ nsHTMLToTXTSinkStream::opentag(const nsIParserNode& aNode) \ { return OpenContainer(aNode); } #define USE_GENERAL_CLOSE_METHOD(closetag) \ NS_IMETHODIMP \ nsHTMLToTXTSinkStream::closetag(const nsIParserNode& aNode) \ { return CloseContainer(aNode); } USE_GENERAL_OPEN_METHOD(OpenHTML) USE_GENERAL_CLOSE_METHOD(CloseHTML) USE_GENERAL_OPEN_METHOD(OpenHead) USE_GENERAL_CLOSE_METHOD(CloseHead) USE_GENERAL_OPEN_METHOD(OpenBody) USE_GENERAL_CLOSE_METHOD(CloseBody) USE_GENERAL_OPEN_METHOD(OpenForm) USE_GENERAL_CLOSE_METHOD(CloseForm) USE_GENERAL_OPEN_METHOD(OpenMap) USE_GENERAL_CLOSE_METHOD(CloseMap) USE_GENERAL_OPEN_METHOD(OpenFrameset) USE_GENERAL_CLOSE_METHOD(CloseFrameset) NS_IMETHODIMP nsHTMLToTXTSinkStream::DoFragment(PRBool aFlag) { mDoFragment = aFlag; return NS_OK; } /** * This gets called when handling illegal contents, especially * in dealing with tables. This method creates a new context. * * @update 04/04/99 harishd * @param aPosition - The position from where the new context begins. */ NS_IMETHODIMP nsHTMLToTXTSinkStream::BeginContext(PRInt32 aPosition) { return NS_OK; } /** * This method terminates any new context that got created by * BeginContext and switches back to the main context. * * @update 04/04/99 harishd * @param aPosition - Validates the end of a context. */ NS_IMETHODIMP nsHTMLToTXTSinkStream::EndContext(PRInt32 aPosition) { return NS_OK; } /** * This gets called by the parser when you want to add * a PI node to the current container in the content * model. * * @updated gpk02/03/99 * @param * @return */ NS_IMETHODIMP nsHTMLToTXTSinkStream::AddProcessingInstruction(const nsIParserNode& aNode){ return NS_OK; } /** * This gets called by the parser when it encounters * a DOCTYPE declaration in the HTML document. */ NS_IMETHODIMP nsHTMLToTXTSinkStream::AddDocTypeDecl(const nsIParserNode& aNode, PRInt32 aMode) { return NS_OK; } /** * This gets called by the parser when you want to add * a comment node to the current container in the content * model. * * @updated gpk02/03/99 * @param * @return */ NS_IMETHODIMP nsHTMLToTXTSinkStream::AddComment(const nsIParserNode& aNode) { // Skip comments in plaintext output return NS_OK; } NS_IMETHODIMP nsHTMLToTXTSinkStream::GetValueOfAttribute(const nsIParserNode& aNode, char* aMatchKey, nsString& aValueRet) { nsAutoString matchKey; matchKey.AssignWithConversion(aMatchKey); PRInt32 count=aNode.GetAttributeCount(); for (PRInt32 i=0;i can turn on cacheing unless it's already preformatted if(!(mFlags & nsIDocumentEncoder::OutputPreformatted) && ((mFlags & nsIDocumentEncoder::OutputFormatted) || (mFlags & nsIDocumentEncoder::OutputWrap))) { mCacheLine = PR_TRUE; } // Try to figure out here whether we have a // preformatted style attribute. // // Trigger on the presence of a "-moz-pre-wrap" in the // style attribute. That's a very simplistic way to do // it, but better than nothing. // Also set mWrapColumn to the value given there // (which arguably we should only do if told to do so). nsString style; PRInt32 whitespace; if(NS_SUCCEEDED(GetValueOfAttribute(aNode, "style", style)) && (-1 != (whitespace = style.Find("white-space:")))) /* DELETEME: What, if the style is defined in an external stylesheet? */ { if (-1 != style.Find("-moz-pre-wrap", PR_TRUE, whitespace)) { #ifdef DEBUG_preformatted printf("Set mPreFormatted based on style moz-pre-wrap\n"); #endif mPreFormatted = PR_TRUE; mCacheLine = PR_TRUE; PRInt32 widthOffset = style.Find("width:"); if (widthOffset >= 0) { // We have to search for the ch before the semicolon, // not for the semicolon itself, because nsString::ToInteger() // considers 'c' to be a valid numeric char (even if radix=10) // but then gets confused if it sees it next to the number // when the radix specified was 10, and returns an error code. PRInt32 semiOffset = style.Find("ch", widthOffset+6); PRInt32 length = (semiOffset > 0 ? semiOffset - widthOffset - 6 : style.Length() - widthOffset); nsString widthstr; style.Mid(widthstr, widthOffset+6, length); PRInt32 err; PRInt32 col = widthstr.ToInteger(&err); if (NS_SUCCEEDED(err)) { SetWrapColumn((PRUint32)col); #ifdef DEBUG_preformatted printf("Set wrap column to %d based on style\n", mWrapColumn); #endif } } } else if (-1 != style.Find("pre", PR_TRUE, whitespace)) { #ifdef DEBUG_preformatted printf("Set mPreFormatted based on style pre\n"); #endif mPreFormatted = PR_TRUE; mCacheLine = PR_TRUE; SetWrapColumn(0); } } else { mPreFormatted = PR_FALSE; mCacheLine = PR_TRUE; // Cache lines unless something else tells us not to } return NS_OK; } if (!DoOutput()) return NS_OK; if (type == eHTMLTag_p || type == eHTMLTag_pre) EnsureVerticalSpace(1); // Should this be 0 in unformatted case? // Else make sure we'll separate block level tags, // even if we're about to leave before doing any other formatting. // Oddly, I can't find a case where this actually makes any difference. //else if (IsBlockLevel(type)) // EnsureVerticalSpace(0); // The rest of this routine is formatted output stuff, // which we should skip if we're not formatted: if (!(mFlags & nsIDocumentEncoder::OutputFormatted)) return NS_OK; if (type == eHTMLTag_ul) { // Indent here to support nested list, which aren't included in li :-( EnsureVerticalSpace(1); // Must end the current line before we change indent. mIndent += gIndentSizeList; } else if (type == eHTMLTag_ol) { EnsureVerticalSpace(1); // Must end the current line before we change indent. if (mOLStackIndex < OLStackSize) mOLStack[mOLStackIndex++] = 1; // XXX should get it from the node! mIndent += gIndentSizeList; // see ul } else if (type == eHTMLTag_li) { if (mTagStackIndex > 1 && mTagStack[mTagStackIndex-2] == eHTMLTag_ol) { if (mOLStackIndex > 0) // This is what nsBulletFrame does for OLs: mInIndentString.AppendInt(mOLStack[mOLStackIndex-1]++, 10); else mInIndentString.AppendWithConversion("#"); mInIndentString.AppendWithConversion('.'); } else mInIndentString.AppendWithConversion('*'); mInIndentString.AppendWithConversion(' '); } else if (type == eHTMLTag_blockquote) { EnsureVerticalSpace(0); // Find out whether it's a type=cite, and insert "> " instead. // Eventually we should get the value of the pref controlling citations, // and handle AOL-style citations as well. // If we want to support RFC 2646 (and we do!) we have to have: // >>>> text // >>> fdfd // when a mail is sent. nsString value; if (NS_SUCCEEDED(GetValueOfAttribute(aNode, "type", value)) && value.StripChars("\"").EqualsWithConversion("cite", PR_TRUE)) mCiteQuoteLevel++; else mIndent += gTabSize; // Check for some maximum value? } else if (type == eHTMLTag_a) { nsAutoString url; if (NS_SUCCEEDED(GetValueOfAttribute(aNode, "href", url)) && !url.IsEmpty()) mURL = url.StripChars("\""); } else if (type == eHTMLTag_img) { nsAutoString url; if (NS_SUCCEEDED(GetValueOfAttribute(aNode, "src", url)) && !url.IsEmpty()) { nsAutoString temp, desc; if (NS_SUCCEEDED(GetValueOfAttribute(aNode, "alt", desc)) && !desc.IsEmpty()) { temp.AppendWithConversion(" ("); temp += desc.StripChars("\""); temp.AppendWithConversion(" <"); temp += url.StripChars("\""); temp.AppendWithConversion(">) "); } else { temp.AppendWithConversion(" <"); temp += url.StripChars("\""); temp.AppendWithConversion("> "); } Write(temp); } } else if (type == eHTMLTag_sup) Write( NS_ConvertToString("^") ); // I don't know a plain text representation of sub else if (type == eHTMLTag_strong || type == eHTMLTag_b) Write( NS_ConvertToString("*") ); else if (type == eHTMLTag_em || type == eHTMLTag_i) Write( NS_ConvertToString("/") ); else if (type == eHTMLTag_u) Write( NS_ConvertToString("_") ); return NS_OK; } /** * This method is used to close a generic container. * * @update 07/12/98 gpk * @param nsIParserNode reference to parser node interface * @return PR_TRUE if successful. */ NS_IMETHODIMP nsHTMLToTXTSinkStream::CloseContainer(const nsIParserNode& aNode) { eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); #ifdef DEBUG_bratell printf("CloseContainer: %d ", type); #endif if (mTagStackIndex > 0) --mTagStackIndex; // End current line if we're ending a block level tag if (IsBlockLevel(type)) { if((type == eHTMLTag_body) || (type == eHTMLTag_html)) { // We want the output to end with a new line, // but in preformatted areas like text fields, // we can't emit newlines that weren't there. // So add the newline only in the case of formatted output. if (mFlags & nsIDocumentEncoder::OutputFormatted) EnsureVerticalSpace(0); else FlushLine(); } else if ((type == eHTMLTag_tr) || (type == eHTMLTag_li) || (type == eHTMLTag_pre) || (type == eHTMLTag_blockquote)) { EnsureVerticalSpace(0); } else if (type != eHTMLTag_script) { // All other blocks get 1 vertical space after them // in formatted mode, otherwise 0. // This is hard. Sometimes 0 is a better number, but // how to know? EnsureVerticalSpace((mFlags & nsIDocumentEncoder::OutputFormatted) ? 1 : 0); } } else if (type == eHTMLTag_td) { // We are after a table cell an thus maybe between two cells. // Something should be done to avoid the two cells to be written // together. This really need some intelligence about how the // contents in the cell looks. // Fow now, I will only add a SPACE. Could be a TAB or something // else but I'm not sure everything can handle the TAB so SPACE // seems like a better solution. if(!mInWhitespace) { // Maybe add something else? Several spaces? A TAB? SPACE+TAB? if(mCacheLine) { AddToLine(NS_ConvertToString(" ").GetUnicode(), 1); } else { nsAutoString space; space.AssignWithConversion(" "); WriteSimple(space); } mInWhitespace = PR_TRUE; } } // The rest of this routine is formatted output stuff, // which we should skip if we're not formatted: if (!(mFlags & nsIDocumentEncoder::OutputFormatted)) return NS_OK; if (type == eHTMLTag_ul) { mIndent -= gIndentSizeList; } else if (type == eHTMLTag_ol) { FlushLine(); // Doing this after decreasing OLStackIndex would be wrong. --mOLStackIndex; mIndent -= gIndentSizeList; } else if (type == eHTMLTag_blockquote) { FlushLine(); if (mCiteQuoteLevel>0) mCiteQuoteLevel--; else if(mIndent >= gTabSize) mIndent -= gTabSize; } else if (type == eHTMLTag_a) { // these brackets must stay here if (!mURL.IsEmpty()) { nsAutoString temp; temp.AssignWithConversion(" <"); temp += mURL; temp.AppendWithConversion(">"); Write(temp); mURL.Truncate(); } } else if (type == eHTMLTag_sup) Write( NS_ConvertToString(" ") ); else if (type == eHTMLTag_strong || type == eHTMLTag_b) Write( NS_ConvertToString("*") ); else if (type == eHTMLTag_em || type == eHTMLTag_i) Write( NS_ConvertToString("/") ); else if (type == eHTMLTag_u) Write( NS_ConvertToString("_") ); return NS_OK; } /** * This method is used to add a leaf to the currently * open container. * * @update 07/12/98 gpk * @param nsIParserNode reference to parser node interface * @return PR_TRUE if successful. */ NS_IMETHODIMP nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode) { #ifdef DEBUG_bratell printf("Addleaf: %d (%d) ", (eHTMLTags)aNode.GetNodeType(),mFlags); #endif // If we don't want any output, just return if (!DoOutput()) return NS_OK; eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); nsString text = aNode.GetText(); #ifdef DEBUG_bratell printf(" '%s' ", text.ToNewCString()); #endif if (mTagStackIndex > 1 && mTagStack[mTagStackIndex-2] == eHTMLTag_select) { // Don't output the contents of SELECT elements; // Might be nice, eventually, to output just the selected element. return NS_OK; } else if (mTagStackIndex > 0 && mTagStack[mTagStackIndex-1] == eHTMLTag_script) { // Don't output the contents of