Mozilla/mozilla/parser/htmlparser/src/nsHTMLTokenizer.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set sw=2 ts=2 et tw=78: */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Blake Kaplan <mrbkap@gmail.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */


/**
 * @file nsHTMLTokenizer.cpp
 * This is an implementation of the nsITokenizer interface.
 * This file contains the implementation of a tokenizer to tokenize an HTML
 * document. It attempts to do so, making tradeoffs between compatibility with
 * older parsers and the SGML specification. Note that most of the real
 * "tokenization" takes place in nsHTMLTokens.cpp.
 */

#include "nsIAtom.h"
#include "nsHTMLTokenizer.h"
#include "nsScanner.h"
#include "nsElementTable.h"
#include "CParserContext.h"
#include "nsReadableUtils.h"
#include "nsUnicharUtils.h"

/************************************************************************
  And now for the main class -- nsHTMLTokenizer...
 ************************************************************************/

/**
 * Satisfy the nsISupports interface.
 */
NS_IMPL_ISUPPORTS1(nsHTMLTokenizer, nsITokenizer)

/**
 * Default constructor
 *
 * @param  aParseMode The current mode the document is in (quirks, etc.)
 * @param  aDocType The document type of the current document
 * @param  aCommand What we are trying to do (view-source, parse a fragment, etc.)
 */
nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode,
                                 eParserDocType aDocType,
                                 eParserCommands aCommand,
                                 PRUint16 aFlags) :
  nsITokenizer(), mTokenDeque(0), mFlags(aFlags)
{
  if (aParseMode==eDTDMode_full_standards ||
      aParseMode==eDTDMode_almost_standards) {
    mFlags |= NS_IPARSER_FLAG_STRICT_MODE;
  }
  else if (aParseMode==eDTDMode_quirks)  {
    mFlags |= NS_IPARSER_FLAG_QUIRKS_MODE;
  }
  else if (aParseMode==eDTDMode_autodetect) {
    mFlags |= NS_IPARSER_FLAG_AUTO_DETECT_MODE;
  }
  else {
    mFlags |= NS_IPARSER_FLAG_UNKNOWN_MODE;
  }

  if (aDocType==ePlainText) {
    mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
  }
  else if (aDocType==eXML) {
    mFlags |= NS_IPARSER_FLAG_XML;
  }
  else if (aDocType==eHTML_Quirks ||
           aDocType==eHTML3_Quirks ||
           aDocType==eHTML_Strict) {
    mFlags |= NS_IPARSER_FLAG_HTML;
  }

  mFlags |= (aCommand==eViewSource)? NS_IPARSER_FLAG_VIEW_SOURCE:NS_IPARSER_FLAG_VIEW_NORMAL;

  NS_ASSERTION(!(mFlags & NS_IPARSER_FLAG_XML) ||
                (mFlags & NS_IPARSER_FLAG_VIEW_SOURCE),
              "Why isn't this XML document going through our XML parser?");

  mTokenAllocator = nsnull;
  mTokenScanPos = 0;
}


/**
 * The destructor ensures that we don't leak any left over tokens.
 */
nsHTMLTokenizer::~nsHTMLTokenizer()
{
  if(mTokenDeque.GetSize()){
    CTokenDeallocator theDeallocator(mTokenAllocator->GetArenaPool());
    mTokenDeque.ForEach(theDeallocator);
  }
}


/*******************************************************************
  Here begins the real working methods for the tokenizer.
 *******************************************************************/

/**
 * Adds a token onto the end of the deque if aResult is a successful result.
 * Otherwise, this function frees aToken and sets it to nsnull.
 *
 * @param aToken The token that wants to be added.
 * @param aResult The error code that will be used to determine if we actually
 *                want to push this token.
 * @param aDeque The deque we want to push aToken onto.
 * @param aTokenAllocator The allocator we use to free aToken in case aResult
 *                        is not a success code.
 */
/* static */
void nsHTMLTokenizer::AddToken(CToken*& aToken,
                               nsresult aResult,
                               nsDeque* aDeque,
                               nsTokenAllocator* aTokenAllocator)
{
  if(aToken && aDeque) {
    if(NS_SUCCEEDED(aResult)) {
      aDeque->Push(aToken);
    }
    else {
      IF_FREE(aToken, aTokenAllocator);
    }
  }
}

/**
 * Retrieve a pointer to the global token recycler...
 *
 * @return Pointer to recycler (or null)
 */
nsTokenAllocator* nsHTMLTokenizer::GetTokenAllocator(void)
{
  return mTokenAllocator;
}


/**
 * This method provides access to the topmost token in the tokenDeque.
 * The token is not really removed from the list.
 *
 * @return Pointer to token
 */
CToken* nsHTMLTokenizer::PeekToken()
{
  return (CToken*)mTokenDeque.PeekFront();
}


/**
 * This method provides access to the topmost token in the tokenDeque.
 * The token is really removed from the list; if the list is empty we return 0.
 *
 * @return Pointer to token or NULL
 */
CToken* nsHTMLTokenizer::PopToken()
{
  CToken* result=nsnull;
  result=(CToken*)mTokenDeque.PopFront();
  return result;
}


/**
 * Pushes a token onto the front of our deque such that the next call to
 * PopToken() or PeekToken() will return that token.
 *
 * @param theToken The next token to be processed
 * @return theToken
 */
CToken* nsHTMLTokenizer::PushTokenFront(CToken* theToken)
{
  mTokenDeque.PushFront(theToken);
  return theToken;
}

/**
 * Pushes a token onto the deque.
 *
 * @param theToken the new token.
 * @return theToken
 */
CToken* nsHTMLTokenizer::PushToken(CToken* theToken)
{
  mTokenDeque.Push(theToken);
  return theToken;
}

/**
 * Returns the size of the deque.
 *
 * @return The number of remaining tokens.
 */
PRInt32 nsHTMLTokenizer::GetCount(void)
{
  return mTokenDeque.GetSize();
}

/**
 * Allows access to an arbitrary token in the deque. The accessed token is left
 * in the deque.
 *
 * @param anIndex The index of the target token. Token 0 would be the same as
 *                the result of a call to PeekToken()
 * @return The requested token.
 */
CToken* nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex)
{
  return (CToken*)mTokenDeque.ObjectAt(anIndex);
}

/**
 * This method is part of the "sandwich" that occurs when we want to tokenize
 * a document. This prepares us to be able to tokenize properly.
 *
 * @param aIsFinalChunk Whether this is the last chunk of data that we will
 *                      get to see.
 * @param aTokenAllocator The token allocator to use for this document.
 * @return Our success in setting up.
 */
nsresult nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk,
                                       nsTokenAllocator* aTokenAllocator)
{
  mTokenAllocator=aTokenAllocator;
  mIsFinalChunk=aIsFinalChunk;
  // Cause ScanDocStructure to search from here for new tokens...
  mTokenScanPos=mTokenDeque.GetSize();
  return NS_OK;
}

/**
 * Pushes all of the tokens in aDeque onto the front of our deque so they
 * get processed before any other tokens.
 *
 * @param aDeque The deque with the tokens in it.
 */
void nsHTMLTokenizer::PrependTokens(nsDeque& aDeque)
{
  PRInt32 aCount=aDeque.GetSize();

  PRInt32 anIndex=0;
  for(anIndex=0;anIndex<aCount;++anIndex){
    CToken* theToken=(CToken*)aDeque.Pop();
    PushTokenFront(theToken);
  }

}

/**
 * Copies the state flags from aTokenizer into this tokenizer. This is used
 * to pass information around between the main tokenizer and tokenizers
 * created for document.write() calls.
 *
 * @param aTokenizer The tokenizer with more information in it.
 * @return NS_OK
 */
nsresult nsHTMLTokenizer::CopyState(nsITokenizer* aTokenizer)
{
  if (aTokenizer) {
    mFlags = ((nsHTMLTokenizer*)aTokenizer)->mFlags;
  }

  return NS_OK;
}

/**
 * This is a utilty method for ScanDocStructure, which finds a given
 * tag in the stack. The return value is meant to be used with
 * nsDeque::ObjectAt() on aTagStack.
 *
 * @param   aTag -- the ID of the tag we're seeking
 * @param   aTagStack -- the stack to be searched
 * @return  index position of tag in stack if found, otherwise kNotFound
 */
static PRInt32 FindLastIndexOfTag(eHTMLTags aTag,nsDeque &aTagStack)
{
  PRInt32 theCount=aTagStack.GetSize();

  while(0<theCount) {
    CHTMLToken *theToken=(CHTMLToken*)aTagStack.ObjectAt(--theCount);
    if(theToken) {
      eHTMLTags  theTag=(eHTMLTags)theToken->GetTypeID();
      if(theTag==aTag) {
        return theCount;
      }
    }
  }

  return kNotFound;
}

/**
 * This method scans the sequence of tokens to determine whether or not the
 * tag structure of the document is well formed. In well formed cases, we can
 * skip doing residual style handling and allow inlines to contain block-level
 * elements.
 *
 * @param aFinalChunk Is unused.
 * @return Success (currently, this function cannot fail).
 */
nsresult nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk)
{
  nsresult result = NS_OK;
  if (!mTokenDeque.GetSize())
    return result;

  CHTMLToken* theToken = (CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos);

  // Start by finding the first start tag that hasn't been reviewed.
  while(mTokenScanPos > 0) {
    if(theToken) {
      eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
      if(eToken_start == theType) {
        if(eFormUnknown == theToken->GetContainerInfo()) {
          break;
        }
      }
    }
    theToken = (CHTMLToken*)mTokenDeque.ObjectAt(--mTokenScanPos);
  }

  // Now that we know where to start, let's walk through the
  // tokens to see which are well-formed. Stop when you run out
  // of fresh tokens.

  nsDeque       theStack(0);
  nsDeque       tempStack(0);
  PRInt32       theStackDepth = 0;
  // Don't bother if we get ridiculously deep.
  static  const PRInt32 theMaxStackDepth = 200;

  while(theToken && theStackDepth < theMaxStackDepth) {
    eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
    eHTMLTags       theTag  = (eHTMLTags)theToken->GetTypeID();

    if(nsHTMLElement::IsContainer(theTag)) { // Bug 54117
      PRBool theTagIsBlock  = gHTMLElements[theTag].IsMemberOf(kBlockEntity);
      PRBool theTagIsInline = (theTagIsBlock) ?
                                PR_FALSE :
                                gHTMLElements[theTag].IsMemberOf(kInlineEntity);

      if(theTagIsBlock || theTagIsInline || eHTMLTag_table == theTag) {
        switch(theType) {
          case eToken_start:
            {
              if (gHTMLElements[theTag].ShouldVerifyHierarchy()) {
                PRInt32 earlyPos = FindLastIndexOfTag(theTag, theStack);
                if (earlyPos != kNotFound) {
                  // Uh-oh, we've found a tag that is not allowed to nest at
                  // all. Mark the previous one and all of its children as
                  // malformed to increase our chances of doing RS handling
                  // on all of them. We want to do this for cases such as:
                  // <a><div><a></a></div></a>.
                  // Note that we have to iterate through all of the chilren
                  // of the original malformed tag to protect against:
                  // <a><font><div><a></a></div></font></a>, so that the <font>
                  // is allowed to contain the <div>.
                  // XXX What about <a><span><a>, where the second <a> closes
                  // the <span>?
                  nsDequeIterator it(theStack, earlyPos), end(theStack.End());
                  while (it < end) {
                    CHTMLToken *theMalformedToken =
                        NS_STATIC_CAST(CHTMLToken*, it++);

                    theMalformedToken->SetContainerInfo(eMalformed);
                  }
                }
              }

              theStack.Push(theToken);
              ++theStackDepth;
            }
            break;
          case eToken_end:
            {
              CHTMLToken *theLastToken = NS_STATIC_CAST(CHTMLToken*, theStack.Peek());
              if(theLastToken) {
                if(theTag == theLastToken->GetTypeID()) {
                  theStack.Pop(); // Yank it for real
                  theStackDepth--;
                  theLastToken->SetContainerInfo(eWellFormed);
                }
                else {
                  // This token wasn't what we expected it to be! We need to
                  // go searching for its real start tag on our stack. Each
                  // tag in between the end tag and start tag must be malformed

                  if(FindLastIndexOfTag(theTag, theStack) != kNotFound) {
                    // Find theTarget in the stack, marking each (malformed!)
                    // tag in our way.
                    theStack.Pop(); // pop off theLastToken for real.
                    do {
                      theLastToken->SetContainerInfo(eMalformed);
                      tempStack.Push(theLastToken);
                      theLastToken = NS_STATIC_CAST(CHTMLToken*, theStack.Pop());
                    } while(theLastToken && theTag != theLastToken->GetTypeID());
                    // XXX The above test can confuse two different userdefined
                    // tags.

                    NS_ASSERTION(theLastToken,
                                 "FindLastIndexOfTag lied to us!"
                                 " We couldn't find theTag on theStack");
                    theLastToken->SetContainerInfo(eMalformed);

                    // Great, now push all of the other tokens back onto the
                    // stack to preserve the general structure of the document.
                    // Note that we don't push the target token back onto the
                    // the stack (since it was just closed).
                    while(tempStack.GetSize() != 0) {
                      theStack.Push(tempStack.Pop());
                    }
                  } // else ignore a bogus end tag.
                }
              } // if (theLastToken)
            }
            break;
          default:
            break;
        }
      }
    }

    theToken = (CHTMLToken*)mTokenDeque.ObjectAt(++mTokenScanPos);
  }

  return result;
}

/**
 * This method is called after we're done tokenizing a chunk of data.
 *
 * @param aFinalChunk Tells us if this was the last chunk of data.
 * @return Error result.
 */
nsresult nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk)
{
  return ScanDocStructure(aFinalChunk);
}

/**
 * This method is repeatedly called by the tokenizer.
 * Each time, we determine the kind of token we're about to
 * read, and then we call the appropriate method to handle
 * that token type.
 *
 * @param  aScanner The source of our input.
 * @param  aFlushTokens An OUT parameter to tell the caller whether it should
 *                      process our queued tokens up to now (e.g., when we
 *                      reach a <script>).
 * @return Success or error
 */
nsresult nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner,PRBool& aFlushTokens)
{
  PRUnichar theChar;
  CToken* theToken=0;

  nsresult result=aScanner.Peek(theChar);

  switch(result) {
    case kEOF:
      // Tell our caller that'we finished.
      return result;

    case NS_OK:
    default:

      if(!(mFlags & NS_IPARSER_FLAG_PLAIN_TEXT)) {
        if(kLessThan==theChar) {
          return ConsumeTag(theChar,theToken,aScanner,aFlushTokens);
        }
        else if(kAmpersand==theChar){
          return ConsumeEntity(theChar,theToken,aScanner);
        }
      }

      if((kCR==theChar) || (kLF==theChar)) {
        return ConsumeNewline(theChar,theToken,aScanner);
      }
      else {
        if(!nsCRT::IsAsciiSpace(theChar)) {
          if(theChar!=nsnull) {
            result=ConsumeText(theToken,aScanner);
          }
          else {
            aScanner.GetChar(theChar); // skip the embedded null char. Fix bug 64098.
          }
          break;
        }
        result=ConsumeWhitespace(theChar,theToken,aScanner);
      }
      break;
  }

  return result;
}


/**
 * This method is called just after a "<" has been consumed
 * and we know we're at the start of some kind of tagged
 * element. We don't know yet if it's a tag or a comment.
 *
 * @param   aChar is the last char read
 * @param   aToken is the out arg holding our new token (the function allocates
 *                 the return token using mTokenAllocator).
 * @param   aScanner represents our input source
 * @param   aFlushTokens is an OUT parameter use to tell consumers to flush
 *                       the current tokens after processing the current one.
 * @return  error code.
 */
nsresult nsHTMLTokenizer::ConsumeTag(PRUnichar aChar,
                                     CToken*& aToken,
                                     nsScanner& aScanner,
                                     PRBool& aFlushTokens)
{
  PRUnichar theNextChar, oldChar;
  nsresult result=aScanner.Peek(aChar,1);

  if(NS_OK==result) {

    switch(aChar) {
      case kForwardSlash:
        result=aScanner.Peek(theNextChar, 2);

        if(NS_OK==result) {
          // Get the original "<" (we've already seen it with a Peek)
          aScanner.GetChar(oldChar);

          // XML allows non ASCII tag names, consume this as an end tag. This
          // is needed to make XML view source work
          PRBool isXML=(mFlags & NS_IPARSER_FLAG_XML);
          if(nsCRT::IsAsciiAlpha(theNextChar)||(kGreaterThan==theNextChar)||
             (isXML && (! nsCRT::IsAscii(theNextChar)))) {
            result=ConsumeEndTag(aChar,aToken,aScanner);
          }
          else result=ConsumeComment(aChar,aToken,aScanner);
        }

        break;

      case kExclamation:
        result=aScanner.Peek(theNextChar, 2);

        if(NS_OK==result) {
          // Get the original "<" (we've already seen it with a Peek)
          aScanner.GetChar(oldChar);

          if((kMinus==theNextChar) || (kGreaterThan==theNextChar)) {
            result=ConsumeComment(aChar,aToken,aScanner);
          }
          else
            result=ConsumeSpecialMarkup(aChar,aToken,aScanner);
        }
        break;

      case kQuestionMark: // It must be an XML processing instruction...
        // Get the original "<" (we've already seen it with a Peek)
        aScanner.GetChar(oldChar);
        result=ConsumeProcessingInstruction(aChar,aToken,aScanner);
        break;

      default:
        // XML allows non ASCII tag names, consume this as a start tag.
        PRBool isXML=(mFlags & NS_IPARSER_FLAG_XML);
        if(nsCRT::IsAsciiAlpha(aChar) ||
            (isXML && (! nsCRT::IsAscii(aChar)))) {
          // Get the original "<" (we've already seen it with a Peek)
          aScanner.GetChar(oldChar);
          result=ConsumeStartTag(aChar,aToken,aScanner,aFlushTokens);
        }
        else {
          // We are not dealing with a tag. So, don't consume the original
          // char and leave the decision to ConsumeText().
          result=ConsumeText(aToken,aScanner);
        }
    }
  }

  // Last ditch attempt to make sure we don't lose data.
  if (kEOF == result && !aScanner.IsIncremental()) {
    // Whoops, we don't want to lose any data! Consume the rest as text.
    // This normally happens for either a trailing < or </
    result = ConsumeText(aToken,aScanner);
  }

  return result;
}

/**
 * This method is called just after we've consumed a start or end
 * tag, and we now have to consume its attributes.
 *
 * @param   aChar is the last char read
 * @param   aToken is the start or end tag that "owns" these attributes.
 * @param   aScanner represents our input source
 * @return  Error result.
 */
nsresult nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar,
                                            CToken* aToken,
                                            nsScanner& aScanner)
{
  PRBool done=PR_FALSE;
  nsresult result=NS_OK;
  PRInt16 theAttrCount=0;

  nsTokenAllocator* theAllocator=this->GetTokenAllocator();

  while((!done) && (result==NS_OK)) {
    CAttributeToken* theToken =
      NS_STATIC_CAST(CAttributeToken*,
                     theAllocator->CreateTokenOfType(eToken_attribute,
                                                     eHTMLTag_unknown));
    if(theToken){
      // Tell the new token to finish consuming text...
      result=theToken->Consume(aChar,aScanner,mFlags);

      // Much as I hate to do this, here's some special case code.
      // This handles the case of empty-tags in XML. Our last
      // attribute token will come through with a text value of ""
      // and a textkey of "/". We should destroy it.
      if(NS_SUCCEEDED(result)) {
        PRBool isUsableAttr = PR_TRUE;
        const nsSubstring& key=theToken->GetKey();
        const nsAString& text=theToken->GetValue();

        if(!key.IsEmpty() && kForwardSlash==key.First() && text.IsEmpty()) {
          if(!(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE)) {
            // We only care about these in view-source.
            isUsableAttr = PR_FALSE;
          }
        }
        if(isUsableAttr) {
          ++theAttrCount;
          AddToken((CToken*&)theToken,result,&mTokenDeque,theAllocator);
        }
        else {
          IF_FREE(theToken, mTokenAllocator);
        }
      }
      else {
        IF_FREE(theToken, mTokenAllocator);
        // Bad attributes are not a reason to set empty.
        if(NS_ERROR_HTMLPARSER_BADATTRIBUTE==result) {
          result=NS_OK;
        } else {
          aToken->SetEmpty(PR_TRUE);
        }
      }
    }

#ifdef DEBUG
    if(NS_SUCCEEDED(result)){
      PRInt32 newline = 0;
      result = aScanner.SkipWhitespace(newline);
      NS_ASSERTION(newline == 0, "CAttribute::Consume() failed to collect all the newlines!");
    }
#endif
    if (NS_SUCCEEDED(result)) {
      result = aScanner.Peek(aChar);
      if (NS_SUCCEEDED(result)) {
        if (aChar == kGreaterThan) { // You just ate the '>'
          aScanner.GetChar(aChar); // Skip the '>'
          done = PR_TRUE;
        }
        else if(aChar == kLessThan) {
          aToken->SetInError(PR_TRUE);
          done = PR_TRUE;
        }
      }
    }
  } // End while

  if (NS_FAILED(result)) {
    aToken->SetInError(PR_TRUE);

    if (!aScanner.IsIncremental()) {
      result = NS_OK;
    }
  }

  aToken->SetAttributeCount(theAttrCount);
  return result;
}

/**
 * This method consumes a start tag and all of its attributes.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token. (allocated
 *               by the function using mTokenAllocator
 * @param aScanner Our source of data
 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
 *                     the current tokens after processing the current one.
 * @return Error result.
 */
nsresult nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar,
                                          CToken*& aToken,
                                          nsScanner& aScanner,
                                          PRBool& aFlushTokens)
{
  // Remember this for later in case you have to unwind...
  PRInt32 theDequeSize=mTokenDeque.GetSize();
  nsresult result=NS_OK;

  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
  aToken=theAllocator->CreateTokenOfType(eToken_start,eHTMLTag_unknown);

  if(aToken) {
    // Tell the new token to finish consuming text...
    result= aToken->Consume(aChar,aScanner,mFlags);

    if(NS_SUCCEEDED(result)) {
      AddToken(aToken,result,&mTokenDeque,theAllocator);

      eHTMLTags theTag=(eHTMLTags)aToken->GetTypeID();

      // Good. Now, let's see if the next char is ">".
      // If so, we have a complete tag, otherwise, we have attributes.
      result = aScanner.Peek(aChar);
      if (NS_FAILED(result)) {
        aToken->SetInError(PR_TRUE);

        // Don't return early here so we can create a text and end token for
        // the special <iframe>, <script> and similar tags down below.
        result = NS_OK;
      }
      else {
        if(kGreaterThan != aChar) { // Look for a '>'
          result = ConsumeAttributes(aChar, aToken, aScanner);
        }
        else {
          aScanner.GetChar(aChar);
        }
      }

      /*  Now that that's over with, we have one more problem to solve.
          In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
          consume all the content itself.
          But XML doesn't treat these tags differently, so we shouldn't if the
          document is XML.
       */
      if(NS_SUCCEEDED(result) && !(mFlags & NS_IPARSER_FLAG_XML)) {
        PRBool isCDATA = gHTMLElements[theTag].CanContainType(kCDATA);
        PRBool isPCDATA = eHTMLTag_textarea == theTag ||
                          eHTMLTag_title    == theTag;

        // XXX This is an evil hack, we should be able to handle these properly
        // in the DTD.
        if ((eHTMLTag_iframe == theTag   && (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
            (eHTMLTag_noframes == theTag && (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
            (eHTMLTag_noscript == theTag && (mFlags & NS_IPARSER_FLAG_SCRIPT_ENABLED)) ||
            (eHTMLTag_noembed == theTag)) {
          isCDATA = PR_TRUE;
        }

        // Plaintext contains CDATA, but it's special, so we handle it
        // differently than the other CDATA elements
        if (eHTMLTag_plaintext == theTag) {
          isCDATA = PR_FALSE;

          // Note: We check in ConsumeToken() for this flag, and if we see it
          // we only construct text tokens (which is what we want).
          mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
        }


        if (isCDATA || isPCDATA) {
          PRBool done = PR_FALSE;
          nsDependentString endTagName(nsHTMLTags::GetStringValue(theTag));

          CToken* text =
              theAllocator->CreateTokenOfType(eToken_text,eHTMLTag_text);
          CTextToken* textToken = NS_STATIC_CAST(CTextToken*,text);

          if (isCDATA) {
            // The only tags that consume conservatively are <script> and
            // <style>, the rest all consume until the end of the document.
            result = textToken->ConsumeCharacterData(theTag==eHTMLTag_script ||
                                                     theTag==eHTMLTag_style,
                                                     theTag!=eHTMLTag_script,
                                                     aScanner,
                                                     endTagName,
                                                     mFlags,
                                                     done);

            // Only flush tokens for <script>, to give ourselves more of a
            // chance of allowing inlines to contain blocks.
            aFlushTokens = done && theTag == eHTMLTag_script;
          }
          else if (isPCDATA) {
            // Title is consumed conservatively in order to not regress
            // bug 42945
            result = textToken->ConsumeParsedCharacterData(
                                                        theTag==eHTMLTag_textarea,
                                                        theTag==eHTMLTag_title,
                                                        aScanner,
                                                        endTagName,
                                                        mFlags,
                                                        done);

            // Note: we *don't* set aFlushTokens here.
          }

          // We want to do this unless result is kEOF, in which case we will
          // simply unwind our stack and wait for more data anyway.
          if (kEOF != result) {
            AddToken(text,NS_OK,&mTokenDeque,theAllocator);
            CToken* endToken = nsnull;

            if (NS_SUCCEEDED(result) && done) {
              PRUnichar theChar;
              // Get the <
              result = aScanner.GetChar(theChar);
              NS_ASSERTION(NS_SUCCEEDED(result) && theChar == kLessThan,
                           "CTextToken::Consume*Data is broken!");
#ifdef DEBUG
              // Ensure we have a /
              PRUnichar tempChar;  // Don't change non-debug vars in debug-only code
              result = aScanner.Peek(tempChar);
              NS_ASSERTION(NS_SUCCEEDED(result) && tempChar == kForwardSlash,
                           "CTextToken::Consume*Data is broken!");
#endif
              result = ConsumeEndTag(PRUnichar('/'),endToken,aScanner);
            } else if (result == kFakeEndTag &&
                      !(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE)) {
              result = NS_OK;
              endToken=theAllocator->CreateTokenOfType(eToken_end,theTag,endTagName);
              AddToken(endToken,result,&mTokenDeque,theAllocator);
            } else if (result == kFakeEndTag) {
              // If we are here, we are both faking having seen the end tag
              // and are in view-source.
              result = NS_OK;
            }
          }
          else {
            IF_FREE(text, mTokenAllocator);
          }
        }
      }

      // This code is confusing, so pay attention.
      // If you're here, it's because we were in the midst of consuming a start
      // tag but ran out of data (not in the stream, but in this *part* of the
      // stream. For simplicity, we have to unwind our input. Therefore, we pop
      // and discard any new tokens we've cued this round. Later we can get
      // smarter about this.
      if(NS_FAILED(result)) {
        while(mTokenDeque.GetSize()>theDequeSize) {
          CToken* theToken=(CToken*)mTokenDeque.Pop();
          IF_FREE(theToken, mTokenAllocator);
        }
      }
    }
    else IF_FREE(aToken, mTokenAllocator);
  }
  return result;
}

/**
 * This method consumes an end tag and any "attributes" that may come after it.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result
 */
nsresult nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar,
                                        CToken*& aToken,
                                        nsScanner& aScanner)
{
  // Get the "/" (we've already seen it with a Peek)
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
  aToken=theAllocator->CreateTokenOfType(eToken_end,eHTMLTag_unknown);
  // Remember this for later in case you have to unwind...
  PRInt32 theDequeSize=mTokenDeque.GetSize();
  nsresult result=NS_OK;

  if(aToken) {
    // Tell the new token to finish consuming text...
    result= aToken->Consume(aChar,aScanner,mFlags);
    AddToken(aToken,result,&mTokenDeque,theAllocator);
    if (NS_FAILED(result)) {
      // Note that this early-return here is safe because we have not yet
      // added any of our tokens to the queue (AddToken only adds the token if
      // result is a success), so we don't need to fall through.
      return result;
    }

    result = aScanner.Peek(aChar);
    if (NS_FAILED(result)) {
      aToken->SetInError(PR_TRUE);

      // Note: We know here that the scanner is not incremental since if
      // this peek fails, then we've already masked over a kEOF coming from
      // the Consume() call above.
      return NS_OK;
    }

    if(kGreaterThan != aChar) {
      result = ConsumeAttributes(aChar, aToken, aScanner);
    }
    else {
      aScanner.GetChar(aChar);
    }

    // Do the same thing as we do in ConsumeStartTag. Basically, if we've run
    // out of room in this *section* of the document, pop all of the tokens
    // we've consumed this round and wait for more data.
    if(NS_FAILED(result)) {
      while(mTokenDeque.GetSize()>theDequeSize) {
        CToken* theToken=(CToken*)mTokenDeque.Pop();
        IF_FREE(theToken, mTokenAllocator);
      }
    }
  }
  return result;
}

/**
 *  This method is called just after a "&" has been consumed
 *  and we know we're at the start of an entity.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar,
                                        CToken*& aToken,
                                        nsScanner& aScanner)
{
  PRUnichar  theChar;
  nsresult result=aScanner.Peek(theChar, 1);

  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
  if (NS_SUCCEEDED(result)) {
    if (nsCRT::IsAsciiAlpha(theChar) || theChar==kHashsign) {
      aToken = theAllocator->CreateTokenOfType(eToken_entity,eHTMLTag_entity);
      result=aToken->Consume(theChar,aScanner,mFlags);

      if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
        IF_FREE(aToken, mTokenAllocator);
      }
      else {
        if (!aScanner.IsIncremental() && result == kEOF) {
          result=NS_OK; // Use as much of the entity as you can get.
        }
        AddToken(aToken,result,&mTokenDeque,theAllocator);
        return result;
      }
    }
    // Oops, we're actually looking at plain text...
    result = ConsumeText(aToken,aScanner);
  }
  else if (result == kEOF && !aScanner.IsIncremental()) {
    // If the last character in the file is an &, consume it as text.
    result = ConsumeText(aToken, aScanner);
    if (aToken) {
      aToken->SetInError(PR_TRUE);
    }
  }
  return result;
}


/**
 *  This method is called just after whitespace has been
 *  consumed and we know we're at the start a whitespace run.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar,
                                            CToken*& aToken,
                                            nsScanner& aScanner)
{
  // Get the whitespace character
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
  aToken = theAllocator->CreateTokenOfType(eToken_whitespace,eHTMLTag_whitespace);
  nsresult result=NS_OK;
  if(aToken) {
    result=aToken->Consume(aChar,aScanner,mFlags);
    AddToken(aToken,result,&mTokenDeque,theAllocator);
  }
  return result;
}

/**
 *  This method is called just after a "<!" has been consumed
 *  and we know we're at the start of a comment.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult nsHTMLTokenizer::ConsumeComment(PRUnichar aChar,
                                         CToken*& aToken,
                                         nsScanner& aScanner)
{
  // Get the "!"
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
  aToken = theAllocator->CreateTokenOfType(eToken_comment,eHTMLTag_comment);
  nsresult result=NS_OK;
  if(aToken) {
    result=aToken->Consume(aChar,aScanner,mFlags);
    AddToken(aToken,result,&mTokenDeque,theAllocator);
  }

  if (kNotAComment == result) {
    // AddToken has IF_FREE()'d our token, so...
    result = ConsumeText(aToken, aScanner);
  }

  return result;
}

/**
 * This method is called just after a known text char has
 * been consumed and we should read a text run. Note: we actually ignore the
 * first character of the text run so that we can consume invalid markup
 * as text.
 *
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult nsHTMLTokenizer::ConsumeText(CToken*& aToken,nsScanner& aScanner)
{
  nsresult result=NS_OK;
  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
  CTextToken* theToken = (CTextToken*)theAllocator->CreateTokenOfType(eToken_text,eHTMLTag_text);
  if(theToken) {
    PRUnichar ch=0;
    result=theToken->Consume(ch,aScanner,mFlags);
    if(NS_FAILED(result)) {
      if(0==theToken->GetTextLength()){
        IF_FREE(aToken, mTokenAllocator);
        aToken = nsnull;
      }
      else result=NS_OK;
    }
    aToken = theToken;
    AddToken(aToken,result,&mTokenDeque,theAllocator);
  }
  return result;
}

/**
 * This method is called just after a "<!" has been consumed.
 * NOTE: Here we might consume DOCTYPE and "special" markups.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar,
                                               CToken*& aToken,
                                               nsScanner& aScanner)
{
  // Get the "!"
  aScanner.GetChar(aChar);

  nsresult result=NS_OK;
  nsAutoString theBufCopy;
  aScanner.Peek(theBufCopy, 20);
  ToUpperCase(theBufCopy);
  PRInt32 theIndex=theBufCopy.Find("DOCTYPE");
  nsTokenAllocator* theAllocator=this->GetTokenAllocator();

  if(theIndex==kNotFound) {
    if('['==theBufCopy.CharAt(0)) {
      aToken = theAllocator->CreateTokenOfType(eToken_cdatasection,eHTMLTag_comment);
    } else if (StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ELEMENT")) ||
               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ATTLIST")) ||
               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ENTITY")) ||
               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("NOTATION"))) {
      aToken = theAllocator->CreateTokenOfType(eToken_markupDecl,eHTMLTag_markupDecl);
    } else {
      aToken = theAllocator->CreateTokenOfType(eToken_comment,eHTMLTag_comment);
    }
  }
  else
    aToken = theAllocator->CreateTokenOfType(eToken_doctypeDecl,eHTMLTag_doctypeDecl);

  if(aToken) {
    result=aToken->Consume(aChar,aScanner,mFlags);
    AddToken(aToken,result,&mTokenDeque,theAllocator);
  }

  if (result == kNotAComment) {
    result = ConsumeText(aToken, aScanner);
  }

  return result;
}

/**
 * This method is called just after a newline has been consumed.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar,
                                         CToken*& aToken,
                                         nsScanner& aScanner)
{
  // Get the newline character
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
  aToken=theAllocator->CreateTokenOfType(eToken_newline,eHTMLTag_newline);
  nsresult result=NS_OK;
  if(aToken) {
    result=aToken->Consume(aChar,aScanner,mFlags);
    AddToken(aToken,result,&mTokenDeque,theAllocator);
  }
  return result;
}


/**
 * This method is called just after a <? has been consumed.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar,
                                                       CToken*& aToken,
                                                       nsScanner& aScanner)
{
  // Get the "?"
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator=this->GetTokenAllocator();
  aToken=theAllocator->CreateTokenOfType(eToken_instruction,eHTMLTag_unknown);
  nsresult result=NS_OK;
  if(aToken) {
    result=aToken->Consume(aChar,aScanner,mFlags);
    AddToken(aToken,result,&mTokenDeque,theAllocator);
  }
  return result;
}