Mozilla/mozilla/parser/htmlparser/src/nsDTDUtils.h

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: NPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the NPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the NPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */


/**
 * MODULE NOTES:
 * @update  gess 4/1/98
 *
 */


#ifndef DTDUTILS_
#define DTDUTILS_

#include "nsHTMLTags.h"
#include "nsHTMLTokens.h"
#include "nsIParser.h"
#include "nsCRT.h"
#include "nsDeque.h"
#include "nsIDTD.h"
#include "nsITokenizer.h"
#include "nsString.h"
#include "nsIParserNode.h"
#include "nsFixedSizeAllocator.h"
#include "nsVoidArray.h"
#include "nsIParserService.h"

#define IF_HOLD(_ptr) \
 PR_BEGIN_MACRO       \
 if(_ptr) {           \
   _ptr->AddRef();    \
 }                    \
 PR_END_MACRO

// recycles _ptr
#define IF_FREE(_ptr, _allocator)                \
  PR_BEGIN_MACRO                                 \
  if(_ptr) {                                     \
    _ptr->Release((_allocator)->GetArenaPool()); \
    _ptr=0;                                      \
  }                                              \
  PR_END_MACRO

// release objects and destroy _ptr
#define IF_DELETE(_ptr, _allocator) \
  PR_BEGIN_MACRO                    \
  if(_ptr) {                        \
    _ptr->ReleaseAll(_allocator);   \
    delete(_ptr);                   \
    _ptr=0;                         \
  }                                 \
  PR_END_MACRO

class nsIParserNode;
class nsCParserNode;
class nsNodeAllocator;


#ifdef DEBUG
void DebugDumpContainmentRules(nsIDTD& theDTD,const char* aFilename,const char* aTitle);
void DebugDumpContainmentRules2(nsIDTD& theDTD,const char* aFilename,const char* aTitle);
#endif
PRUint32 AccumulateCRC(PRUint32 crc_accum, char *data_blk_ptr, int data_blk_size);


/***************************************************************
  First, define the tagstack class
 ***************************************************************/

class nsEntryStack;  //forware declare to make compilers happy.

struct nsTagEntry {
  eHTMLTags       mTag;  //for speedier access to tag id
  nsCParserNode*  mNode;
  nsEntryStack*   mParent;
  nsEntryStack*   mStyles;
};

class nsEntryStack {

public:
                  nsEntryStack();
                  ~nsEntryStack();

  void            EnsureCapacityFor(PRInt32 aNewMax, PRInt32 aShiftOffset=0);
  void            Push(const nsCParserNode* aNode,nsEntryStack* aStyleStack=0);
  void            PushFront(const nsCParserNode* aNode,nsEntryStack* aStyleStack=0);
  void            Append(nsEntryStack *theStack);
  nsCParserNode*  Pop(void);
  nsCParserNode*  Remove(PRInt32 anIndex,eHTMLTags aTag);
  nsCParserNode*  NodeAt(PRInt32 anIndex) const;
  eHTMLTags       First() const;
  eHTMLTags       TagAt(PRInt32 anIndex) const;
  nsTagEntry*     EntryAt(PRInt32 anIndex) const;
  eHTMLTags       operator[](PRInt32 anIndex) const;
  eHTMLTags       Last() const;
  void            Empty(void);

  /*
   * Release all objects in the entry stack
   */
  void ReleaseAll(nsNodeAllocator* aNodeAllocator);

  /**
   * Find the first instance of given tag on the stack.
   * @update	gess 12/14/99
   * @param   aTag
   * @return  index of tag, or kNotFound if not found
   */
  inline PRInt32 FirstOf(eHTMLTags aTag) const {
    PRInt32 index=-1;

    if(0<mCount) {
      while(++index<mCount) {
        if(aTag==mEntries[index].mTag) {
          return index;
        }
      } //while
    }
    return kNotFound;
  }


  /**
   * Find the last instance of given tag on the stack.
   * @update	gess 12/14/99
   * @param   aTag
   * @return  index of tag, or kNotFound if not found
   */
  inline PRInt32 LastOf(eHTMLTags aTag) const {
    PRInt32 index=mCount;
    while(--index>=0) {
        if(aTag==mEntries[index].mTag) {
          return index;
        }
    }
    return kNotFound;
  }

  nsTagEntry* mEntries;
  PRInt32    mCount;
  PRInt32    mCapacity;
};


/**********************************************************
  The table state class is used to store info about each
  table that is opened on the stack. As tables open and
  close on the context, we update these objects to track
  what has/hasn't been seen on a per table basis.
 **********************************************************/
class CTableState {
public:
  CTableState(CTableState *aPreviousState=0) {
    mHasCaption=PR_FALSE;
    mHasCols=PR_FALSE;
    mHasTHead=PR_FALSE;
    mHasTFoot=PR_FALSE;
    mHasTBody=PR_FALSE;
    mPrevious=aPreviousState;
  }

  PRBool  CanOpenCaption() {
    PRBool result=!(mHasCaption || mHasCols || mHasTHead || mHasTFoot || mHasTBody);
    return result;
  }

  PRBool  CanOpenCols() {
    PRBool result=!(mHasCols || mHasTHead || mHasTFoot || mHasTBody);
    return result;
  }

  PRBool  CanOpenTBody() {
    PRBool result=!(mHasTBody);
    return result;
  }

  PRBool  CanOpenTHead() {
    PRBool result=!(mHasTHead || mHasTFoot || mHasTBody);
    return result;
  }

  PRBool  CanOpenTFoot() {
    PRBool result=!(mHasTFoot || mHasTBody);
    return result;
  }

  PRBool  mHasCaption;
  PRBool  mHasCols;
  PRBool  mHasTHead;
  PRBool  mHasTFoot;
  PRBool  mHasTBody;
  CTableState *mPrevious;
};

#ifdef DEBUG
//used for named entities and counters (XXX debug only)
class CNamedEntity {
public:
  CNamedEntity(const nsAString& aName,const nsAString& aValue) : mName(), mValue() {
    PRUnichar theFirst=aName.First();
    PRUnichar theLast=aName.Last();
    PRInt32   theLen=aName.Length();
    if((2<theLen) && (theFirst==theLast) && (kQuote==theFirst)) {
      mName = Substring(aName, 1, theLen - 2);
    }
    else mName=aName;

    theFirst=aValue.First();
    theLast=aValue.Last();
    theLen=aValue.Length();
    if((2<theLen) && (theFirst==theLast) && (kQuote==theFirst)) {
      mValue = Substring(aValue, 1, theLen - 2);
    }
    else mValue=aValue;

  }

  nsString mName;
  nsString mValue;
  PRInt32  mOrdinal;
};
#endif
/************************************************************************
  nsTokenAllocator class implementation.
  This class is used to recycle tokens.
  By using this simple class, we cut WAY down on the number of tokens
  that get created during the run of the system.

  Note: The allocator is created per document. It's been shared
        ( but not ref. counted ) by objects, tokenizer,dtd,and dtd context,
        that cease to exist when the document is destroyed.
 ************************************************************************/
class nsTokenAllocator {
public:

                  nsTokenAllocator();
  virtual         ~nsTokenAllocator();
  virtual CToken* CreateTokenOfType(eHTMLTokenTypes aType,eHTMLTags aTag, const nsAString& aString);
  virtual CToken* CreateTokenOfType(eHTMLTokenTypes aType,eHTMLTags aTag);

  nsFixedSizeAllocator& GetArenaPool() { return mArenaPool; }

protected:
    nsFixedSizeAllocator mArenaPool;


#ifdef  NS_DEBUG
    int       mTotals[eToken_last-1];
#endif
};

/************************************************************************
  CNodeRecycler class implementation.
  This class is used to recycle nodes.
  By using this simple class, we cut down on the number of nodes
  that get created during the run of the system.
 ************************************************************************/

#ifndef HEAP_ALLOCATED_NODES
class nsCParserNode;
#endif

class nsNodeAllocator {
public:

                         nsNodeAllocator();
  virtual                ~nsNodeAllocator();
  virtual nsCParserNode* CreateNode(CToken* aToken=nsnull, nsTokenAllocator* aTokenAllocator=0);

  nsFixedSizeAllocator&  GetArenaPool() { return mNodePool; }

#ifdef HEAP_ALLOCATED_NODES
  void Recycle(nsCParserNode* aNode) { mSharedNodes.Push(NS_STATIC_CAST(void*,aNode)); }
protected:
  nsDeque mSharedNodes;
#ifdef DEBUG_TRACK_NODES
  PRInt32 mCount;
#endif
#endif

protected:
  nsFixedSizeAllocator mNodePool;
};

/************************************************************************
  The dtdcontext class defines an ordered list of tags (a context).
 ************************************************************************/

class nsDTDContext {
public:
                  nsDTDContext();
                  ~nsDTDContext();

  void            Push(const nsCParserNode* aNode,nsEntryStack* aStyleStack=0);
  nsCParserNode*  Pop(nsEntryStack*& aChildStack);
  nsCParserNode*  Pop();
  nsCParserNode*  PeekNode() { return mStack.NodeAt(mStack.mCount-1); }
  eHTMLTags       First(void) const;
  eHTMLTags       Last(void) const;
  nsTagEntry*     LastEntry(void) const;
  eHTMLTags       TagAt(PRInt32 anIndex) const;
  eHTMLTags       operator[](PRInt32 anIndex) const {return TagAt(anIndex);}
  PRBool          HasOpenContainer(eHTMLTags aTag) const;
  PRInt32         FirstOf(eHTMLTags aTag) const {return mStack.FirstOf(aTag);}
  PRInt32         LastOf(eHTMLTags aTag) const {return mStack.LastOf(aTag);}

  void            Empty(void);
  PRInt32         GetCount(void) {return mStack.mCount;}
  PRInt32         GetResidualStyleCount(void) {return mResidualStyleCount;}
  nsEntryStack*   GetStylesAt(PRInt32 anIndex) const;
  void            PushStyle(const nsCParserNode* aNode);
  void            PushStyles(nsEntryStack *theStyles);
  nsCParserNode*  PopStyle(void);
  nsCParserNode*  PopStyle(eHTMLTags aTag);
  void            RemoveStyle(eHTMLTags aTag);

  static  void    ReleaseGlobalObjects(void);

  void            SetTokenAllocator(nsTokenAllocator* aTokenAllocator) { mTokenAllocator=aTokenAllocator; }
  void            SetNodeAllocator(nsNodeAllocator* aNodeAllocator) { mNodeAllocator=aNodeAllocator; }

  nsEntryStack    mStack; //this will hold a list of tagentries...
  PRInt32         mResidualStyleCount;
  PRInt32         mContextTopIndex;

    //break this struct out seperately so that lame compilers don't gack.
    //By using these bits instead of bools, we have a bit-o-memory.
  struct CFlags {
    PRUint8  mHadBody:1;
    PRUint8  mHadFrameset:1;
    PRUint8  mHasOpenHead:1;
    PRUint8  mTransitional:1;
  };

  union {
    PRUint32  mAllBits;
    CFlags    mFlags;
  };

  nsTokenAllocator  *mTokenAllocator;
  nsNodeAllocator   *mNodeAllocator;
  CTableState       *mTableStates;
  nsDeque           mEntities;

#ifdef  NS_DEBUG
  enum { eMaxTags = 100 };
  eHTMLTags       mXTags[eMaxTags];
  PRInt32           *mCounters;

  void            ResetCounters(void);
  void            AllocateCounters(void);
  PRInt32         IncrementCounter(eHTMLTags aTag,nsIParserNode& aNode,nsString& aResult);

  CNamedEntity*   RegisterEntity(const nsAString& aName,const nsAString& aValue);
  CNamedEntity*   GetEntity(const nsAString& aName) const;
#endif
};

/**************************************************************
  Now define the token deallocator class...
 **************************************************************/
class CTokenDeallocator: public nsDequeFunctor{
protected:
  nsFixedSizeAllocator& mArenaPool;

public:
  CTokenDeallocator(nsFixedSizeAllocator& aArenaPool)
    : mArenaPool(aArenaPool) {}

  virtual void* operator()(void* anObject) {
    CToken* aToken = (CToken*)anObject;
    CToken::Destroy(aToken, mArenaPool);
    return 0;
  }
};


/************************************************************************
  ITagHandler class offers an API for taking care of specific tokens.
 ************************************************************************/
class nsITagHandler {
public:

  virtual void          SetString(const nsString &aTheString)=0;
  virtual nsString*     GetString()=0;
  virtual PRBool        HandleToken(CToken* aToken,nsIDTD* aDTD)=0;
  virtual PRBool        HandleCapturedTokens(CToken* aToken,nsIDTD* aDTD)=0;
};

/************************************************************************
  Here are a few useful utility methods...
 ************************************************************************/

/**
 * This method quickly scans the given set of tags,
 * looking for the given tag.
 * @update	gess8/27/98
 * @param   aTag -- tag to be search for in set
 * @param   aTagSet -- set of tags to be searched
 * @return
 */
inline PRInt32 IndexOfTagInSet(PRInt32 aTag,const eHTMLTags* aTagSet,PRInt32 aCount)  {

  const eHTMLTags* theEnd=aTagSet+aCount;
  const eHTMLTags* theTag=aTagSet;

  while(theTag<theEnd) {
    if(aTag==*theTag) {
      return theTag-aTagSet;
    }
    theTag++;
  }

  return kNotFound;
}

/**
 * This method quickly scans the given set of tags,
 * looking for the given tag.
 * @update	gess8/27/98
 * @param   aTag -- tag to be search for in set
 * @param   aTagSet -- set of tags to be searched
 * @return
 */
inline PRBool FindTagInSet(PRInt32 aTag,const eHTMLTags *aTagSet,PRInt32 aCount)  {
  return PRBool(-1<IndexOfTagInSet(aTag,aTagSet,aCount));
}

/**
 * Called from various DTD's to determine the type of data in the buffer...
 * @update	gess 06Jun2000
 * @param   aBuffer: contains a string with first block of html from source document
 * @param   aHasXMLFragment: tells us whether we detect XML in the buffer (based on PI)
 * @return  TRUE if we find HTML
 */

// This really doesn't need to be inline!

inline PRBool BufferContainsHTML(const nsString& aBuffer,
                                 PRBool& aHasXMLFragment)
{
  PRBool result=PR_FALSE;

  aHasXMLFragment=PRBool(-1!=aBuffer.Find("<?XML",PR_TRUE,100));

  PRInt32 theDocTypePos=aBuffer.Find("DOCTYPE",PR_TRUE,0,200);
  if(-1!=theDocTypePos) {
    PRInt32 theHTMLPos=aBuffer.Find("HTML",PR_TRUE,theDocTypePos+8,200);
    if(-1==theHTMLPos) {
      theHTMLPos=aBuffer.Find("ISO/IEC 15445",PR_TRUE,theDocTypePos+8,200);
      if(-1==theHTMLPos) {
        theHTMLPos=aBuffer.Find("HYPERTEXT MARKUP",PR_TRUE,theDocTypePos+8,200);
      }
    }

    result=PRBool(-1!=theHTMLPos);
  }
  else {
    //worst case scenario: let's look for a few HTML tags...
    PRInt32 theCount = 0;
    PRInt32 theTagCount = 0;

    nsAString::const_iterator iter, end;
    aBuffer.BeginReading(iter);
    aBuffer.EndReading(end);

    if (Distance(iter, end) > 200) {
      end = iter;
      end.advance(200);
    }

    for(theCount = 0; theCount < 5; ++theCount) {
      if (!FindCharInReadable('<', iter, end)) {
        break;
      }

      // we found what may be a start tag...

      ++iter; // step over the '<' character

      nsAString::const_iterator tag_end(iter);

      aBuffer.EndReading(end);

      while (tag_end != end) {
        const PRUnichar c = *tag_end;

        if (c == ' ' || c == '>' || c == '"') {
          break;
        }

        ++tag_end;
      }

      nsHTMLTag theTag = nsHTMLTags::LookupTag(Substring(iter, tag_end));

      if (theTag != eHTMLTag_userdefined) {
        ++theTagCount;
      }

      iter = tag_end;
    }

    // Claim HTML if we find at least 2 real html tags...
    result = (2 <= theTagCount);
  }

  return result;
}


/******************************************************************************
  This little structure is used to compute CRC32 values for our debug validator
 ******************************************************************************/

struct CRCStruct {
  CRCStruct(eHTMLTags aTag,PRInt32 anOp) {mTag=aTag; mOperation=anOp;}
  eHTMLTags mTag;
  PRInt32   mOperation; //usually open or close
};

/**************************************************************
  This defines the topic object used by the observer service.
  The observerService uses a list of these, 1 per topic when
  registering tags.
 **************************************************************/

class nsObserverEntry : public nsIObserverEntry {
public:
  NS_DECL_ISUPPORTS
            nsObserverEntry(const nsAString& aTopic);
  virtual   ~nsObserverEntry();

  NS_IMETHOD Notify(nsIParserNode* aNode,
                    nsIParser* aParser,
                    nsISupports* aWebShell,
                    const PRUint32 aFlags);

  nsresult   AddObserver(nsIElementObserver* aObserver,eHTMLTags aTag);
  void       RemoveObserver(nsIElementObserver* aObserver);
  PRBool     Matches(const nsAString& aTopic);

protected:
  nsAutoString mTopic; // This will rarely be empty, so make it an auto string
  nsVoidArray* mObservers[NS_HTML_TAG_MAX + 1];
  friend class nsMatchesTopic;
};

/*********************************************************************************************/


struct TagList {
  PRUint32    mCount;
  eHTMLTags   mTags[10];
};

/**
 * Find the last member of given taglist on the given context
 * @update	gess 12/14/99
 * @param   aContext
 * @param   aTagList
 * @return  index of tag, or kNotFound if not found
 */
inline PRInt32 LastOf(nsDTDContext& aContext,TagList& aTagList){
  int max = aContext.GetCount();
  int index;
  for(index=max-1;index>=0;index--){
    PRBool result=FindTagInSet(aContext[index],aTagList.mTags,aTagList.mCount);
    if(result) {
      return index;
    }
  }
  return kNotFound;
}

/**
 * Find the first member of given taglist on the given context
 * @update	gess 12/14/99
 * @param   aContext
 * @param   aStartOffset
 * @param   aTagList
 * @return  index of tag, or kNotFound if not found
 */
inline PRInt32 FirstOf(nsDTDContext& aContext,PRInt32 aStartOffset,TagList& aTagList){
  int max = aContext.GetCount();
  int index;
  for(index=aStartOffset;index<max;index++){
    PRBool result=FindTagInSet(aContext[index],aTagList.mTags,aTagList.mCount);
    if(result) {
      return index;
    }
  }
  return kNotFound;
}


/**
 * Call this to find out whether the DTD thinks the tag requires an END tag </xxx>
 * @update	gess 01/04/99
 * @param   id of tag
 * @return  TRUE of the element's end tag is optional
 */
inline PRBool HasOptionalEndTag(eHTMLTags aTag) {
  static eHTMLTags gHasOptionalEndTags[]={eHTMLTag_body,eHTMLTag_colgroup,eHTMLTag_dd,eHTMLTag_dt,
                                                    eHTMLTag_head,eHTMLTag_li,eHTMLTag_option,
                                                    eHTMLTag_p,eHTMLTag_tbody,eHTMLTag_td,eHTMLTag_tfoot,
                                                    eHTMLTag_th,eHTMLTag_thead,eHTMLTag_tr,
                                                    eHTMLTag_userdefined,eHTMLTag_unknown};
  return FindTagInSet(aTag,gHasOptionalEndTags,sizeof(gHasOptionalEndTags)/sizeof(eHTMLTag_body));
}

static void
InPlaceConvertLineEndings( nsAString& aString )
{
    // go from '\r\n' or '\r' to '\n'
  nsAString::iterator iter;
  aString.BeginWriting(iter);

  PRUnichar* S = iter.get();
  size_t N = iter.size_forward();

    // this fragment must be the entire string because
    //  (a) no multi-fragment string is writable, so only an illegal cast could give us one, and
    //  (b) else we would have to do more work (watching for |to| to fall off the end)
  NS_ASSERTION(aString.Length() == N, "You cheated... multi-fragment strings are never writable!");

    // we scan/convert in two phases (but only one pass over the string)
    // until we have to skip a character, we only need to touch end-of-line chars
    // after that, we'll have to start moving every character we want to keep

    // use array indexing instead of pointers, because compilers optimize that better


    // this first loop just converts line endings... no characters get moved
  size_t i = 0;
  PRBool just_saw_cr = PR_FALSE;
  for ( ; i < N; ++i )
    {
        // if it's something we need to convert...
      if ( S[i] == '\r' )
        {
          S[i] = '\n';
          just_saw_cr = PR_TRUE;
        }
      else
        {
            // else, if it's something we need to skip...
            //   i.e., a '\n' immediately following a '\r',
            //   then we need to start moving any character we want to keep
            //   and we have a second loop for that, so get out of this one
          if ( S[i] == '\n' && just_saw_cr )
            break;

          just_saw_cr = PR_FALSE;
        }
    }


    // this second loop handles the rest of the buffer, moving characters down
    //  _and_ converting line-endings as it goes
    //  start the loop at |from = i| so that that |just_saw_cr| gets cleared automatically
  size_t to = i;
  for ( size_t from = i; from < N; ++from )
    {
        // if it's something we need to convert...
      if ( S[from] == '\r' )
        {
          S[to++] = '\n';
          just_saw_cr = PR_TRUE;
        }
      else
        {
            // else, if it's something we need to copy...
            //  i.e., NOT a '\n' immediately following a '\r'
          if ( S[from] != '\n' || !just_saw_cr )
            S[to++] = S[from];

          just_saw_cr = PR_FALSE;
        }
    }

    // if we chopped characters out of the string, we need to shorten it logically
  if ( to < N )
    aString.SetLength(to);
}
#endif