539 lines
17 KiB
C++
539 lines
17 KiB
C++
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/*
|
|
* The contents of this file are subject to the Netscape Public License
|
|
* Version 1.0 (the "NPL"); you may not use this file except in
|
|
* compliance with the NPL. You may obtain a copy of the NPL at
|
|
* http://www.mozilla.org/NPL/
|
|
*
|
|
* Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
* for the specific language governing rights and limitations under the
|
|
* NPL.
|
|
*
|
|
* The Initial Developer of this code under the NPL is Netscape
|
|
* Communications Corporation. Portions created by Netscape are
|
|
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
* Reserved.
|
|
*/
|
|
|
|
/**
|
|
* MODULE NOTES:
|
|
* @update gess 4/1/98
|
|
*
|
|
* This class does two primary jobs:
|
|
* 1) It iterates the tokens provided during the
|
|
* tokenization process, identifying where elements
|
|
* begin and end (doing validation and normalization).
|
|
* 2) It controls and coordinates with an instance of
|
|
* the IContentSink interface, to coordinate the
|
|
* production of the content model.
|
|
*
|
|
* The basic operation of this class assumes that an HTML
|
|
* document is non-normalized. Therefore, we don't process
|
|
* the document in a normalized way. Don't bother to look
|
|
* for methods like: doHead() or doBody().
|
|
*
|
|
* Instead, in order to be backward compatible, we must
|
|
* scan the set of tokens and perform this basic set of
|
|
* operations:
|
|
* 1) Determine the token type (easy, since the tokens know)
|
|
* 2) Determine the appropriate section of the HTML document
|
|
* each token belongs in (HTML,HEAD,BODY,FRAMESET).
|
|
* 3) Insert content into our document (via the sink) into
|
|
* the correct section.
|
|
* 4) In the case of tags that belong in the BODY, we must
|
|
* ensure that our underlying document state reflects
|
|
* the appropriate context for our tag.
|
|
*
|
|
* For example, if we see a <TR>, we must ensure our
|
|
* document contains a table into which the row can
|
|
* be placed. This may result in "implicit containers"
|
|
* created to ensure a well-formed document.
|
|
*
|
|
*/
|
|
|
|
#ifndef NS_HTMLPARSER__
|
|
#define NS_HTMLPARSER__
|
|
|
|
#include "nsIParser.h"
|
|
#include "nsDeque.h"
|
|
#include "nsHTMLTokens.h"
|
|
#include "nsParserNode.h"
|
|
#include "nsTokenHandler.h"
|
|
#include "nsParserTypes.h"
|
|
#include "nsIURL.h"
|
|
#include "nsIStreamListener.h"
|
|
#include "nsITokenizerDelegate.h"
|
|
|
|
|
|
// Presumably the interface ID (IID) for the HTML parser. The value is a
// brace initializer: one 32-bit value, two 16-bit values, then eight bytes.
#define NS_IHTML_PARSER_IID \
|
|
{0x2ce606b0, 0xbee6, 0x11d1, \
|
|
{0xaa, 0xd9, 0x00, 0x80, 0x5f, 0x8a, 0x3e, 0x14}}
|
|
|
|
|
|
// Forward declarations. Where these types are used by the declarations in
// this header, they appear only as pointers.
// NOTE(review): nsIURL is forward-declared here although "nsIURL.h" is also
// included by this file, so this declaration looks redundant -- confirm.
// NOTE(review): IContentSink is not referenced by any declaration in this
// header (the class uses nsIContentSink and nsIHTMLContentSink) -- confirm
// whether it is still needed.
class CTokenizer;
|
|
class IContentSink;
|
|
class nsIHTMLContentSink;
|
|
class nsIURL;
|
|
class nsIDTD;
|
|
|
|
|
|
// nsHTMLParser: parser class deriving from nsIParser and nsIStreamListener.
// It declares the public Parse()/ResumeParse() entry points, the
// Handle*Token() methods, the net-lib stream callbacks, and the protected
// Open*/Close* cover methods that mimic the content sink.
class nsHTMLParser : public nsIParser, public nsIStreamListener {
|
|
|
|
public:
|
|
friend class CTokenHandler;
|
|
|
|
NS_DECL_ISUPPORTS
|
|
|
|
|
|
/**
|
|
* default constructor
|
|
* @update gess5/11/98
|
|
*/
|
|
nsHTMLParser();
|
|
|
|
|
|
/**
|
|
* Destructor
* NOTE(review): declared non-virtual although this class has virtual methods
* and protected members -- confirm nothing deletes a subclass through an
* nsHTMLParser pointer.
|
|
* @update gess5/11/98
|
|
*/
|
|
~nsHTMLParser();
|
|
|
|
/**
|
|
* Select given content sink into parser for parser output
|
|
* @update gess5/11/98
|
|
* @param aSink is the new sink to be used by parser
|
|
* @return old sink, or NULL
|
|
*/
|
|
virtual nsIContentSink* SetContentSink(nsIContentSink* aSink);
|
|
|
|
/**
|
|
* Cause parser to parse input from given URL
|
|
* @update gess5/11/98
|
|
* @param aURL is a descriptor for source document
|
|
* @param aIncremental presumably selects incremental parsing; defaults to PR_FALSE (TODO confirm against implementation)
|
|
* @return PRInt32 result; originally documented as TRUE if all went well -- FALSE otherwise (TODO confirm)
|
|
*/
|
|
virtual PRInt32 Parse(nsIURL* aURL,PRBool aIncremental=PR_FALSE);
|
|
|
|
/**
|
|
* Cause parser to parse input from given file
|
|
* @update gess5/11/98
|
|
* @param aFilename is a path for file document
|
|
* @param aIncremental presumably selects incremental parsing; no default on this overload (TODO confirm against implementation)
|
|
* @return PRInt32 result; originally documented as TRUE if all went well -- FALSE otherwise (TODO confirm)
|
|
*/
|
|
virtual PRInt32 Parse(const char* aFilename,PRBool aIncremental);
|
|
|
|
/**
* Cause parser to parse input from the given HTML string
|
|
* @update gess5/11/98
|
|
* @param anHTMLString contains a string-full of real HTML
|
|
* @param appendTokens tells us whether we should insert tokens inline, or append them.
|
|
* @return PRInt32 result; originally documented as TRUE if all went well -- FALSE otherwise (TODO confirm)
|
|
*/
|
|
virtual PRInt32 Parse(nsString& anHTMLString,PRBool appendTokens);
|
|
|
|
/**
|
|
* This method gets called (automatically) during incremental parsing
|
|
* @update gess5/11/98
* @param anIteration is not described in this header (TODO confirm meaning against implementation)
|
|
* @return TRUE if all went well, otherwise FALSE (declared as PRInt32)
|
|
*/
|
|
virtual PRInt32 ResumeParse(PRInt32 anIteration);
|
|
|
|
/**
|
|
* Retrieve ptr to internal context vector stack
|
|
* @update gess5/11/98
|
|
* @param aStackPtr is the stack ptr to be set (NOTE(review): it is a PRInt32* passed by value -- confirm how the caller receives the stack)
|
|
* @return count of items on stack (may be 0)
|
|
*/
|
|
virtual PRInt32 GetStack(PRInt32* aStackPtr);
|
|
|
|
/**
|
|
* Ask parser if a given container is open ANYWHERE on stack
|
|
* @update gess5/11/98
|
|
* @param aContainer is the id of the container you want to test for
|
|
* @return TRUE if the given container type is open -- otherwise FALSE
|
|
*/
|
|
virtual PRBool HasOpenContainer(PRInt32 aContainer) const;
|
|
|
|
|
|
/**
|
|
* This method gets called when a start token has been consumed and needs
|
|
* to be handled (possibly added to content model via sink).
|
|
* @update gess5/11/98
|
|
* @param aToken is the start token to be handled
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleStartToken(CToken* aToken);
|
|
|
|
/**
|
|
* This method gets called when a start token has been consumed, and
|
|
* we want to use default start token handling behavior.
|
|
* This method gets called automatically by HandleStartToken.
|
|
*
|
|
* @update gess5/11/98
|
|
* @param aToken is the start token to be handled
|
|
* @param aChildTag is the tag-type of given token
|
|
* @param aNode is a node to be updated with info from given token
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleDefaultStartToken(CToken* aToken,eHTMLTags aChildTag,nsCParserNode& aNode);
|
|
|
|
/**
|
|
* This method gets called when an end token has been consumed and needs
|
|
* to be handled (possibly added to content model via sink).
|
|
* @update gess5/11/98
|
|
* @param aToken is the end token to be handled
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleEndToken(CToken* aToken);
|
|
|
|
/**
|
|
* This method gets called when an entity token has been consumed and needs
|
|
* to be handled (possibly added to content model via sink).
|
|
* @update gess5/11/98
|
|
* @param aToken is the entity token to be handled
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleEntityToken(CToken* aToken);
|
|
|
|
/**
|
|
* This method gets called when a comment token has been consumed and needs
|
|
* to be handled (possibly added to content model via sink).
|
|
* @update gess5/11/98
|
|
* @param aToken is the comment token to be handled
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleCommentToken(CToken* aToken);
|
|
|
|
/**
|
|
* This method gets called when a skipped-content token has been consumed and needs
|
|
* to be handled (possibly added to content model via sink).
|
|
* @update gess5/11/98
|
|
* @param aToken is the skipped-content token to be handled
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleSkippedContentToken(CToken* aToken);
|
|
|
|
/**
|
|
* This method gets called when an attribute token has been consumed and needs
|
|
* to be handled (possibly added to content model via sink).
|
|
* @update gess5/11/98
|
|
* @param aToken is the attribute token to be handled
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleAttributeToken(CToken* aToken);
|
|
|
|
/**
|
|
* This method gets called when a script token has been consumed and needs
|
|
* to be handled (possibly added to content model via sink).
|
|
* @update gess5/11/98
|
|
* @param aToken is the script token to be handled
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleScriptToken(CToken* aToken);
|
|
|
|
/**
|
|
* This method gets called when a style token has been consumed and needs
|
|
* to be handled (possibly added to content model via sink).
|
|
* @update gess5/11/98
|
|
* @param aToken is the style token to be handled
|
|
* @return TRUE if the token was handled.
|
|
*/
|
|
PRBool HandleStyleToken(CToken* aToken);
|
|
|
|
//*********************************************
|
|
// These methods are callback methods used by
|
|
// net lib to let us know about our input stream.
|
|
//*********************************************
|
|
NS_IMETHOD GetBindInfo(void);
|
|
NS_IMETHOD OnProgress(PRInt32 Progress, PRInt32 ProgressMax, const char *msg);
|
|
NS_IMETHOD OnStartBinding(void);
|
|
NS_IMETHOD OnDataAvailable(nsIInputStream *pIStream, PRInt32 length);
|
|
NS_IMETHOD OnStopBinding(PRInt32 status, const char *msg);
|
|
|
|
protected:
|
|
|
|
/**
|
|
* This method gets called when the tokens have been consumed, and it's time
|
|
* to build the model via the content sink.
|
|
* @update gess5/11/98
|
|
* @return TRUE if model building went well -- FALSE otherwise.
|
|
*/
|
|
PRBool IterateTokens();
|
|
|
|
/**
|
|
* Retrieves the tag type for the element on the context vector stack at given pos.
|
|
* @update gess5/11/98
|
|
* @param aPos is the position of the item you want to retrieve
|
|
* @return tag type (may be unknown)
|
|
*/
|
|
eHTMLTags NodeAt(PRInt32 aPos) const;
|
|
|
|
/**
|
|
* Retrieve the tag type of the topmost item on context vector stack
|
|
* @update gess5/11/98
|
|
* @return tag type (may be unknown)
|
|
*/
|
|
eHTMLTags GetTopNode() const;
|
|
|
|
/**
|
|
* Retrieve the number of items on context vector stack
|
|
* @update gess5/11/98
|
|
* @return count of items on stack -- may be 0
|
|
*/
|
|
PRInt32 GetStackPos() const;
|
|
|
|
/**
|
|
* Causes the parser to scan forward, collecting nearby (sequential)
|
|
* attribute tokens into the given node.
|
|
* @update gess5/11/98
|
|
* @param aNode is the node to store attributes
|
|
* @return number of attributes added to node.
|
|
*/
|
|
PRInt32 CollectAttributes(nsCParserNode& aNode);
|
|
|
|
/**
|
|
* Causes the next skipped-content token (if any) to
|
|
* be consumed by this node.
|
|
* @update gess5/11/98
|
|
* @param aNode is the node to consume skipped-content
|
|
* @return number of skipped-content tokens consumed.
|
|
*/
|
|
PRInt32 CollectSkippedContent(nsCParserNode& aNode);
|
|
|
|
/**
|
|
* Causes token handlers to be registered for this parser.
|
|
* DO NOT CALL THIS! IT'S DEPRECATED!
|
|
* @update gess5/11/98
|
|
*/
|
|
void InitializeDefaultTokenHandlers();
|
|
|
|
/**
|
|
* DEPRECATED (overload taking a string key; lookup semantics not visible in this header)
|
|
* @update gess5/11/98
|
|
*/
|
|
CTokenHandler* GetTokenHandler(const nsString& aString) const;
|
|
|
|
/**
|
|
* DEPRECATED (overload taking a token type; lookup semantics not visible in this header)
|
|
* @update gess5/11/98
|
|
*/
|
|
CTokenHandler* GetTokenHandler(eHTMLTokenTypes aType) const;
|
|
|
|
/**
|
|
* DEPRECATED
|
|
* @update gess5/11/98
|
|
*/
|
|
CTokenHandler* AddTokenHandler(CTokenHandler* aHandler);
|
|
|
|
/**
|
|
* DEPRECATED
|
|
* @update gess5/11/98
|
|
*/
|
|
nsHTMLParser& DeleteTokenHandlers(void);
|
|
|
|
//*************************************************
|
|
//these cover methods mimic the sink, and are used
|
|
//by the parser to manage its context-stack.
|
|
//*************************************************
|
|
|
|
/**
|
|
* This cover method opens the given node as a HTML item in
|
|
* content sink.
|
|
* @update gess5/11/98
|
|
* @param aNode is the HTML (node) to be opened in content sink.
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool OpenHTML(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* Counterpart of OpenHTML; presumably closes the HTML item in the content sink (TODO confirm against implementation)
|
|
* @update gess5/11/98
|
|
* @param aNode is presumably the HTML node to be closed (TODO confirm)
|
|
* @return presumably TRUE if all went well, like the sibling cover methods (TODO confirm)
|
|
*/
|
|
PRBool CloseHTML(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method opens the given node as a head item in
|
|
* content sink.
|
|
* @update gess5/11/98
|
|
* @param aNode is the HEAD (node) to be opened in content sink.
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool OpenHead(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method causes the content-sink head to be closed
|
|
* @update gess5/11/98
|
|
* @param aNode is the node to be closed in sink (usually ignored)
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool CloseHead(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method opens the given node as a body item in
|
|
* content sink.
|
|
* @update gess5/11/98
|
|
* @param aNode is the BODY (node) to be opened in content sink.
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool OpenBody(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method causes the content-sink body to be closed
|
|
* @update gess5/11/98
|
|
* @param aNode is the body node to be closed in sink (usually ignored)
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool CloseBody(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method opens the given node as a form item in
|
|
* content sink.
|
|
* @update gess5/11/98
|
|
* @param aNode is the FORM (node) to be opened in content sink.
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool OpenForm(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method causes the content-sink form to be closed
|
|
* @update gess5/11/98
|
|
* @param aNode is the form node to be closed in sink (usually ignored)
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool CloseForm(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method opens the given node as a frameset item in
|
|
* content sink.
|
|
* @update gess5/11/98
|
|
* @param aNode is the FRAMESET (node) to be opened in content sink.
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool OpenFrameset(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method causes the content-sink frameset to be closed
|
|
* @update gess5/11/98
|
|
* @param aNode is the frameset node to be closed in sink (usually ignored)
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool CloseFrameset(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method opens the given node as a generic container in
|
|
* content sink.
|
|
* @update gess5/11/98
|
|
* @param aNode is the generic container (node) to be opened in content sink.
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool OpenContainer(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method causes a generic container in the content-sink to be closed
|
|
* @update gess5/11/98
|
|
* @param aNode is the node to be closed in sink (usually ignored)
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool CloseContainer(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* This cover method causes the topmost container to be closed in sink
|
|
* @update gess5/11/98
|
|
* @return TRUE if all went well.
|
|
*/
|
|
PRBool CloseTopmostContainer();
|
|
|
|
/**
|
|
* Cause all containers down to topmost given tag to be closed
|
|
* @update gess5/11/98
|
|
* @param aTag is the tag at which auto-closure should stop (inclusive)
|
|
* @return TRUE if all went well -- otherwise FALSE
|
|
*/
|
|
PRBool CloseContainersTo(eHTMLTags aTag);
|
|
|
|
/**
|
|
* Cause all containers down to given position to be closed
|
|
* @update gess5/11/98
|
|
* @param anIndex is the stack pos at which auto-closure should stop (inclusive)
|
|
* @return TRUE if all went well -- otherwise FALSE
|
|
*/
|
|
PRBool CloseContainersTo(PRInt32 anIndex);
|
|
|
|
/**
|
|
* Causes leaf to be added to sink at current vector pos.
|
|
* @update gess5/11/98
|
|
* @param aNode is leaf node to be added.
|
|
* @return TRUE if all went well -- FALSE otherwise.
|
|
*/
|
|
PRBool AddLeaf(const nsIParserNode& aNode);
|
|
|
|
/**
|
|
* Determine if the given tag is open in the context vector stack.
|
|
* @update gess5/11/98
|
|
* @param aTag is the type of tag you want to test for openness
|
|
* @return TRUE if open, FALSE otherwise.
|
|
*/
|
|
PRBool IsOpen(eHTMLTags aTag) const;
|
|
|
|
/**
|
|
* Finds the topmost occurrence of given tag within context vector stack.
|
|
* @update gess5/11/98
|
|
* @param aTag is the tag to be found
|
|
* @return index of topmost tag occurrence -- may be -1 (kNotFound).
|
|
*/
|
|
PRInt32 GetTopmostIndex(eHTMLTags aTag) const;
|
|
|
|
/**
|
|
* Causes auto-closures of context vector stack in order to find a
|
|
* proper home for the given child. Propagation may also occur as
|
|
* a fall out.
|
|
* @update gess5/11/98
|
|
* @param aChildTag is the child to be added (somewhere) to context vector stack.
|
|
* @return TRUE if succeeds, otherwise FALSE
|
|
*/
|
|
PRBool ReduceContextStackFor(PRInt32 aChildTag);
|
|
|
|
/**
|
|
* Attempt forward and/or backward propagation for the given
|
|
* child within the current context vector stack.
|
|
* @update gess5/11/98
|
|
* @param aChildTag is the type of child to be propagated.
|
|
* @return TRUE if succeeds, otherwise FALSE
|
|
*/
|
|
PRBool CreateContextStackFor(PRInt32 aChildTag);
|
|
|
|
private:
|
|
PRInt32 ParseFileIncrementally(const char* aFilename); //XXX ONLY FOR DEBUG PURPOSES...
|
|
|
|
protected:
|
|
//*********************************************
|
|
// And now, some data members...
|
|
//*********************************************
|
|
|
|
nsIHTMLContentSink* mSink; // presumably the sink selected via SetContentSink -- confirm
|
|
CTokenizer* mTokenizer; // tokenizer instance; ownership is not visible in this header
|
|
|
|
PRInt32 mContextStack[50]; // context stack storage, fixed at 50 entries; NOTE(review): confirm pushes are bounds-checked
|
|
PRInt32 mContextStackPos; // presumably the current stack depth (see GetStackPos) -- confirm
|
|
|
|
CTokenHandler* mTokenHandlers[100]; // token handler table, fixed at 100 entries; NOTE(review): confirm AddTokenHandler is bounds-checked
|
|
PRInt32 mTokenHandlerCount; // presumably the number of entries used in mTokenHandlers -- confirm
|
|
nsDequeIterator* mCurrentPos; // deque iterator; presumably the current token position -- confirm
|
|
|
|
nsIDTD* mDTD; // DTD object; its use is not visible in this header
|
|
eParseMode mParseMode; // parse mode value (eParseMode is declared elsewhere)
|
|
PRBool mHasOpenForm; // presumably tracks OpenForm/CloseForm state -- confirm
|
|
PRBool mIncremental; // presumably set from the aIncremental argument of Parse() -- confirm
|
|
ITokenizerDelegate* mDelegate; // tokenizer delegate; its role is not visible in this header
|
|
};
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|