/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL. You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All Rights
 * Reserved.
 */

/**
 * MODULE NOTES:
 * @update gess 4/1/98
 *
 */

#include "nsHTMLTokenizer.h"
#include "nsParser.h"
#include "nsScanner.h"
#include "nsDTDUtils.h"
#include "nsElementTable.h"

/************************************************************************
  And now for the main class -- nsHTMLTokenizer...
 ************************************************************************/

static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
static NS_DEFINE_IID(kITokenizerIID, NS_ITOKENIZER_IID);
static NS_DEFINE_IID(kClassIID, NS_HTMLTOKENIZER_IID);

static CTokenDeallocator gTokenKiller;   // handed to the token deque in the constructor
static CTokenRecycler gTokenRecycler;    // returned by GetTokenRecycler()
static nsAutoString gEmpty;              // empty string passed when creating attribute tokens

/**
 * This method gets called as part of our COM-like interfaces.
 * It hands out a pointer to this object for each interface id it
 * recognizes (nsISupports, nsITokenizer, and the concrete class id)
 * and adds a reference on success.
 *
 * @update gess 4/8/98
 * @param aIID id of the interface being requested
 * @param aInstancePtr out arg; receives the interface ptr, or 0 if
 *        the id is not recognized
 * @return NS_OK on success, NS_ERROR_NULL_POINTER if aInstancePtr is
 *         null, NS_NOINTERFACE if the id is not recognized
 */
nsresult nsHTMLTokenizer::QueryInterface(const nsIID& aIID, void** aInstancePtr)
{
  if (NULL == aInstancePtr) {
    return NS_ERROR_NULL_POINTER;
  }

  if(aIID.Equals(kISupportsIID)) {          //do IUnknown...
    //Go through nsITokenizer (our base, see the constructor) rather than
    //casting to the unrelated nsIDTD type.
    *aInstancePtr = (nsISupports*)(nsITokenizer*)(this);
  }
  else if(aIID.Equals(kITokenizerIID)) {    //do nsITokenizer base class...
    *aInstancePtr = (nsITokenizer*)(this);
  }
  else if(aIID.Equals(kClassIID)) {         //do this class...
    *aInstancePtr = (nsHTMLTokenizer*)(this);
  }
  else {
    *aInstancePtr=0;
    return NS_NOINTERFACE;
  }
  NS_ADDREF_THIS();
  return NS_OK;
}

/**
 * Factory function: allocates a new nsHTMLTokenizer and returns it
 * through the out arg by querying it for its own class id (which
 * also adds the initial reference).
 *
 * @update gess 4/8/98
 * @param aInstancePtrResult out arg; receives the newly created tokenizer
 * @return NS_OK, NS_ERROR_NULL_POINTER if the out arg is null, or
 *         NS_ERROR_OUT_OF_MEMORY if allocation fails
 */
NS_HTMLPARS nsresult NS_NewHTMLTokenizer(nsIDTD** aInstancePtrResult)
{
  //Reject a null out arg before allocating; otherwise QueryInterface would
  //fail after the object was created and the object would be leaked.
  if (0 == aInstancePtrResult) {
    return NS_ERROR_NULL_POINTER;
  }

  nsHTMLTokenizer* it = new nsHTMLTokenizer();

  if (it == 0) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  return it->QueryInterface(kClassIID, (void **) aInstancePtrResult);
}

NS_IMPL_ADDREF(nsHTMLTokenizer)
NS_IMPL_RELEASE(nsHTMLTokenizer)

/**
 * Default constructor.
 * Builds the token deque with gTokenKiller, initializes the reference
 * count and turns XML empty-tag handling off.
 *
 * @update gess 4/9/98
 */
nsHTMLTokenizer::nsHTMLTokenizer() : nsITokenizer(), mTokenDeque(gTokenKiller) {
  NS_INIT_REFCNT();
  mDoXMLEmptyTags=PR_FALSE;
}

/**
 * Destructor. Does no explicit work of its own.
 *
 * @update gess 4/9/98
 */
nsHTMLTokenizer::~nsHTMLTokenizer(){
}

/*******************************************************************
  Here begins the real working methods for the tokenizer.
 *******************************************************************/

/**
 * Helper that queues a freshly consumed token.
 * If aToken is non-null and aResult is a success code the token is
 * pushed onto the back of aDeque; if aResult is a failure code the
 * token is deleted and the caller's pointer is set to 0.
 * A null aToken is ignored.
 *
 * @param aToken the token to queue (zeroed if it gets deleted)
 * @param aResult result of consuming the token
 * @param aDeque deque that receives the token on success
 */
void AddToken(CToken*& aToken,nsresult aResult,nsDeque& aDeque) {
  if(aToken) {
    if(NS_SUCCEEDED(aResult)) {
      aDeque.Push(aToken);
    }
    else {
      delete aToken;
      aToken=0;
    }
  }
}

/**
 * Retrieve a ptr to the global token recycler...
 * @update gess8/4/98
 * @return ptr to the file-level recycler object
 */
nsITokenRecycler* nsHTMLTokenizer::GetTokenRecycler(void) {
  return &gTokenRecycler;
}

/**
 * This method provides access to the topmost token in the tokenDeque.
 * The token is not really removed from the list.
 * @update gess8/2/98
 * @return ptr to token
 */
CToken* nsHTMLTokenizer::PeekToken() {
  return (CToken*)mTokenDeque.Peek();
}

/**
 * This method provides access to the topmost token in the tokenDeque.
* The token is really removed from the list; if the list is empty we return 0. * @update gess8/2/98 * @return ptr to token or NULL */ CToken* nsHTMLTokenizer::PopToken() { return (CToken*)mTokenDeque.PopFront(); } /** * * @update gess8/2/98 * @param * @return */ CToken* nsHTMLTokenizer::PushTokenFront(CToken* theToken) { mTokenDeque.PushFront(theToken); return theToken; } /** * * @update gess8/2/98 * @param * @return */ CToken* nsHTMLTokenizer::PushToken(CToken* theToken) { mTokenDeque.Push(theToken); return theToken; } /** * * @update gess12/29/98 * @param * @return */ PRInt32 nsHTMLTokenizer::GetCount(void) { return mTokenDeque.GetSize(); } CToken* nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex){ return (CToken*)mTokenDeque.ObjectAt(anIndex); } /** * This method repeatedly called by the tokenizer. * Each time, we determine the kind of token were about to * read, and then we call the appropriate method to handle * that token type. * * @update gess 3/25/98 * @param aChar: last char read * @param aScanner: see nsScanner.h * @param anErrorCode: arg that will hold error condition * @return new token or null */ nsresult nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner) { nsresult result=NS_OK; if(NS_OK==result){ PRUnichar theChar; result=aScanner.GetChar(theChar); CToken* theToken=0; switch(result) { case kEOF: //We convert from eof to complete here, because we never really tried to get data. //All we did was try to see if data was available, which it wasn't. //It's important to return process complete, so that controlling logic can know that //everything went well, but we're done with token processing. 
break; case NS_OK: default: switch(theChar) { case kLessThan: result=ConsumeTag(theChar,theToken,aScanner); break; case kAmpersand: result=ConsumeEntity(theChar,theToken,aScanner); break; case kCR: case kLF: result=ConsumeNewline(theChar,theToken,aScanner); break; case kNotFound: break; default: if(!nsString::IsSpace(theChar)) { nsAutoString temp(theChar); result=ConsumeText(temp,theToken,aScanner); break; } result=ConsumeWhitespace(theChar,theToken,aScanner); break; } //switch break; } //switch } //if return result; } /** * This method is called just after a "<" has been consumed * and we know we're at the start of some kind of tagged * element. We don't know yet if it's a tag or a comment. * * @update gess 5/12/98 * @param aChar is the last char read * @param aScanner is represents our input source * @param aToken is the out arg holding our new token * @return error code. */ nsresult nsHTMLTokenizer::ConsumeTag(PRUnichar aChar,CToken*& aToken,nsScanner& aScanner) { nsresult result=aScanner.GetChar(aChar); if(NS_OK==result) { switch(aChar) { case kForwardSlash: PRUnichar ch; result=aScanner.Peek(ch); if(NS_OK==result) { if(nsString::IsAlpha(ch)) { result=ConsumeEndTag(aChar,aToken,aScanner); } else result=ConsumeComment(aChar,aToken,aScanner); }//if break; case kExclamation: result=ConsumeComment(aChar,aToken,aScanner); break; case kQuestionMark: //it must be an XML processing instruction... result=ConsumeProcessingInstruction(aChar,aToken,aScanner); break; default: if(nsString::IsAlpha(aChar)) result=ConsumeStartTag(aChar,aToken,aScanner); else if(kEOF!=aChar) { nsAutoString temp("<"); result=ConsumeText(temp,aToken,aScanner); } } //switch } //if return result; } /** * This method is called just after we've consumed a start * tag, and we now have to consume its attributes. 
* * @update gess 3/25/98 * @param aChar: last char read * @param aScanner: see nsScanner.h * @return */ nsresult nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar,CStartToken* aToken,nsScanner& aScanner) { PRBool done=PR_FALSE; nsresult result=NS_OK; PRInt16 theAttrCount=0; CTokenRecycler* theRecycler=(CTokenRecycler*)GetTokenRecycler(); while((!done) && (result==NS_OK)) { CToken* theToken= (CAttributeToken*)theRecycler->CreateTokenOfType(eToken_attribute,eHTMLTag_unknown,gEmpty); if(theToken){ result=theToken->Consume(aChar,aScanner); //tell new token to finish consuming text... //Much as I hate to do this, here's some special case code. //This handles the case of empty-tags in XML. Our last //attribute token will come through with a text value of "" //and a textkey of "/". We should destroy it, and tell the //start token it was empty. if(NS_SUCCEEDED(result)) { nsString& key=((CAttributeToken*)theToken)->GetKey(); nsString& text=theToken->GetStringValueXXX(); if((mDoXMLEmptyTags) && (key[0]==kForwardSlash) && (0==text.Length())){ //tada! our special case! Treat it like an empty start tag... aToken->SetEmpty(PR_TRUE); theRecycler->RecycleToken(theToken); } else { theAttrCount++; AddToken(theToken,result,mTokenDeque); } } else if(NS_ERROR_HTMLPARSER_BADATTRIBUTE==result){ aToken->SetEmpty(PR_TRUE); theRecycler->RecycleToken(theToken); result=NS_OK; } }//if if(NS_SUCCEEDED(result)){ result=aScanner.SkipWhitespace(); if(NS_SUCCEEDED(result)) { result=aScanner.Peek(aChar); if(NS_SUCCEEDED(result) && (aChar==kGreaterThan)) { //you just ate the '>' aScanner.GetChar(aChar); //skip the '>' done=PR_TRUE; }//if } }//if }//while aToken->SetAttributeCount(theAttrCount); return result; } /** * This is a special case method. It's job is to consume * all of the given tag up to an including the end tag. 
* * @param aChar: last char read * @param aScanner: see nsScanner.h * @param anErrorCode: arg that will hold error condition * @return new token or null */ nsresult nsHTMLTokenizer::ConsumeContentToEndTag(PRUnichar aChar, eHTMLTags aChildTag, nsScanner& aScanner, CToken*& aToken){ //In the case that we just read the given tag, we should go and //consume all the input until we find a matching end tag. nsAutoString endTag(""); CTokenRecycler* theRecycler=(CTokenRecycler*)GetTokenRecycler(); aToken=theRecycler->CreateTokenOfType(eToken_skippedcontent,aChildTag,endTag); return aToken->Consume(aChar,aScanner); //tell new token to finish consuming text... } /** * * @update gess12/28/98 * @param * @return */ nsresult nsHTMLTokenizer::HandleSkippedContent(nsScanner& aScanner,CToken*& aToken) { nsresult result=NS_OK; eHTMLTags theTag=(eHTMLTags)aToken->GetTypeID(); if(eHTMLTag_unknown!=gHTMLElements[theTag].mSkipTarget) { //Do special case handling for