WIP for push based tokenization

git-svn-id: svn://10.0.0.236/trunk@1683 18797224-902f-48f8-a5cc-f745e15eee43
This commit is contained in:
rickg 1998-05-14 22:19:08 +00:00
parent affe63fab7
commit 4e262a1fe0
18 changed files with 1630 additions and 590 deletions

View File

@ -81,42 +81,49 @@ nsIDTD* CNavDelegate::GetDTD(void) const{
* and we know we're at the start of some kind of tagged
* element. We don't know yet if it's a tag or a comment.
*
* @update gess 3/25/98
* @param
* @return
* @update gess 5/12/98
* @param aChar is the last char read
* @param aScanner represents our input source
* @param aToken is the out arg holding our new token
* @return error code (may return kInterrupted).
*/
PRInt32 CNavDelegate::ConsumeTag(PRUnichar aChar,CScanner& aScanner,CToken*& aToken) {
nsAutoString empty("");
PRInt32 result=aScanner.GetChar(aChar);
switch(aChar) {
case kForwardSlash:
PRUnichar ch;
result=aScanner.Peek(ch);
if(nsString::IsAlpha(ch))
aToken=new CEndToken(empty);
else aToken=new CCommentToken(empty); //Special case: </ ...> is treated as a comment
break;
case kExclamation:
aToken=new CCommentToken(empty);
break;
default:
if(nsString::IsAlpha(aChar))
return ConsumeStartTag(aChar,aScanner,aToken);
else if(kEOF!=aChar) {
nsAutoString temp("<");
return ConsumeText(temp,aScanner,aToken);
}
} //switch
if(kNoError==result) {
if(0!=aToken) {
result= aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
if(result) {
delete aToken;
aToken=0;
}
}
switch(aChar) {
case kForwardSlash:
PRUnichar ch;
result=aScanner.Peek(ch);
if(kNoError==result) {
if(nsString::IsAlpha(ch))
aToken=new CEndToken(empty);
else aToken=new CCommentToken(empty); //Special case: </ ...> is treated as a comment
}//if
break;
case kExclamation:
aToken=new CCommentToken(empty);
break;
default:
if(nsString::IsAlpha(aChar))
return ConsumeStartTag(aChar,aScanner,aToken);
else if(kEOF!=aChar) {
nsAutoString temp("<");
return ConsumeText(temp,aScanner,aToken);
}
} //switch
if((0!=aToken) && (kNoError==result)) {
result= aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
if(result) {
delete aToken;
aToken=0;
}
} //if
} //if
return result;
}
@ -131,20 +138,26 @@ PRInt32 CNavDelegate::ConsumeTag(PRUnichar aChar,CScanner& aScanner,CToken*& aTo
*/
PRInt32 CNavDelegate::ConsumeAttributes(PRUnichar aChar,CScanner& aScanner) {
PRBool done=PR_FALSE;
nsAutoString as("");
PRInt32 result=kNoError;
nsAutoString as("");
while((!done) && (result==kNoError)) {
CToken* theToken= new CAttributeToken(as);
if(theToken){
result= theToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
CToken* theToken= new CAttributeToken(as);
if(theToken){
result=theToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
if(kNoError==result){
mTokenDeque.Push(theToken);
}
aScanner.Peek(aChar);
if(aChar==kGreaterThan) { //you just ate the '>'
aScanner.GetChar(aChar); //skip the '>'
done=PR_TRUE;
}
}
}//if
}//if
if(kNoError==result){
result=aScanner.Peek(aChar);
if(aChar==kGreaterThan) { //you just ate the '>'
aScanner.GetChar(aChar); //skip the '>'
done=PR_TRUE;
}//if
}//if
}//while
return result;
}
@ -166,8 +179,7 @@ PRInt32 CNavDelegate::ConsumeContentToEndTag(const nsString& aString,PRUnichar a
endTag.Append(aString);
endTag.Append(">");
aToken=new CSkippedContentToken(endTag);
PRInt32 result= aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
return result;
return aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
}
/**
@ -183,38 +195,43 @@ PRInt32 CNavDelegate::ConsumeContentToEndTag(const nsString& aString,PRUnichar a
PRInt32 CNavDelegate::ConsumeStartTag(PRUnichar aChar,CScanner& aScanner,CToken*& aToken) {
aToken=new CStartToken(nsAutoString(""));
PRInt32 result=kNoError;
if(aToken) {
result= aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
if(((CStartToken*)aToken)->IsAttributed()) {
result=ConsumeAttributes(aChar,aScanner);
}
//now that that's over with, we have one more problem to solve.
//In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
//consume all the content itself.
nsString& str=aToken->GetText();
CToken* skippedToken=0;
if(str.EqualsIgnoreCase("SCRIPT") ||
str.EqualsIgnoreCase("STYLE") ||
str.EqualsIgnoreCase("TITLE") ||
str.EqualsIgnoreCase("TEXTAREA")) {
result=ConsumeContentToEndTag(str,aChar,aScanner,skippedToken);
if(skippedToken){
//now we strip the ending sequence from our new SkippedContent token...
PRInt32 slen=str.Length()+3;
nsString& skippedText=skippedToken->GetText();
skippedText.Cut(skippedText.Length()-slen,slen);
mTokenDeque.Push(skippedToken);
if(kNoError==result) {
if(((CStartToken*)aToken)->IsAttributed()) {
result=ConsumeAttributes(aChar,aScanner);
}
//now that that's over with, we have one more problem to solve.
//In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
//consume all the content itself.
if(kNoError==result) {
nsString& str=aToken->GetText();
CToken* skippedToken=0;
if(str.EqualsIgnoreCase("SCRIPT") ||
str.EqualsIgnoreCase("STYLE") ||
str.EqualsIgnoreCase("TITLE") ||
str.EqualsIgnoreCase("TEXTAREA")) {
result=ConsumeContentToEndTag(str,aChar,aScanner,skippedToken);
//In the case that we just read a given tag, we should go and
//consume all the tag content itself (and throw it all away).
if((kNoError==result) && skippedToken){
//now we strip the ending sequence from our new SkippedContent token...
PRInt32 slen=str.Length()+3;
nsString& skippedText=skippedToken->GetText();
skippedText.Cut(skippedText.Length()-slen,slen);
mTokenDeque.Push(skippedToken);
//In the case that we just read a given tag, we should go and
//consume all the tag content itself (and throw it all away).
CEndToken* endtoken=new CEndToken(str);
mTokenDeque.Push(endtoken);
CEndToken* endtoken=new CEndToken(str);
mTokenDeque.Push(endtoken);
} //if
} //if
} //if
} //if
}
} //if
return result;
}
@ -231,19 +248,22 @@ PRInt32 CNavDelegate::ConsumeStartTag(PRUnichar aChar,CScanner& aScanner,CToken*
PRInt32 CNavDelegate::ConsumeEntity(PRUnichar aChar,CScanner& aScanner,CToken*& aToken) {
PRUnichar ch;
PRInt32 result=aScanner.GetChar(ch);
if(nsString::IsAlpha(ch)) { //handle common enity references &xxx; or &#000.
aToken = new CEntityToken(nsAutoString(""));
result = aToken->Consume(ch,aScanner); //tell new token to finish consuming text...
}
else if(kHashsign==ch) {
aToken = new CEntityToken(nsAutoString(""));
result=aToken->Consume(0,aScanner);
}
else {
//oops, we're actually looking at plain text...
nsAutoString temp("&");
result=ConsumeText(temp,aScanner,aToken);
}
if(kNoError==result) {
if(nsString::IsAlpha(ch)) { //handle common enity references &xxx; or &#000.
aToken = new CEntityToken(nsAutoString(""));
result = aToken->Consume(ch,aScanner); //tell new token to finish consuming text...
}
else if(kHashsign==ch) {
aToken = new CEntityToken(nsAutoString(""));
result=aToken->Consume(0,aScanner);
}
else {
//oops, we're actually looking at plain text...
nsAutoString temp("&");
result=ConsumeText(temp,aScanner,aToken);
}
}//if
return result;
}
@ -336,36 +356,54 @@ PRInt32 CNavDelegate::ConsumeNewline(PRUnichar aChar,CScanner& aScanner,CToken*&
* @return new token or null
*/
PRInt32 CNavDelegate::GetToken(CScanner& aScanner,CToken*& aToken){
PRInt32 result=kNoError;
PRUnichar aChar;
aToken=0;
if(mTokenDeque.GetSize()>0) {
aToken=(CToken*)mTokenDeque.Pop();
return result;
return kNoError;
}
aToken=0;
while(!aScanner.Eof()) {
PRInt32 result=kNoError;
if(kNoError==result){
PRUnichar aChar;
result=aScanner.GetChar(aChar);
switch(aChar) {
case kAmpersand:
return ConsumeEntity(aChar,aScanner,aToken);
case kLessThan:
return ConsumeTag(aChar,aScanner,aToken);
case kCR: case kLF:
return ConsumeNewline(aChar,aScanner,aToken);
case kNotFound:
switch(result) {
case kEOF:
break;
case kInterrupted:
aScanner.RewindToMark();
break;
case kNoError:
default:
if(!nsString::IsSpace(aChar)) {
nsAutoString temp(aChar);
return ConsumeText(temp,aScanner,aToken);
}
else return ConsumeWhitespace(aChar,aScanner,aToken);
break;
switch(aChar) {
case kLessThan:
return ConsumeTag(aChar,aScanner,aToken);
case kAmpersand:
return ConsumeEntity(aChar,aScanner,aToken);
case kCR: case kLF:
return ConsumeNewline(aChar,aScanner,aToken);
case kNotFound:
break;
default:
if(!nsString::IsSpace(aChar)) {
nsAutoString temp(aChar);
return ConsumeText(temp,aScanner,aToken);
}
else return ConsumeWhitespace(aChar,aScanner,aToken);
break;
} //switch
break;
} //switch
if(result==kEOF)
result=0;
} //while
if(kNoError==result)
result=aScanner.Eof();
} //while
return result;
}

View File

@ -31,6 +31,7 @@
#include "CNavDTD.h"
#include "prenv.h" //this is here for debug reasons...
#include "plstr.h"
#include <fstream.h>
#ifdef XP_PC
#include <direct.h> //this is here for debug reasons...
#endif
@ -40,11 +41,13 @@ static NS_DEFINE_IID(kClassIID, NS_IHTML_PARSER_IID);
static NS_DEFINE_IID(kIParserIID, NS_IPARSER_IID);
static const char* kNullURL = "Error: Null URL given";
static const char* kNullFilename= "Error: Null filename given";
static const char* kNullTokenizer = "Error: Unable to construct tokenizer";
static const char* kNullToken = "Error: Null token given";
static const char* kInvalidTagStackPos = "Error: invalid tag stack position";
static char* gVerificationOutputDir=0;
static char* gVerificationOutputDir=0;
static int rickGDebug=0;
/**
* This method is defined in nsIParser. It is used to
@ -456,92 +459,198 @@ PRBool nsHTMLParser::IterateTokens() {
return result;
}
/**
* This is the main controlling routine in the parsing process.
* Note that it may get called multiple times for the same scanner,
* since this is a pushed based system, and all the tokens may
* not have been consumed by the scanner during a given invocation
* of this method.
*
* @update gess 3/25/98
* @param aFilename -- const char* containing file to be parsed.
* @return PR_TRUE if parse succeeded, PR_FALSE otherwise.
*
*
* @update gess 5/13/98
* @param
* @return
*/
PRBool nsHTMLParser::Parse(nsIURL* aURL){
eParseMode theMode=eParseMode_navigator;
eParseMode DetermineParseMode() {
const char* theModeStr= PR_GetEnv("PARSE_MODE");
const char* other="other";
eParseMode result=eParseMode_navigator;
if(theModeStr)
if(0==nsCRT::strcasecmp(other,theModeStr))
theMode=eParseMode_other;
return Parse(aURL,theMode);
result=eParseMode_other;
return result;
}
/**
* This is the main controlling routine in the parsing process.
* Note that it may get called multiple times for the same scanner,
* since this is a pushed based system, and all the tokens may
* not have been consumed by the scanner during a given invocation
* of this method.
*
* @update gess 3/25/98
* @param aFilename -- const char* containing file to be parsed.
* @return PR_TRUE if parse succeeded, PR_FALSE otherwise.
*
*
* @update gess 5/13/98
* @param
* @return
*/
PRBool nsHTMLParser::Parse(nsIURL* aURL,eParseMode aMode){
NS_PRECONDITION(0!=aURL,kNullURL);
PRBool result=PR_FALSE;
if(aURL) {
/**
 *  Creates the tokenizer delegate that corresponds to the given parse
 *  mode, and asks that delegate for its DTD.
 *
 *  @param   aMode is the parse mode used to choose the delegate class
 *  @param   aDelegate is the out arg receiving the new delegate
 *           (0 when aMode is not one of the handled modes)
 *  @param   aDTD is the out arg receiving the delegate's DTD
 *           (0 when no delegate was created)
 */
void GetDelegateAndDTD(eParseMode aMode,ITokenizerDelegate*& aDelegate,nsIDTD*& aDTD) {
  //Reset both out args up front, so that the caller never sees a stale
  //(or uninitialized) pointer when aMode falls into the default case.
  aDelegate=0;
  aDTD=0;
  switch(aMode) {
    case eParseMode_navigator:
      aDelegate=new CNavDelegate(); break;
    case eParseMode_other:
      aDelegate=new COtherDelegate(); break;
    default:
      break;
  }
  if(aDelegate)
    aDTD=aDelegate->GetDTD();
}
result=PR_TRUE;
mParseMode=aMode;
ITokenizerDelegate* theDelegate=0;
mDTD=0;
switch(mParseMode) {
case eParseMode_navigator:
theDelegate=new CNavDelegate();
if(theDelegate)
mDTD=theDelegate->GetDTD();
break;
case eParseMode_other:
theDelegate=new COtherDelegate();
if(theDelegate)
mDTD=theDelegate->GetDTD();
break;
default:
break;
}
if(!theDelegate) {
NS_ERROR(kNullTokenizer);
return PR_FALSE;
}
if(mDTD)
mDTD->SetParser(this);
mTokenizer=new CTokenizer(aURL, theDelegate, mParseMode);
/**
* This DEBUG ONLY method is used to simulate a network-based
* i/o model where data comes in incrementally.
*
* @update gess 5/13/98
* @param aFilename is the name of the disk file to use for testing.
* @return error code (kNoError means ok)
*/
PRInt32 nsHTMLParser::ParseFileIncrementally(const char* aFilename){
PRInt32 result=kBadFilename;
fstream* mFileStream;
nsString theBuffer;
PRInt32 iter=-1;
const int kBufSize=10;
mFileStream=new fstream(aFilename,ios::in|ios::binary);
if(mFileStream) {
result=kNoError;
while((kNoError==result) || (kInterrupted==result)) {
//read some data from the file...
char buf[kBufSize];
buf[kBufSize]=0;
if(mFileStream) {
mFileStream->read(buf,kBufSize);
PRInt32 numread=mFileStream->gcount();
if(numread>0) {
theBuffer.Truncate();
theBuffer.Append(buf);
mTokenizer->Append(theBuffer);
result=ResumeParse(++iter);
}
}
mSink->WillBuildModel();
#ifdef __INCREMENTAL
int iter=-1;
for(;;){
mSink->WillResume();
mTokenizer->TokenizeAvailable(++iter);
mSink->WillInterrupt();
}
#else
mTokenizer->Tokenize();
#endif
result=IterateTokens();
mSink->DidBuildModel();
mFileStream->close();
delete mFileStream;
}
return result;
}
/**
* This is the main controlling routine in the parsing process.
* Note that it may get called multiple times for the same scanner,
* since this is a pushed based system, and all the tokens may
* not have been consumed by the scanner during a given invocation
* of this method.
*
* @update gess 3/25/98
* @param aFilename -- const char* containing file to be parsed.
* @return PR_TRUE if parse succeeded, PR_FALSE otherwise.
*/
PRBool nsHTMLParser::Parse(const char* aFilename,PRBool aIncremental){
NS_PRECONDITION(0!=aFilename,kNullFilename);
PRInt32 status=kBadFilename;
mIncremental=aIncremental;
mParseMode=DetermineParseMode();
if(aFilename) {
GetDelegateAndDTD(mParseMode,mDelegate,mDTD);
if(mDelegate) {
if(mDTD)
mDTD->SetParser(this);
mSink->WillBuildModel();
//ok, time to create our tokenizer and begin the process
if(aIncremental) {
mTokenizer=new CTokenizer(mDelegate,mParseMode);
status=ParseFileIncrementally(aFilename);
}
else {
//ok, time to create our tokenizer and begin the process
mTokenizer=new CTokenizer(aFilename,mDelegate,mParseMode);
status=ResumeParse(0);
}
mSink->DidBuildModel();
}//if
}
return status;
}
/**
 *  Parses the document described by the given URL.
 *  The parse mode is chosen by DetermineParseMode(), and the matching
 *  delegate/DTD pair is created by GetDelegateAndDTD().
 *
 *  @update  gess 3/25/98
 *  @param   aURL -- descriptor for the source document.
 *  @param   aIncremental -- PR_TRUE to open the URL with this parser
 *           as the listener; PR_FALSE to let the tokenizer read from
 *           the URL itself.
 *  @return  kBadURL (or whatever the debug hook returned) if aURL is
 *           null or no delegate could be created; otherwise the code
 *           produced by aURL->Open(this) or ResumeParse(0).
 */
PRInt32 nsHTMLParser::Parse(nsIURL* aURL,PRBool aIncremental ){
  NS_PRECONDITION(0!=aURL,kNullURL);

  PRInt32 theStatus=kBadURL;

  //DEBUG hook: when rickGDebug is set we first run an incremental
  //parse of a fixed local file. Note that we still fall through and
  //parse the URL afterwards.
  if(rickGDebug)
    theStatus=Parse("c:/temp/temp.html",PR_TRUE);

  mIncremental=aIncremental;
  mParseMode=DetermineParseMode();

  if(!aURL)
    return theStatus;

  GetDelegateAndDTD(mParseMode,mDelegate,mDTD);
  if(!mDelegate)
    return theStatus;

  if(mDTD)
    mDTD->SetParser(this);
  mSink->WillBuildModel();

  //ok, time to create our tokenizer and begin the process
  if(!aIncremental) {
    mTokenizer=new CTokenizer(aURL,mDelegate,mParseMode);
    theStatus=ResumeParse(0);
    mSink->DidBuildModel();
    return theStatus;
  }

  //incremental case: DidBuildModel() is not called on this path.
  mTokenizer=new CTokenizer(mDelegate,mParseMode);
  theStatus=aURL->Open(this);
  return theStatus;
}
/**
 *  Call this method if all you want to do is parse one string full of
 *  HTML text. The string is appended to the existing tokenizer, and
 *  then ResumeParse(0) is run, bracketed by the sink's
 *  WillBuildModel()/DidBuildModel() notifications.
 *
 *  @update  gess5/11/98
 *  @param   aSourceBuffer contains a string-full of real HTML
 *  @param   appendTokens is not used by this implementation
 *  @return  the code returned by ResumeParse(0)
 */
PRInt32 nsHTMLParser::Parse(nsString& aSourceBuffer,PRBool appendTokens){
  mSink->WillBuildModel();
  mTokenizer->Append(aSourceBuffer);
  const PRInt32 theStatus=ResumeParse(0);
  mSink->DidBuildModel();
  return theStatus;
}
/**
* This routine is called to cause the parser to continue
@ -553,17 +662,21 @@ PRBool nsHTMLParser::Parse(nsIURL* aURL,eParseMode aMode){
* @param
* @return PR_TRUE if parsing concluded successfully.
*/
PRBool nsHTMLParser::ResumeParse() {
PRInt32 nsHTMLParser::ResumeParse(PRInt32 anIteration) {
PRInt32 result=kNoError;
mSink->WillResume();
int iter=0;
PRInt32 errcode=mTokenizer->TokenizeAvailable(iter);
if(kInterrupted==errcode)
mSink->WillInterrupt();
PRBool result=IterateTokens();
if(kNoError==result) {
result=mTokenizer->Tokenize(anIteration);
if(kInterrupted==result)
mSink->WillInterrupt();
if(!rickGDebug)
IterateTokens();
}
return result;
}
/**
*
* @update gess4/22/98
@ -1388,5 +1501,64 @@ PRBool nsHTMLParser::ReduceContextStackFor(PRInt32 aChildTag){
}
/**
 *  Input-stream callback (declared with the net lib callbacks in the
 *  class header). Currently a stub: it does no work.
 *
 *  @update  gess 5/12/98
 *  @return  always 0
 */
nsresult nsHTMLParser::GetBindInfo(void){
  return 0;
}
/**
 *  Input-stream callback (declared with the net lib callbacks in the
 *  class header). Currently a stub: all arguments are ignored.
 *
 *  @update  gess 5/12/98
 *  @param   Progress is ignored
 *  @param   ProgressMax is ignored
 *  @param   msg is ignored
 *  @return  always 0
 */
nsresult nsHTMLParser::OnProgress(PRInt32 Progress, PRInt32 ProgressMax, const char *msg){
  return 0;
}
/**
 *  Input-stream callback (declared with the net lib callbacks in the
 *  class header). Currently a stub: it does no work.
 *
 *  @update  gess 5/12/98
 *  @return  always 0
 */
nsresult nsHTMLParser::OnStartBinding(void){
  return 0;
}
/**
 *  Input-stream callback (declared with the net lib callbacks in the
 *  class header). Currently a stub: the stream is not read.
 *
 *  @update  gess 5/12/98
 *  @param   pIStream is ignored
 *  @param   length is ignored
 *  @return  always 0
 */
nsresult nsHTMLParser::OnDataAvailable(nsIInputStream *pIStream, PRInt32 length){
  return 0;
}
/**
 *  Input-stream callback (declared with the net lib callbacks in the
 *  class header). Currently a stub: it does no work.
 *
 *  @update  gess 5/12/98
 *  @return  always 0
 */
nsresult nsHTMLParser::OnStopBinding(void){
  return 0;
}

View File

@ -61,6 +61,9 @@
#include "nsParserNode.h"
#include "nsTokenHandler.h"
#include "nsParserTypes.h"
#include "nsIURL.h"
#include "nsIStreamListener.h"
#include "nsITokenizerDelegate.h"
#define NS_IHTML_PARSER_IID \
@ -75,7 +78,7 @@ class nsIURL;
class nsIDTD;
class nsHTMLParser : public nsIParser {
class nsHTMLParser : public nsIParser, public nsIStreamListener {
public:
friend class CTokenHandler;
@ -104,14 +107,6 @@ friend class CTokenHandler;
*/
virtual nsIContentSink* SetContentSink(nsIContentSink* aSink);
/**
* Cause parser to parse input from given URL
* @update gess5/11/98
* @param aURL is a descriptor for source document
* @return TRUE if all went well -- FALSE otherwise
*/
virtual PRBool Parse(nsIURL* aURL);
/**
* Cause parser to parse input from given URL in given mode
* @update gess5/11/98
@ -119,14 +114,31 @@ friend class CTokenHandler;
* @param aMode is the desired parser mode (Nav, other, etc.)
* @return TRUE if all went well -- FALSE otherwise
*/
virtual PRBool Parse(nsIURL* aURL,eParseMode aMode);
virtual PRInt32 Parse(nsIURL* aURL,PRBool aIncremental=PR_FALSE);
/**
* Cause parser to parse input from given file in given mode
* @update gess5/11/98
* @param aFilename is a path for file document
* @param aMode is the desired parser mode (Nav, other, etc.)
* @return TRUE if all went well -- FALSE otherwise
*/
virtual PRInt32 Parse(const char* aFilename,PRBool aIncremental);
/**
* @update gess5/11/98
* @param anHTMLString contains a string-full of real HTML
* @param appendTokens tells us whether we should insert tokens inline, or append them.
* @return TRUE if all went well -- FALSE otherwise
*/
virtual PRInt32 Parse(nsString& anHTMLString,PRBool appendTokens);
/**
* This method gets called (automatically) during incremental parsing
* @update gess5/11/98
* @return TRUE if all went well, otherwise FALSE
*/
virtual PRBool ResumeParse();
virtual PRInt32 ResumeParse(PRInt32 anIteration);
/**
* Retrieve ptr to internal context vector stack
@ -230,6 +242,15 @@ friend class CTokenHandler;
*/
PRBool HandleStyleToken(CToken* aToken);
//*********************************************
// These methods are callback methods used by
// net lib to let us know about our inputstream.
//*********************************************
NS_IMETHOD GetBindInfo(void);
NS_IMETHOD OnProgress(PRInt32 Progress, PRInt32 ProgressMax, const char *msg);
NS_IMETHOD OnStartBinding(void);
NS_IMETHOD OnDataAvailable(nsIInputStream *pIStream, PRInt32 length);
NS_IMETHOD OnStopBinding(void);
protected:
@ -485,6 +506,10 @@ protected:
*/
PRBool CreateContextStackFor(PRInt32 aChildTag);
private:
PRInt32 ParseFileIncrementally(const char* aFilename); //XXX ONLY FOR DEBUG PURPOSES...
protected:
//*********************************************
// And now, some data members...
//*********************************************
@ -502,6 +527,8 @@ protected:
nsIDTD* mDTD;
eParseMode mParseMode;
PRBool mHasOpenForm;
PRBool mIncremental;
ITokenizerDelegate* mDelegate;
};

View File

@ -40,6 +40,7 @@
class nsIContentSink;
class nsString;
/**
* This class defines the iparser interface. This XPCOM
@ -52,10 +53,14 @@ class nsIParser : public nsISupports {
public:
virtual nsIContentSink* SetContentSink(nsIContentSink* aContentSink)=0;
virtual PRBool Parse(nsIURL* aURL)=0;
virtual PRBool ResumeParse()=0;
virtual PRInt32 GetStack(PRInt32* aStackPtr)=0;
virtual PRBool HasOpenContainer(PRInt32 aContainer) const=0;
virtual PRInt32 Parse(nsIURL* aURL,PRBool aIncremental=PR_FALSE)=0;
virtual PRInt32 Parse(const char* aFilename,PRBool aIncremental)=0;
virtual PRInt32 Parse(nsString& anHTMLString,PRBool appendTokens)=0;
virtual PRInt32 ResumeParse(PRInt32 anIterator)=0;
virtual PRInt32 GetStack(PRInt32* aStackPtr)=0;
virtual PRBool HasOpenContainer(PRInt32 aContainer) const=0;
};
extern NS_HTMLPARS nsresult NS_NewHTMLParser(nsIParser** aInstancePtrResult);

View File

@ -37,13 +37,16 @@ enum eParseMode {
eParseMode_unknown=0,
eParseMode_navigator,
eParseMode_other
eParseMode_other,
eParseMode_autodetect
};
const PRInt32 kEOF = 1000000L;
const PRInt32 kBadFilename = -4;
const PRInt32 kBadURL = -3;
const PRInt32 kInterrupted = -2;
const PRInt32 kNotFound = -1;
const PRInt32 kNoError = 0;
const PRInt32 kInterrupted = 2;
const PRUint32 kNewLine = '\n';
const PRUint32 kCR = '\r';

View File

@ -22,9 +22,8 @@
#include "nsIURL.h"
#include "nsDebug.h"
const char* gURLRef;
const char* kBadHTMLText1="<HTML><BODY><H3>Oops...</H3>You just tried to read a non-existent document: <BR>";
const char* kBadHTMLText2="</BODY></HTML>";
const char* gURLRef=0;
const char* kBadHTMLText="<H3>Oops...</H3>You just tried to read a non-existent document: <BR>";
#ifdef __INCREMENTAL
const int kBufsize=1;
@ -33,31 +32,63 @@ const int kBufsize=64;
#endif
/**
* default constructor
*
* @update gess 3/25/98
* @param aURL -- pointer to URL to be loaded
* Use this constructor if you want an incremental (callback)
* based input stream.
*
* @update gess 5/12/98
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner::CScanner(eParseMode aMode) :
  mFileStream(0),         //no file stream and...
  mNetStream(0),          //...no net stream: data gets appended by the caller
  mBuffer(""),
  mOffset(0),
  mMarkPos(-1),           //nothing has been marked yet
  mTotalRead(0),
  mParseMode(aMode),
  mIncremental(PR_TRUE)
{
}
/**
 *  Use this constructor if you want i/o to be file based.
 *  The scanner is non-incremental and reads from an fstream that is
 *  opened (binary, read-only) on the given file.
 *
 *  @update  gess 5/12/98
 *  @param   aFilename is the path of the file to read (must not be null)
 *  @param   aMode represents the parser mode (nav, other)
 */
CScanner::CScanner(const char* aFilename,eParseMode aMode) :
  mFileStream(0),
  mNetStream(0),
  mBuffer(""),
  mOffset(0),
  mMarkPos(-1),           //nothing has been marked yet
  mTotalRead(0),
  mParseMode(aMode),
  mIncremental(PR_FALSE)
{
  NS_ASSERTION(0!=aFilename,"Error: Null filename!");
  mFileStream=new fstream(aFilename,ios::in|ios::binary);
}
/**
* Use this constructor if you want i/o to be based on a
* non-incremental netstream.
*
* @update gess 5/12/98
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner::CScanner(nsIURL* aURL,eParseMode aMode) : mBuffer("") {
NS_ASSERTION(0!=aURL,"Error: Null URL!");
mOffset=0;
mStream=0;
mMarkPos=-1;
mTotalRead=0;
mParseMode=aMode;
if(aURL) {
gURLRef=aURL->GetSpec();
#ifdef __INCREMENTAL
mStream=new fstream("c:/temp/temp.html",ios::in|ios::binary);
#else
int error;
mStream=aURL->Open(&error);
#endif
}
mFileStream=0;
PRInt32 error=0;
mIncremental=PR_FALSE;
mNetStream=aURL->Open(&error);
gURLRef=aURL->GetSpec();
}
/**
* default destructor
*
@ -66,19 +97,107 @@ CScanner::CScanner(nsIURL* aURL,eParseMode aMode) : mBuffer("") {
* @return
*/
CScanner::~CScanner() {
#ifdef __INCREMENTAL
mStream->close();
delete mStream;
mStream=0;
#else
if(mStream) {
mStream->Close();
mStream->Release();
mStream=0;
if(mFileStream) {
mFileStream->close();
delete mFileStream;
}
#endif
else if(mNetStream) {
mNetStream->Close();
mNetStream->Release();
}
mFileStream=0;
mNetStream=0;
gURLRef=0;
}
/**
 *  Resets current offset position of input stream to marked position.
 *  This allows us to back up to this point if the need should arise,
 *  such as when tokenization gets interrupted.
 *  NOTE: IT IS REALLY BAD FORM TO CALL REWINDTOMARK WITHOUT CALLING
 *        MARK FIRST! If no mark has been recorded (mMarkPos is
 *        negative) the offset is left where it is, rather than being
 *        moved to a negative (invalid) buffer position.
 *
 *  @update  gess 5/12/98
 *  @param
 *  @return  the (possibly rewound) current offset
 */
PRInt32 CScanner::RewindToMark(void){
  if(0<=mMarkPos)
    mOffset=mMarkPos;
  return mOffset;
}
/**
 *  Remembers the current offset in the input buffer as the mark
 *  position, so that RewindToMark() can later return to it (for
 *  example when tokenization gets interrupted).
 *
 *  @update  gess 5/12/98
 *  @param
 *  @return  the newly recorded mark position
 */
PRInt32 CScanner::Mark(void){
  return mMarkPos=mOffset;
}
/**
 *  Discards the part of the buffer that can no longer be needed,
 *  before new data gets appended to it. Everything in front of the
 *  mark position is dropped; everything from the mark onwards is kept,
 *  because the mark is the point we've guaranteed the client we can
 *  back up to. If no mark was ever set, the whole buffer is dropped.
 *  On return the mark is always 0, and the offset has been adjusted to
 *  refer to the same data in the shortened buffer.
 *
 *  @update  gess 5/12/98
 *  @param   aBuffer is the buffer to be shortened
 *  @param   anOffset is the current read offset into aBuffer (in/out)
 *  @param   aMarkPos is the mark position in aBuffer (in/out)
 */
void _PreCompressBuffer(nsString& aBuffer,PRInt32& anOffset,PRInt32& aMarkPos){
  if(0<=aMarkPos) {
    //NOTE: We must keep the data at and beyond the mark even when
    //      anOffset==aMarkPos. That's exactly the state we're in after
    //      RewindToMark(), when the data following the mark is a
    //      partial token waiting for more input.
    if(aMarkPos>0) {
      aBuffer.Cut(0,aMarkPos);
      anOffset=(anOffset>aMarkPos) ? (anOffset-aMarkPos) : 0;
    }
  }
  else {
    //nothing was ever marked, so nothing needs to be preserved...
    aBuffer.Truncate();
    anOffset=0;
  }
  aMarkPos=0;
}
/**
 *  Adds the next blob of incoming data to the scanner's buffer.
 *  This should only be called by the parser when we're doing
 *  incremental i/o over the net, which means that the scanner has
 *  neither a file stream nor a net stream of its own. If it does
 *  have one, the call is ignored.
 *
 *  @update  gess 5/12/98
 *  @param   aBuffer contains next blob of i/o data
 *  @param   aSize contains size of buffer
 *  @return  always 0
 */
PRInt32 CScanner::IncrementalAppend(const char* aBuffer,PRInt32 aSize){
  NS_ASSERTION(((!mFileStream) && (!mNetStream)),"Error: Should only be called during incremental net i/o!");

  if(mFileStream || mNetStream)
    return 0;

  //first, throw away whatever part of the buffer we're done with...
  _PreCompressBuffer(mBuffer,mOffset,mMarkPos);

  //...and now that the buffer is (possibly) shortened, add the new data.
  if(aSize>0) {
    mBuffer.Append(aBuffer,aSize);
    mTotalRead+=aSize;
  }
  return 0;
}
/**
 *  Appends the given string to the scanner's internal buffer, after
 *  first discarding the part of the buffer that is no longer needed.
 *  The appended length is added to mTotalRead, just as
 *  IncrementalAppend() does for its data.
 *
 *  @update  gess4/3/98
 *  @param   aBuffer contains the data to be added
 *  @return  always PR_TRUE
 */
PRBool CScanner::Append(nsString& aBuffer) {
  _PreCompressBuffer(mBuffer,mOffset,mMarkPos);
  mBuffer.Append(aBuffer);
  mTotalRead+=aBuffer.Length();
  return PR_TRUE;
}
/**
* Grab data from underlying stream.
@ -89,55 +208,63 @@ CScanner::~CScanner() {
PRInt32 CScanner::FillBuffer(void) {
PRInt32 anError=0;
mBuffer.Truncate();
if(!mStream) {
_PreCompressBuffer(mBuffer,mOffset,mMarkPos);
if((!mIncremental) && (!mNetStream) && (!mFileStream)) {
//This is DEBUG code!!!!!! XXX DEBUG XXX
//If you're here, it means someone tried to load a
//non-existent document. So as a favor, we emit a
//little bit of HTML explaining the error.
if(0==mTotalRead) {
mBuffer.Append((const char*)kBadHTMLText1);
mBuffer.Append((const char*)kBadHTMLText);
mBuffer.Append((const char*)gURLRef);
mBuffer.Append((const char*)kBadHTMLText2);
}
else return 0;
}
else {
else if(!mIncremental) {
PRInt32 numread=0;
char buf[kBufsize+1];
buf[kBufsize]=0;
#ifdef __INCREMENTAL
mStream->read(buf,kBufsize);
numread=mStream->gcount();
#else
numread=mStream->Read(&anError,buf,0,kBufsize);
#endif
if(mFileStream) {
mFileStream->read(buf,kBufsize);
numread=mFileStream->gcount();
}
else if(mNetStream) {
numread=mNetStream->Read(&anError,buf,0,kBufsize);
if(1==anError)
anError=kEOF;
}
mOffset=mBuffer.Length();
if((0<numread) && (0==anError))
mBuffer.Append((const char*)buf,numread);
mTotalRead+=mBuffer.Length();
}
mTotalRead+=mBuffer.Length();
else anError=kInterrupted;
return anError;
}
/**
* determine if the scanner has reached EOF
*
* @update gess 3/25/98
* @update gess 5/12/98
* @param
* @return PR_TRUE upon eof condition
* @return 0=!eof 1=eof kInterrupted=interrupted
*/
PRBool CScanner::Eof() {
PRInt32 CScanner::Eof() {
PRInt32 theError=0;
if(mOffset>=mBuffer.Length()) {
theError=FillBuffer();
mOffset=0;
if(!mIncremental)
theError=FillBuffer();
else return kInterrupted;
}
PRBool result=PR_TRUE;
if(0==theError) {
result=PRBool(0==mBuffer.Length());
}
return result;
if(0==theError)
return (0==mBuffer.Length());
return theError;
}
/**
@ -148,11 +275,12 @@ PRBool CScanner::Eof() {
* @return error code reflecting read status
*/
PRInt32 CScanner::GetChar(PRUnichar& aChar) {
if(!Eof()) {
PRInt32 result=Eof();
if(!result) {
aChar=mBuffer[mOffset++];
return kNoError;
result=kNoError;
}
return kEOF;
return result;
}
@ -165,11 +293,12 @@ PRInt32 CScanner::GetChar(PRUnichar& aChar) {
* @return
*/
PRInt32 CScanner::Peek(PRUnichar& aChar){
if(!Eof()) {
PRInt32 result=Eof();
if(!result) {
aChar=mBuffer[mOffset];
return kNoError;
result=kNoError;
}
return kEOF;
return result;
}
@ -181,7 +310,9 @@ PRInt32 CScanner::Peek(PRUnichar& aChar){
* @return error code
*/
PRInt32 CScanner::PutBack(PRUnichar aChar) {
mOffset--;
if(mOffset>0)
mOffset--;
else mBuffer.Insert(aChar,0);
return kNoError;
}
@ -301,8 +432,8 @@ PRInt32 CScanner::ReadUntil(nsString& aString,nsString& aTerminalSet,PRBool addT
PRUnichar ch=0;
PRInt32 result=kNoError;
while(!Eof()) {
result=GetChar(ch);
while(!result) {
result=GetChar(ch);
if(kNoError==result) {
PRInt32 pos=aTerminalSet.Find(ch);
if(kNotFound!=pos) {

View File

@ -43,7 +43,36 @@ class ifstream;
class CScanner {
public:
/**
* Use this constructor if you want an incremental (callback)
* based input stream.
*
* @update gess 5/12/98
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner(eParseMode aMode=eParseMode_navigator);
/**
* Use this constructor if you want i/o to be based on a
* non-incremental netstream.
*
* @update gess 5/12/98
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner(nsIURL* aURL,eParseMode aMode=eParseMode_navigator);
/**
* Use this constructor if you want i/o to be file based.
*
* @update gess 5/12/98
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner(const char* aFilename,eParseMode aMode=eParseMode_navigator);
~CScanner();
/**
@ -117,7 +146,7 @@ class CScanner {
* @update gess 3/25/98
* @return PR_TRUE upon eof condition
*/
PRBool Eof(void);
PRInt32 Eof(void);
/**
* Consume characters until you find the terminal char
@ -153,6 +182,48 @@ class CScanner {
*/
PRInt32 ReadWhile(nsString& aString,nsString& anInputSet,PRBool addTerminal);
/**
* Records current offset position in input stream. This allows us
* to back up to this point if the need should arise, such as when
* tokenization gets interrupted.
*
* @update gess 5/12/98
* @param
* @return
*/
PRInt32 Mark(void);
/**
* Resets current offset position of input stream to marked position.
* This allows us to back up to this point if the need should arise,
* such as when tokenization gets interrupted.
* NOTE: IT IS REALLY BAD FORM TO CALL REWINDTOMARK WITHOUT CALLING MARK FIRST!
*
* @update gess 5/12/98
* @param
* @return
*/
PRInt32 RewindToMark(void);
/**
*
*
* @update gess 5/13/98
* @param
* @return
*/
PRBool Append(nsString& aBuffer);
/**
*
*
* @update gess 5/12/98
* @param
* @return
*/
PRInt32 IncrementalAppend(const char* aBuffer,PRInt32 aSize);
static void SelfTest();
protected:
@ -167,15 +238,15 @@ class CScanner {
*/
PRInt32 FillBuffer(void);
#ifdef __INCREMENTAL
fstream* mStream;
#else
nsIInputStream* mStream;
#endif
fstream* mFileStream;
nsIInputStream* mNetStream;
nsString mBuffer;
PRInt32 mOffset;
PRInt32 mMarkPos;
PRInt32 mTotalRead;
eParseMode mParseMode;
PRBool mIncremental;
};
#endif

View File

@ -39,6 +39,35 @@ CTokenizer::CTokenizer(nsIURL* aURL,ITokenizerDelegate* aDelegate,eParseMode aMo
mParseMode=aMode;
}
/**
 *  Constructs a tokenizer whose scanner reads from the named file.
 *
 *  @update  gess 3/25/98
 *  @param   aFilename -- name of file to be tokenized
 *  @param   aDelegate -- ref to delegate to be used to tokenize
 *  @param   aMode -- parse mode, also handed on to the scanner
 */
CTokenizer::CTokenizer(const char* aFilename,ITokenizerDelegate* aDelegate,eParseMode aMode) :
  mTokenDeque() {
  mParseMode=aMode;
  mDelegate=aDelegate;
  mScanner=new CScanner(aFilename,mParseMode);
}
/**
 *  Constructs a tokenizer whose scanner has no input source of its
 *  own (it is built with the mode-only CScanner constructor).
 *
 *  @update  gess 3/25/98
 *  @param   aDelegate -- ref to delegate to be used to tokenize
 *  @param   aMode -- parse mode, also handed on to the scanner
 */
CTokenizer::CTokenizer(ITokenizerDelegate* aDelegate,eParseMode aMode) :
  mTokenDeque() {
  mParseMode=aMode;
  mDelegate=aDelegate;
  mScanner=new CScanner(mParseMode);
}
/**
* default destructor
@ -54,6 +83,19 @@ CTokenizer::~CTokenizer() {
}
/**
 *  Hand a buffer of source text to our scanner.
 *
 *  @update  gess 5/13/98
 *  @param   aBuffer -- text to be passed on to the scanner
 *  @return  whatever the scanner's Append() returns, or PR_FALSE
 *           when this tokenizer has no scanner
 */
PRBool CTokenizer::Append(nsString& aBuffer) {
  //without a scanner there is nowhere to put the data
  return (mScanner) ? mScanner->Append(aBuffer) : PR_FALSE;
}
/**
* Retrieve a reference to the internal token deque.
*
@ -105,31 +147,31 @@ PRBool CTokenizer::WillTokenize(PRBool aIncremental){
}
/**
* This is the primary control routine. It iteratively
* consumes tokens until an error occurs or you run out
* of data.
*
* @update gess 3/25/98
* @return error code
* @return TRUE if it's ok to proceed
*/
PRInt32 CTokenizer::Tokenize(void) {
PRInt32 CTokenizer::Tokenize(nsString& aSourceBuffer,PRBool appendTokens){
CToken* theToken=0;
PRInt32 result=kNoError;
WillTokenize(PR_TRUE);
while(kNoError==result) {
result=GetToken(theToken);
if(theToken && (kNoError==result)) {
if(WillTokenize(PR_FALSE)) {
do {
result=GetToken(theToken);
if(theToken) {
#ifdef VERBOSE_DEBUG
theToken->DebugDumpToken(cout);
#endif
if(mDelegate->WillAddToken(*theToken)) {
mTokenDeque.Push(theToken);
}
if(mDelegate->WillAddToken(*theToken)) {
mTokenDeque.Push(theToken);
}
} while(0!=theToken);
result=DidTokenize(PR_FALSE);
}
}
}
if(kEOF==result)
result=kNoError;
DidTokenize(PR_TRUE);
return result;
}
@ -141,20 +183,33 @@ PRInt32 CTokenizer::Tokenize(void) {
* @update gess 3/25/98
* @return error code
*/
PRInt32 CTokenizer::TokenizeAvailable(int anIteration) {
PRInt32 CTokenizer::Tokenize(int anIteration) {
CToken* theToken=0;
PRInt32 result=kNoError;
PRBool done=(0==anIteration) ? (!WillTokenize(PR_TRUE)) : PR_FALSE;
while((PR_FALSE==done) && (kInterrupted!=kInterrupted)) {
while((PR_FALSE==done) && (kNoError==result)) {
mScanner->Mark();
result=GetToken(theToken);
if(theToken) {
if(mDelegate->WillAddToken(*theToken)) {
mTokenDeque.Push(theToken);
if(kNoError==result) {
if(theToken) {
#ifdef VERBOSE_DEBUG
theToken->DebugDumpToken(cout);
#endif
if(mDelegate->WillAddToken(*theToken)) {
mTokenDeque.Push(theToken);
}
}
}
else {
if(theToken)
delete theToken;
mScanner->RewindToMark();
}
else done=PR_TRUE;
}
if((PR_TRUE==done) && (kInterrupted!=result))
DidTokenize(PR_TRUE);

View File

@ -50,19 +50,13 @@ class nsIURL;
class CTokenizer {
public:
CTokenizer(nsIURL* aURL,ITokenizerDelegate* aDelegate,eParseMode aMode);
CTokenizer(ITokenizerDelegate* aDelegate,eParseMode aMode=eParseMode_navigator);
CTokenizer(const char* aFilename,ITokenizerDelegate* aDelegate,eParseMode aMode=eParseMode_navigator);
CTokenizer(nsIURL* aURL,ITokenizerDelegate* aDelegate,eParseMode aMode=eParseMode_navigator);
~CTokenizer();
/**
* This control routine causes the entire stream to be
* tokenized. You probably want to call TokenizeAvailable()
* instead (for incremental tokenization).
*
* @update gess 3/25/98
* @return TRUE if it's ok to proceed
*/
PRInt32 Tokenize(void);
/**
* This method incrementally tokenizes as much content as
* it can get its hands on.
@ -70,7 +64,14 @@ class CTokenizer {
* @update gess 3/25/98
* @return TRUE if it's ok to proceed
*/
PRInt32 TokenizeAvailable(int anIteration); //your friendly incremental version
PRInt32 Tokenize(int anIteration); //your friendly incremental version
/**
*
* @update gess 3/25/98
* @return TRUE if it's ok to proceed
*/
PRInt32 Tokenize(nsString& aSourceBuffer,PRBool appendTokens=PR_TRUE);
/**
* Cause the tokenizer to consume the next token, and
@ -98,6 +99,23 @@ class CTokenizer {
*/
nsDeque& GetDeque(void);
/**
*
* @update gess 4/20/98
* @return PR_TRUE if the buffer was handed to the scanner, PR_FALSE if there is no scanner
*/
PRBool Append(nsString& aBuffer);
/**
*
*
* @update gess 5/13/98
* @param
* @return
*/
PRBool SetBuffer(nsString& aBuffer);
/**
* This debug routine is used to cause the tokenizer to
* iterate its token list, asking each token to dump its

View File

@ -81,42 +81,49 @@ nsIDTD* CNavDelegate::GetDTD(void) const{
* and we know we're at the start of some kind of tagged
* element. We don't know yet if it's a tag or a comment.
*
* @update gess 3/25/98
* @param
* @return
* @update gess 5/12/98
* @param aChar is the last char read
* @param aScanner represents our input source
* @param aToken is the out arg holding our new token
* @return error code (may return kInterrupted).
*/
PRInt32 CNavDelegate::ConsumeTag(PRUnichar aChar,CScanner& aScanner,CToken*& aToken) {
nsAutoString empty("");
PRInt32 result=aScanner.GetChar(aChar);
switch(aChar) {
case kForwardSlash:
PRUnichar ch;
result=aScanner.Peek(ch);
if(nsString::IsAlpha(ch))
aToken=new CEndToken(empty);
else aToken=new CCommentToken(empty); //Special case: </ ...> is treated as a comment
break;
case kExclamation:
aToken=new CCommentToken(empty);
break;
default:
if(nsString::IsAlpha(aChar))
return ConsumeStartTag(aChar,aScanner,aToken);
else if(kEOF!=aChar) {
nsAutoString temp("<");
return ConsumeText(temp,aScanner,aToken);
}
} //switch
if(kNoError==result) {
if(0!=aToken) {
result= aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
if(result) {
delete aToken;
aToken=0;
}
}
switch(aChar) {
case kForwardSlash:
PRUnichar ch;
result=aScanner.Peek(ch);
if(kNoError==result) {
if(nsString::IsAlpha(ch))
aToken=new CEndToken(empty);
else aToken=new CCommentToken(empty); //Special case: </ ...> is treated as a comment
}//if
break;
case kExclamation:
aToken=new CCommentToken(empty);
break;
default:
if(nsString::IsAlpha(aChar))
return ConsumeStartTag(aChar,aScanner,aToken);
else if(kEOF!=aChar) {
nsAutoString temp("<");
return ConsumeText(temp,aScanner,aToken);
}
} //switch
if((0!=aToken) && (kNoError==result)) {
result= aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
if(result) {
delete aToken;
aToken=0;
}
} //if
} //if
return result;
}
@ -131,20 +138,26 @@ PRInt32 CNavDelegate::ConsumeTag(PRUnichar aChar,CScanner& aScanner,CToken*& aTo
*/
PRInt32 CNavDelegate::ConsumeAttributes(PRUnichar aChar,CScanner& aScanner) {
PRBool done=PR_FALSE;
nsAutoString as("");
PRInt32 result=kNoError;
nsAutoString as("");
while((!done) && (result==kNoError)) {
CToken* theToken= new CAttributeToken(as);
if(theToken){
result= theToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
CToken* theToken= new CAttributeToken(as);
if(theToken){
result=theToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
if(kNoError==result){
mTokenDeque.Push(theToken);
}
aScanner.Peek(aChar);
if(aChar==kGreaterThan) { //you just ate the '>'
aScanner.GetChar(aChar); //skip the '>'
done=PR_TRUE;
}
}
}//if
}//if
if(kNoError==result){
result=aScanner.Peek(aChar);
if(aChar==kGreaterThan) { //you just ate the '>'
aScanner.GetChar(aChar); //skip the '>'
done=PR_TRUE;
}//if
}//if
}//while
return result;
}
@ -166,8 +179,7 @@ PRInt32 CNavDelegate::ConsumeContentToEndTag(const nsString& aString,PRUnichar a
endTag.Append(aString);
endTag.Append(">");
aToken=new CSkippedContentToken(endTag);
PRInt32 result= aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
return result;
return aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
}
/**
@ -183,38 +195,43 @@ PRInt32 CNavDelegate::ConsumeContentToEndTag(const nsString& aString,PRUnichar a
PRInt32 CNavDelegate::ConsumeStartTag(PRUnichar aChar,CScanner& aScanner,CToken*& aToken) {
aToken=new CStartToken(nsAutoString(""));
PRInt32 result=kNoError;
if(aToken) {
result= aToken->Consume(aChar,aScanner); //tell new token to finish consuming text...
if(((CStartToken*)aToken)->IsAttributed()) {
result=ConsumeAttributes(aChar,aScanner);
}
//now that that's over with, we have one more problem to solve.
//In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
//consume all the content itself.
nsString& str=aToken->GetText();
CToken* skippedToken=0;
if(str.EqualsIgnoreCase("SCRIPT") ||
str.EqualsIgnoreCase("STYLE") ||
str.EqualsIgnoreCase("TITLE") ||
str.EqualsIgnoreCase("TEXTAREA")) {
result=ConsumeContentToEndTag(str,aChar,aScanner,skippedToken);
if(skippedToken){
//now we strip the ending sequence from our new SkippedContent token...
PRInt32 slen=str.Length()+3;
nsString& skippedText=skippedToken->GetText();
skippedText.Cut(skippedText.Length()-slen,slen);
mTokenDeque.Push(skippedToken);
if(kNoError==result) {
if(((CStartToken*)aToken)->IsAttributed()) {
result=ConsumeAttributes(aChar,aScanner);
}
//now that that's over with, we have one more problem to solve.
//In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
//consume all the content itself.
if(kNoError==result) {
nsString& str=aToken->GetText();
CToken* skippedToken=0;
if(str.EqualsIgnoreCase("SCRIPT") ||
str.EqualsIgnoreCase("STYLE") ||
str.EqualsIgnoreCase("TITLE") ||
str.EqualsIgnoreCase("TEXTAREA")) {
result=ConsumeContentToEndTag(str,aChar,aScanner,skippedToken);
//In the case that we just read a given tag, we should go and
//consume all the tag content itself (and throw it all away).
if((kNoError==result) && skippedToken){
//now we strip the ending sequence from our new SkippedContent token...
PRInt32 slen=str.Length()+3;
nsString& skippedText=skippedToken->GetText();
skippedText.Cut(skippedText.Length()-slen,slen);
mTokenDeque.Push(skippedToken);
//In the case that we just read a given tag, we should go and
//consume all the tag content itself (and throw it all away).
CEndToken* endtoken=new CEndToken(str);
mTokenDeque.Push(endtoken);
CEndToken* endtoken=new CEndToken(str);
mTokenDeque.Push(endtoken);
} //if
} //if
} //if
} //if
}
} //if
return result;
}
@ -231,19 +248,22 @@ PRInt32 CNavDelegate::ConsumeStartTag(PRUnichar aChar,CScanner& aScanner,CToken*
PRInt32 CNavDelegate::ConsumeEntity(PRUnichar aChar,CScanner& aScanner,CToken*& aToken) {
PRUnichar ch;
PRInt32 result=aScanner.GetChar(ch);
if(nsString::IsAlpha(ch)) { //handle common enity references &xxx; or &#000.
aToken = new CEntityToken(nsAutoString(""));
result = aToken->Consume(ch,aScanner); //tell new token to finish consuming text...
}
else if(kHashsign==ch) {
aToken = new CEntityToken(nsAutoString(""));
result=aToken->Consume(0,aScanner);
}
else {
//oops, we're actually looking at plain text...
nsAutoString temp("&");
result=ConsumeText(temp,aScanner,aToken);
}
if(kNoError==result) {
if(nsString::IsAlpha(ch)) { //handle common enity references &xxx; or &#000.
aToken = new CEntityToken(nsAutoString(""));
result = aToken->Consume(ch,aScanner); //tell new token to finish consuming text...
}
else if(kHashsign==ch) {
aToken = new CEntityToken(nsAutoString(""));
result=aToken->Consume(0,aScanner);
}
else {
//oops, we're actually looking at plain text...
nsAutoString temp("&");
result=ConsumeText(temp,aScanner,aToken);
}
}//if
return result;
}
@ -336,36 +356,54 @@ PRInt32 CNavDelegate::ConsumeNewline(PRUnichar aChar,CScanner& aScanner,CToken*&
* @return new token or null
*/
PRInt32 CNavDelegate::GetToken(CScanner& aScanner,CToken*& aToken){
PRInt32 result=kNoError;
PRUnichar aChar;
aToken=0;
if(mTokenDeque.GetSize()>0) {
aToken=(CToken*)mTokenDeque.Pop();
return result;
return kNoError;
}
aToken=0;
while(!aScanner.Eof()) {
PRInt32 result=kNoError;
if(kNoError==result){
PRUnichar aChar;
result=aScanner.GetChar(aChar);
switch(aChar) {
case kAmpersand:
return ConsumeEntity(aChar,aScanner,aToken);
case kLessThan:
return ConsumeTag(aChar,aScanner,aToken);
case kCR: case kLF:
return ConsumeNewline(aChar,aScanner,aToken);
case kNotFound:
switch(result) {
case kEOF:
break;
case kInterrupted:
aScanner.RewindToMark();
break;
case kNoError:
default:
if(!nsString::IsSpace(aChar)) {
nsAutoString temp(aChar);
return ConsumeText(temp,aScanner,aToken);
}
else return ConsumeWhitespace(aChar,aScanner,aToken);
break;
switch(aChar) {
case kLessThan:
return ConsumeTag(aChar,aScanner,aToken);
case kAmpersand:
return ConsumeEntity(aChar,aScanner,aToken);
case kCR: case kLF:
return ConsumeNewline(aChar,aScanner,aToken);
case kNotFound:
break;
default:
if(!nsString::IsSpace(aChar)) {
nsAutoString temp(aChar);
return ConsumeText(temp,aScanner,aToken);
}
else return ConsumeWhitespace(aChar,aScanner,aToken);
break;
} //switch
break;
} //switch
if(result==kEOF)
result=0;
} //while
if(kNoError==result)
result=aScanner.Eof();
} //while
return result;
}

View File

@ -31,6 +31,7 @@
#include "CNavDTD.h"
#include "prenv.h" //this is here for debug reasons...
#include "plstr.h"
#include <fstream.h>
#ifdef XP_PC
#include <direct.h> //this is here for debug reasons...
#endif
@ -40,11 +41,13 @@ static NS_DEFINE_IID(kClassIID, NS_IHTML_PARSER_IID);
static NS_DEFINE_IID(kIParserIID, NS_IPARSER_IID);
static const char* kNullURL = "Error: Null URL given";
static const char* kNullFilename= "Error: Null filename given";
static const char* kNullTokenizer = "Error: Unable to construct tokenizer";
static const char* kNullToken = "Error: Null token given";
static const char* kInvalidTagStackPos = "Error: invalid tag stack position";
static char* gVerificationOutputDir=0;
static char* gVerificationOutputDir=0;
static int rickGDebug=0;
/**
* This method is defined in nsIParser. It is used to
@ -456,92 +459,198 @@ PRBool nsHTMLParser::IterateTokens() {
return result;
}
/**
* This is the main controlling routine in the parsing process.
* Note that it may get called multiple times for the same scanner,
* since this is a pushed based system, and all the tokens may
* not have been consumed by the scanner during a given invocation
* of this method.
*
* @update gess 3/25/98
* @param aFilename -- const char* containing file to be parsed.
* @return PR_TRUE if parse succeeded, PR_FALSE otherwise.
*
*
* @update gess 5/13/98
* @param
* @return
*/
PRBool nsHTMLParser::Parse(nsIURL* aURL){
eParseMode theMode=eParseMode_navigator;
eParseMode DetermineParseMode() {
const char* theModeStr= PR_GetEnv("PARSE_MODE");
const char* other="other";
eParseMode result=eParseMode_navigator;
if(theModeStr)
if(0==nsCRT::strcasecmp(other,theModeStr))
theMode=eParseMode_other;
return Parse(aURL,theMode);
result=eParseMode_other;
return result;
}
/**
* This is the main controlling routine in the parsing process.
* Note that it may get called multiple times for the same scanner,
* since this is a pushed based system, and all the tokens may
* not have been consumed by the scanner during a given invocation
* of this method.
*
* @update gess 3/25/98
* @param aFilename -- const char* containing file to be parsed.
* @return PR_TRUE if parse succeeded, PR_FALSE otherwise.
*
*
* @update gess 5/13/98
* @param
* @return
*/
PRBool nsHTMLParser::Parse(nsIURL* aURL,eParseMode aMode){
NS_PRECONDITION(0!=aURL,kNullURL);
PRBool result=PR_FALSE;
if(aURL) {
/**
 *  Create the tokenizer delegate that matches the given parse mode, and
 *  ask that delegate for its DTD.
 *
 *  @param   aMode -- parse mode used to choose the delegate class
 *  @param   aDelegate -- out arg: newly allocated delegate, or 0 when
 *           aMode is not one of the handled modes
 *  @param   aDTD -- out arg: result of aDelegate->GetDTD(), or 0 when
 *           no delegate was created
 */
void GetDelegateAndDTD(eParseMode aMode,ITokenizerDelegate*& aDelegate,nsIDTD*& aDTD) {
  //Always give both out args a defined value. Callers test them for null,
  //and must not see whatever happened to be there before the call.
  aDelegate=0;
  aDTD=0;

  switch(aMode) {
    case eParseMode_navigator:
      aDelegate=new CNavDelegate();
      break;
    case eParseMode_other:
      aDelegate=new COtherDelegate();
      break;
    default:
      break;
  }

  if(aDelegate)
    aDTD=aDelegate->GetDTD();
}
result=PR_TRUE;
mParseMode=aMode;
ITokenizerDelegate* theDelegate=0;
mDTD=0;
switch(mParseMode) {
case eParseMode_navigator:
theDelegate=new CNavDelegate();
if(theDelegate)
mDTD=theDelegate->GetDTD();
break;
case eParseMode_other:
theDelegate=new COtherDelegate();
if(theDelegate)
mDTD=theDelegate->GetDTD();
break;
default:
break;
}
if(!theDelegate) {
NS_ERROR(kNullTokenizer);
return PR_FALSE;
}
if(mDTD)
mDTD->SetParser(this);
mTokenizer=new CTokenizer(aURL, theDelegate, mParseMode);
/**
* This DEBUG ONLY method is used to simulate a network-based
* i/o model where data comes in incrementally.
*
* @update gess 5/13/98
* @param aFilename is the name of the disk file to use for testing.
* @return error code (kNoError means ok)
*/
PRInt32 nsHTMLParser::ParseFileIncrementally(const char* aFilename){
PRInt32 result=kBadFilename;
fstream* mFileStream;
nsString theBuffer;
PRInt32 iter=-1;
const int kBufSize=10;
mFileStream=new fstream(aFilename,ios::in|ios::binary);
if(mFileStream) {
result=kNoError;
while((kNoError==result) || (kInterrupted==result)) {
//read some data from the file...
char buf[kBufSize];
buf[kBufSize]=0;
if(mFileStream) {
mFileStream->read(buf,kBufSize);
PRInt32 numread=mFileStream->gcount();
if(numread>0) {
theBuffer.Truncate();
theBuffer.Append(buf);
mTokenizer->Append(theBuffer);
result=ResumeParse(++iter);
}
}
mSink->WillBuildModel();
#ifdef __INCREMENTAL
int iter=-1;
for(;;){
mSink->WillResume();
mTokenizer->TokenizeAvailable(++iter);
mSink->WillInterrupt();
}
#else
mTokenizer->Tokenize();
#endif
result=IterateTokens();
mSink->DidBuildModel();
mFileStream->close();
delete mFileStream;
}
return result;
}
/**
* This is the main controlling routine in the parsing process.
* Note that it may get called multiple times for the same scanner,
* since this is a pushed based system, and all the tokens may
* not have been consumed by the scanner during a given invocation
* of this method.
*
* @update gess 3/25/98
* @param aFilename -- const char* containing file to be parsed.
* @return PR_TRUE if parse succeeded, PR_FALSE otherwise.
*/
PRBool nsHTMLParser::Parse(const char* aFilename,PRBool aIncremental){
NS_PRECONDITION(0!=aFilename,kNullFilename);
PRInt32 status=kBadFilename;
mIncremental=aIncremental;
mParseMode=DetermineParseMode();
if(aFilename) {
GetDelegateAndDTD(mParseMode,mDelegate,mDTD);
if(mDelegate) {
if(mDTD)
mDTD->SetParser(this);
mSink->WillBuildModel();
//ok, time to create our tokenizer and begin the process
if(aIncremental) {
mTokenizer=new CTokenizer(mDelegate,mParseMode);
status=ParseFileIncrementally(aFilename);
}
else {
//ok, time to create our tokenizer and begin the process
mTokenizer=new CTokenizer(aFilename,mDelegate,mParseMode);
status=ResumeParse(0);
}
mSink->DidBuildModel();
}//if
}
return status;
}
/**
 *  Parse the document found at the given URL.
 *
 *  In incremental mode the tokenizer is created without an input source
 *  and the URL is opened with this parser as its argument; DidBuildModel()
 *  is not called from here in that case. Otherwise the tokenizer reads
 *  from the URL itself and the whole parse runs before this call returns.
 *
 *  @update  gess 3/25/98
 *  @param   aURL -- descriptor for the source document
 *  @param   aIncremental -- PR_TRUE selects the incremental path described above
 *  @return  kBadURL (or the status of the debug file parse, when rickGDebug
 *           is set) if aURL is null or no delegate could be created;
 *           otherwise the status of Open() or ResumeParse().
 */
PRInt32 nsHTMLParser::Parse(nsIURL* aURL,PRBool aIncremental ){
  NS_PRECONDITION(0!=aURL,kNullURL);

  PRInt32 theStatus=kBadURL;
  if(rickGDebug)
    theStatus=Parse("c:/temp/temp.html",PR_TRUE);  //XXX debug hook

  mIncremental=aIncremental;
  mParseMode=DetermineParseMode();

  if(!aURL)
    return theStatus;

  GetDelegateAndDTD(mParseMode,mDelegate,mDTD);
  if(!mDelegate)
    return theStatus;

  if(mDTD)
    mDTD->SetParser(this);
  mSink->WillBuildModel();

  //ok, time to create our tokenizer and begin the process
  if(aIncremental) {
    mTokenizer=new CTokenizer(mDelegate,mParseMode);
    return aURL->Open(this);
  }

  mTokenizer=new CTokenizer(aURL,mDelegate,mParseMode);
  theStatus=ResumeParse(0);
  mSink->DidBuildModel();
  return theStatus;
}
/**
 *  Call this method if all you want to do is parse one string full of HTML text.
 *  The string is appended to the existing tokenizer's input and parsed at once.
 *
 *  @update  gess5/11/98
 *  @param   aSourceBuffer contains a string-full of real HTML
 *  @param   appendTokens is not used by this implementation
 *  @return  the status returned by ResumeParse()
 */
PRInt32 nsHTMLParser::Parse(nsString& aSourceBuffer,PRBool appendTokens){
  mSink->WillBuildModel();
  mTokenizer->Append(aSourceBuffer);
  PRInt32 theStatus=ResumeParse(0);
  mSink->DidBuildModel();
  return theStatus;
}
/**
* This routine is called to cause the parser to continue
@ -553,17 +662,21 @@ PRBool nsHTMLParser::Parse(nsIURL* aURL,eParseMode aMode){
* @param
* @return PR_TRUE if parsing concluded successfully.
*/
PRBool nsHTMLParser::ResumeParse() {
PRInt32 nsHTMLParser::ResumeParse(PRInt32 anIteration) {
PRInt32 result=kNoError;
mSink->WillResume();
int iter=0;
PRInt32 errcode=mTokenizer->TokenizeAvailable(iter);
if(kInterrupted==errcode)
mSink->WillInterrupt();
PRBool result=IterateTokens();
if(kNoError==result) {
result=mTokenizer->Tokenize(anIteration);
if(kInterrupted==result)
mSink->WillInterrupt();
if(!rickGDebug)
IterateTokens();
}
return result;
}
/**
*
* @update gess4/22/98
@ -1388,5 +1501,64 @@ PRBool nsHTMLParser::ReduceContextStackFor(PRInt32 aChildTag){
}
/**
 *  Stream-listener callback. Currently a stub that does nothing.
 *
 *  @update  gess 5/12/98
 *  @return  always 0
 */
nsresult nsHTMLParser::GetBindInfo(void){
  return 0;
}
/**
 *  Stream-listener callback. Currently a stub; all arguments are ignored.
 *
 *  @update  gess 5/12/98
 *  @param   Progress -- ignored
 *  @param   ProgressMax -- ignored
 *  @param   msg -- ignored
 *  @return  always 0
 */
nsresult nsHTMLParser::OnProgress(PRInt32 Progress, PRInt32 ProgressMax, const char *msg){
  return 0;
}
/**
 *  Stream-listener callback. Currently a stub that does nothing.
 *
 *  @update  gess 5/12/98
 *  @return  always 0
 */
nsresult nsHTMLParser::OnStartBinding(void){
  return 0;
}
/**
 *  Stream-listener callback. Currently a stub: the stream is not read
 *  and nothing is handed to the tokenizer.
 *
 *  @update  gess 5/12/98
 *  @param   pIStream -- ignored
 *  @param   length -- ignored
 *  @return  always 0
 */
nsresult nsHTMLParser::OnDataAvailable(nsIInputStream *pIStream, PRInt32 length){
  return 0;
}
/**
 *  Stream-listener callback. Currently a stub that does nothing.
 *
 *  @update  gess 5/12/98
 *  @return  always 0
 */
nsresult nsHTMLParser::OnStopBinding(void){
  return 0;
}

View File

@ -61,6 +61,9 @@
#include "nsParserNode.h"
#include "nsTokenHandler.h"
#include "nsParserTypes.h"
#include "nsIURL.h"
#include "nsIStreamListener.h"
#include "nsITokenizerDelegate.h"
#define NS_IHTML_PARSER_IID \
@ -75,7 +78,7 @@ class nsIURL;
class nsIDTD;
class nsHTMLParser : public nsIParser {
class nsHTMLParser : public nsIParser, public nsIStreamListener {
public:
friend class CTokenHandler;
@ -104,14 +107,6 @@ friend class CTokenHandler;
*/
virtual nsIContentSink* SetContentSink(nsIContentSink* aSink);
/**
* Cause parser to parse input from given URL
* @update gess5/11/98
* @param aURL is a descriptor for source document
* @return TRUE if all went well -- FALSE otherwise
*/
virtual PRBool Parse(nsIURL* aURL);
/**
* Cause parser to parse input from given URL in given mode
* @update gess5/11/98
@ -119,14 +114,31 @@ friend class CTokenHandler;
* @param aMode is the desired parser mode (Nav, other, etc.)
* @return TRUE if all went well -- FALSE otherwise
*/
virtual PRBool Parse(nsIURL* aURL,eParseMode aMode);
virtual PRInt32 Parse(nsIURL* aURL,PRBool aIncremental=PR_FALSE);
/**
* Cause parser to parse input from given file in given mode
* @update gess5/11/98
* @param aFilename is a path for file document
* @param aMode is the desired parser mode (Nav, other, etc.)
* @return TRUE if all went well -- FALSE otherwise
*/
virtual PRInt32 Parse(const char* aFilename,PRBool aIncremental);
/**
* @update gess5/11/98
* @param anHTMLString contains a string-full of real HTML
* @param appendTokens tells us whether we should insert tokens inline, or append them.
* @return TRUE if all went well -- FALSE otherwise
*/
virtual PRInt32 Parse(nsString& anHTMLString,PRBool appendTokens);
/**
* This method gets called (automatically) during incremental parsing
* @update gess5/11/98
* @return TRUE if all went well, otherwise FALSE
*/
virtual PRBool ResumeParse();
virtual PRInt32 ResumeParse(PRInt32 anIteration);
/**
* Retrieve ptr to internal context vector stack
@ -230,6 +242,15 @@ friend class CTokenHandler;
*/
PRBool HandleStyleToken(CToken* aToken);
//*********************************************
// These methods are callback methods used by
// net lib to let us know about our inputstream.
//*********************************************
NS_IMETHOD GetBindInfo(void);
NS_IMETHOD OnProgress(PRInt32 Progress, PRInt32 ProgressMax, const char *msg);
NS_IMETHOD OnStartBinding(void);
NS_IMETHOD OnDataAvailable(nsIInputStream *pIStream, PRInt32 length);
NS_IMETHOD OnStopBinding(void);
protected:
@ -485,6 +506,10 @@ protected:
*/
PRBool CreateContextStackFor(PRInt32 aChildTag);
private:
PRInt32 ParseFileIncrementally(const char* aFilename); //XXX ONLY FOR DEBUG PURPOSES...
protected:
//*********************************************
// And now, some data members...
//*********************************************
@ -502,6 +527,8 @@ protected:
nsIDTD* mDTD;
eParseMode mParseMode;
PRBool mHasOpenForm;
PRBool mIncremental;
ITokenizerDelegate* mDelegate;
};

View File

@ -40,6 +40,7 @@
class nsIContentSink;
class nsString;
/**
* This class defines the iparser interface. This XPCOM
@ -52,10 +53,14 @@ class nsIParser : public nsISupports {
public:
virtual nsIContentSink* SetContentSink(nsIContentSink* aContentSink)=0;
virtual PRBool Parse(nsIURL* aURL)=0;
virtual PRBool ResumeParse()=0;
virtual PRInt32 GetStack(PRInt32* aStackPtr)=0;
virtual PRBool HasOpenContainer(PRInt32 aContainer) const=0;
virtual PRInt32 Parse(nsIURL* aURL,PRBool aIncremental=PR_FALSE)=0;
virtual PRInt32 Parse(const char* aFilename,PRBool aIncremental)=0;
virtual PRInt32 Parse(nsString& anHTMLString,PRBool appendTokens)=0;
virtual PRInt32 ResumeParse(PRInt32 anIterator)=0;
virtual PRInt32 GetStack(PRInt32* aStackPtr)=0;
virtual PRBool HasOpenContainer(PRInt32 aContainer) const=0;
};
extern NS_HTMLPARS nsresult NS_NewHTMLParser(nsIParser** aInstancePtrResult);

View File

@ -37,13 +37,16 @@ enum eParseMode {
eParseMode_unknown=0,
eParseMode_navigator,
eParseMode_other
eParseMode_other,
eParseMode_autodetect
};
const PRInt32 kEOF = 1000000L;
const PRInt32 kBadFilename = -4;
const PRInt32 kBadURL = -3;
const PRInt32 kInterrupted = -2;
const PRInt32 kNotFound = -1;
const PRInt32 kNoError = 0;
const PRInt32 kInterrupted = 2;
const PRUint32 kNewLine = '\n';
const PRUint32 kCR = '\r';

View File

@ -22,9 +22,8 @@
#include "nsIURL.h"
#include "nsDebug.h"
const char* gURLRef;
const char* kBadHTMLText1="<HTML><BODY><H3>Oops...</H3>You just tried to read a non-existent document: <BR>";
const char* kBadHTMLText2="</BODY></HTML>";
const char* gURLRef=0;
const char* kBadHTMLText="<H3>Oops...</H3>You just tried to read a non-existent document: <BR>";
#ifdef __INCREMENTAL
const int kBufsize=1;
@ -33,31 +32,63 @@ const int kBufsize=64;
#endif
/**
* default constructor
*
* @update gess 3/25/98
* @param aURL -- pointer to URL to be loaded
* Use this constructor if you want an incremental (callback)
* based input stream.
*
* @update gess 5/12/98
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner::CScanner(eParseMode aMode) : mBuffer("") {
  //no underlying stream of either kind: data must be appended by the caller
  mFileStream=0;
  mNetStream=0;
  mIncremental=PR_TRUE;
  mParseMode=aMode;

  //nothing read yet, and no mark has been set (-1)
  mTotalRead=0;
  mOffset=0;
  mMarkPos=-1;
}
/**
 *  Use this constructor if you want i/o to be file based.
 *  The file is opened for binary input right away.
 *
 *  @update  gess 5/12/98
 *  @param   aFilename is the name of the file to read; must not be null
 *  @param   aMode represents the parser mode (nav, other)
 */
CScanner::CScanner(const char* aFilename,eParseMode aMode) : mBuffer("") {
  NS_ASSERTION(0!=aFilename,"Error: Null filename!");

  //file based, hence neither a net stream nor incremental
  mNetStream=0;
  mIncremental=PR_FALSE;
  mParseMode=aMode;

  //nothing read yet, and no mark has been set (-1)
  mTotalRead=0;
  mOffset=0;
  mMarkPos=-1;

  mFileStream=new fstream(aFilename,ios::in|ios::binary);
}
/**
* Use this constructor if you want i/o to be based on a
* non-incremental netstream.
*
* @update gess 5/12/98
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner::CScanner(nsIURL* aURL,eParseMode aMode) : mBuffer("") {
NS_ASSERTION(0!=aURL,"Error: Null URL!");
mOffset=0;
mStream=0;
mMarkPos=-1;
mTotalRead=0;
mParseMode=aMode;
if(aURL) {
gURLRef=aURL->GetSpec();
#ifdef __INCREMENTAL
mStream=new fstream("c:/temp/temp.html",ios::in|ios::binary);
#else
int error;
mStream=aURL->Open(&error);
#endif
}
mFileStream=0;
PRInt32 error=0;
mIncremental=PR_FALSE;
mNetStream=aURL->Open(&error);
gURLRef=aURL->GetSpec();
}
/**
* default destructor
*
@ -66,19 +97,107 @@ CScanner::CScanner(nsIURL* aURL,eParseMode aMode) : mBuffer("") {
* @return
*/
CScanner::~CScanner() {
#ifdef __INCREMENTAL
mStream->close();
delete mStream;
mStream=0;
#else
if(mStream) {
mStream->Close();
mStream->Release();
mStream=0;
if(mFileStream) {
mFileStream->close();
delete mFileStream;
}
#endif
else if(mNetStream) {
mNetStream->Close();
mNetStream->Release();
}
mFileStream=0;
mNetStream=0;
gURLRef=0;
}
/**
 *  Resets current offset position of input stream to marked position.
 *  This allows us to back up to this point if the need should arise,
 *  such as when tokenization gets interrupted.
 *  NOTE: IT IS REALLY BAD FORM TO CALL RewindToMark() WITHOUT CALLING Mark() FIRST!
 *  If no mark has been set, the offset is left where it is.
 *
 *  @update  gess 5/12/98
 *  @return  the current offset after the (possible) rewind
 */
PRInt32 CScanner::RewindToMark(void){
  //A negative mark means Mark() has not been called (the constructors
  //start mMarkPos at -1). Rewinding to it would make mOffset negative
  //and the next read would index in front of the buffer.
  if(0<=mMarkPos)
    mOffset=mMarkPos;
  return mOffset;
}
/**
 *  Remember the current offset in the input buffer, so that a later
 *  RewindToMark() can return to it (for example when tokenization
 *  gets interrupted part way through a token).
 *
 *  @update  gess 5/12/98
 *  @return  the offset that was just recorded
 */
PRInt32 CScanner::Mark(void){
  return mMarkPos=mOffset;
}
/**
 *  Throw away the front part of the buffer that can no longer be needed,
 *  before new data gets appended to it.
 *
 *  Everything from the mark onward is kept, since that is the point we have
 *  guaranteed the client we can back up to. If no mark was ever set (negative
 *  mark), everything from the current offset onward is kept instead. The
 *  offset is adjusted so that it still refers to the same character, and the
 *  mark is reset to the start of the (shortened) buffer.
 *
 *  @update  gess 5/12/98
 *  @param   aBuffer -- buffer to be shortened in place
 *  @param   anOffset -- in/out: current read position within aBuffer
 *  @param   aMarkPos -- in/out: marked position within aBuffer; 0 on return
 */
void _PreCompressBuffer(nsString& aBuffer,PRInt32& anOffset,PRInt32& aMarkPos){
  PRInt32 theLength=aBuffer.Length();

  //first char we are obliged to keep...
  PRInt32 theKeepPos=(0<=aMarkPos) ? aMarkPos : anOffset;

  if(theKeepPos>=theLength) {
    //nothing at or beyond the keep position, so the whole buffer can go.
    aBuffer.Truncate();
    anOffset=0;
  }
  else if(theKeepPos>0) {
    //Drop the consumed prefix only. Note that offset==mark is the normal
    //state right after RewindToMark(), and the data at the mark must survive.
    aBuffer.Cut(0,theKeepPos);
    anOffset=(anOffset>theKeepPos) ? anOffset-theKeepPos : 0;
  }
  aMarkPos=0;
}
/**
 *  Add a block of raw character data to the end of the internal buffer.
 *  Only does anything when the scanner has neither a file stream nor a
 *  net stream of its own (i.e. data is being pushed in by the caller).
 *
 *  @update  gess 5/12/98
 *  @param   aBuffer contains next blob of i/o data
 *  @param   aSize contains size of buffer; nothing is appended unless it is positive
 *  @return  always 0
 */
PRInt32 CScanner::IncrementalAppend(const char* aBuffer,PRInt32 aSize){
  NS_ASSERTION(((!mFileStream) && (!mNetStream)),"Error: Should only be called during incremental net i/o!");

  if(mFileStream || mNetStream)
    return 0;   //we have our own stream; ignore pushed data

  //make room first, then tack the new data onto the end.
  _PreCompressBuffer(mBuffer,mOffset,mMarkPos);
  if(aSize>0) {
    mBuffer.Append(aBuffer,aSize);
    mTotalRead+=aSize;
  }
  return 0;
}
}
/**
* Grab data from underlying stream.
*
* @update gess4/3/98
* @return error code
*/
PRBool CScanner::Append(nsString& aBuffer) {
_PreCompressBuffer(mBuffer,mOffset,mMarkPos);
mBuffer.Append(aBuffer);
return PR_TRUE;
}
/**
* Grab data from underlying stream.
@ -89,55 +208,63 @@ CScanner::~CScanner() {
PRInt32 CScanner::FillBuffer(void) {
PRInt32 anError=0;
mBuffer.Truncate();
if(!mStream) {
_PreCompressBuffer(mBuffer,mOffset,mMarkPos);
if((!mIncremental) && (!mNetStream) && (!mFileStream)) {
//This is DEBUG code!!!!!! XXX DEBUG XXX
//If you're here, it means someone tried to load a
//non-existent document. So as a favor, we emit a
//little bit of HTML explaining the error.
if(0==mTotalRead) {
mBuffer.Append((const char*)kBadHTMLText1);
mBuffer.Append((const char*)kBadHTMLText);
mBuffer.Append((const char*)gURLRef);
mBuffer.Append((const char*)kBadHTMLText2);
}
else return 0;
}
else {
else if(!mIncremental) {
PRInt32 numread=0;
char buf[kBufsize+1];
buf[kBufsize]=0;
#ifdef __INCREMENTAL
mStream->read(buf,kBufsize);
numread=mStream->gcount();
#else
numread=mStream->Read(&anError,buf,0,kBufsize);
#endif
if(mFileStream) {
mFileStream->read(buf,kBufsize);
numread=mFileStream->gcount();
}
else if(mNetStream) {
numread=mNetStream->Read(&anError,buf,0,kBufsize);
if(1==anError)
anError=kEOF;
}
mOffset=mBuffer.Length();
if((0<numread) && (0==anError))
mBuffer.Append((const char*)buf,numread);
mTotalRead+=mBuffer.Length();
}
mTotalRead+=mBuffer.Length();
else anError=kInterrupted;
return anError;
}
/**
* determine if the scanner has reached EOF
*
* @update gess 3/25/98
* @update gess 5/12/98
* @param
* @return PR_TRUE upon eof condition
* @return 0=!eof 1=eof kInterrupted=interrupted
*/
PRBool CScanner::Eof() {
PRInt32 CScanner::Eof() {
PRInt32 theError=0;
if(mOffset>=mBuffer.Length()) {
theError=FillBuffer();
mOffset=0;
if(!mIncremental)
theError=FillBuffer();
else return kInterrupted;
}
PRBool result=PR_TRUE;
if(0==theError) {
result=PRBool(0==mBuffer.Length());
}
return result;
if(0==theError)
return (0==mBuffer.Length());
return theError;
}
/**
@ -148,11 +275,12 @@ PRBool CScanner::Eof() {
* @return error code reflecting read status
*/
PRInt32 CScanner::GetChar(PRUnichar& aChar) {
if(!Eof()) {
PRInt32 result=Eof();
if(!result) {
aChar=mBuffer[mOffset++];
return kNoError;
result=kNoError;
}
return kEOF;
return result;
}
@ -165,11 +293,12 @@ PRInt32 CScanner::GetChar(PRUnichar& aChar) {
* @return
*/
PRInt32 CScanner::Peek(PRUnichar& aChar){
if(!Eof()) {
PRInt32 result=Eof();
if(!result) {
aChar=mBuffer[mOffset];
return kNoError;
result=kNoError;
}
return kEOF;
return result;
}
@ -181,7 +310,9 @@ PRInt32 CScanner::Peek(PRUnichar& aChar){
* @return error code
*/
//Pushes one character back onto the input. Always returns kNoError.
//NOTE(review): an unconditional `mOffset--` and a guarded one both
//appear below; this looks like old and new revisions interleaved --
//confirm which lines are live.
PRInt32 CScanner::PutBack(PRUnichar aChar) {
mOffset--;
//when there is room, just step the read offset back by one...
if(mOffset>0)
mOffset--;
//...otherwise insert aChar at the front of the buffer
else mBuffer.Insert(aChar,0);
return kNoError;
}
@ -301,8 +432,8 @@ PRInt32 CScanner::ReadUntil(nsString& aString,nsString& aTerminalSet,PRBool addT
PRUnichar ch=0;
PRInt32 result=kNoError;
while(!Eof()) {
result=GetChar(ch);
while(!result) {
result=GetChar(ch);
if(kNoError==result) {
PRInt32 pos=aTerminalSet.Find(ch);
if(kNotFound!=pos) {

View File

@ -43,7 +43,36 @@ class ifstream;
class CScanner {
public:
/**
* Use this constructor if you want an incremental (callback)
* based input stream.
*
* @update gess 5/12/98
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner(eParseMode aMode=eParseMode_navigator);
/**
* Use this constructor if you want i/o to be based on a
* non-incremental netstream.
*
* @update gess 5/12/98
* @param aURL is the URL to read from (presumably via its net stream -- TODO confirm)
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner(nsIURL* aURL,eParseMode aMode=eParseMode_navigator);
/**
* Use this constructor if you want i/o to be file based.
*
* @update gess 5/12/98
* @param aFilename is the name of the file to read
* @param aMode represents the parser mode (nav, other)
* @return
*/
CScanner(const char* aFilename,eParseMode aMode=eParseMode_navigator);
~CScanner();
/**
@ -117,7 +146,7 @@ class CScanner {
* @update gess 3/25/98
* @return PR_TRUE upon eof condition
* @return (PRInt32 form) 0=!eof 1=eof kInterrupted=interrupted
*/
PRBool Eof(void);
PRInt32 Eof(void);
/**
* Consume characters until you find the terminal char
@ -153,6 +182,48 @@ class CScanner {
*/
PRInt32 ReadWhile(nsString& aString,nsString& anInputSet,PRBool addTerminal);
/**
* Records current offset position in input stream. This allows us
* to back up to this point if the need should arise, such as when
* tokenization gets interrupted.
*
* @update gess 5/12/98
* @param
* @return
*/
PRInt32 Mark(void);
/**
* Resets current offset position of input stream to marked position.
* This allows us to back up to this point if the need should arise,
* such as when tokenization gets interrupted.
* NOTE: IT IS REALLY BAD FORM TO CALL RewindToMark() WITHOUT CALLING Mark() FIRST!
*
* @update gess 5/12/98
* @param
* @return
*/
PRInt32 RewindToMark(void);
/**
* Presumably adds the contents of aBuffer to the scanner's input;
* the implementation is not visible here -- TODO confirm.
*
* @update gess 5/13/98
* @param aBuffer is the text being supplied to the scanner
* @return
*/
PRBool Append(nsString& aBuffer);
/**
* Presumably adds aSize characters from aBuffer to the scanner's
* input during incremental parsing; the implementation is not
* visible here -- TODO confirm.
*
* @update gess 5/12/98
* @param aBuffer is the raw character data being supplied
* @param aSize is presumably the number of characters in aBuffer -- TODO confirm
* @return
*/
PRInt32 IncrementalAppend(const char* aBuffer,PRInt32 aSize);
static void SelfTest();
protected:
@ -167,15 +238,15 @@ class CScanner {
*/
PRInt32 FillBuffer(void);
#ifdef __INCREMENTAL
fstream* mStream;
#else
nsIInputStream* mStream;
#endif
//file-based input; FillBuffer() reads from it when it is non-null
fstream* mFileStream;
//network input; FillBuffer() reads from it when there is no file stream
nsIInputStream* mNetStream;
//buffered input text that GetChar() and Peek() index into
nsString mBuffer;
//index into mBuffer of the next character to be read
PRInt32 mOffset;
//presumably the offset saved by Mark() for RewindToMark() -- TODO confirm
PRInt32 mMarkPos;
//running total maintained by FillBuffer()
PRInt32 mTotalRead;
eParseMode mParseMode;
//when set, Eof() returns kInterrupted instead of refilling the buffer
PRBool mIncremental;
};
#endif

View File

@ -39,6 +39,35 @@ CTokenizer::CTokenizer(nsIURL* aURL,ITokenizerDelegate* aDelegate,eParseMode aMo
mParseMode=aMode;
}
/**
 * Construct a tokenizer whose scanner does file based i/o.
 *
 * @update gess 3/25/98
 * @param aFilename -- name of file to be tokenized
 * @param aDelegate -- delegate remembered for use while tokenizing
 * @param aMode -- parse mode; stored here and passed on to the scanner
 */
CTokenizer::CTokenizer(const char* aFilename,
                       ITokenizerDelegate* aDelegate,
                       eParseMode aMode)
  : mTokenDeque()
{
  mParseMode = aMode;
  mDelegate = aDelegate;
  //the scanner is created here from the same filename and mode
  mScanner = new CScanner(aFilename, aMode);
}
/**
 * Construct a tokenizer with no URL or file; the scanner is built
 * from the parse mode alone.
 *
 * @update gess 3/25/98
 * @param aDelegate -- delegate remembered for use while tokenizing
 * @param aMode -- parse mode; stored here and passed on to the scanner
 */
CTokenizer::CTokenizer(ITokenizerDelegate* aDelegate,
                       eParseMode aMode)
  : mTokenDeque()
{
  mParseMode = aMode;
  mDelegate = aDelegate;
  //mode-only scanner: no URL or filename is supplied
  mScanner = new CScanner(aMode);
}
/**
* default destructor
@ -54,6 +83,19 @@ CTokenizer::~CTokenizer() {
}
/**
 * Forward a buffer of text to the scanner's Append().
 *
 * @update gess 5/13/98
 * @param aBuffer -- text handed through to the scanner
 * @return whatever the scanner's Append() returns, or PR_FALSE
 *         when there is no scanner
 */
PRBool CTokenizer::Append(nsString& aBuffer) {
  //without a scanner there is nowhere to put the data
  if(!mScanner) {
    return PR_FALSE;
  }
  return mScanner->Append(aBuffer);
}
/**
* Retrieve a reference to the internal token deque.
*
@ -105,31 +147,31 @@ PRBool CTokenizer::WillTokenize(PRBool aIncremental){
}
/**
* This is the primary control routine. It iteratively
* consumes tokens until an error occurs or you run out
* of data.
*
* NOTE(review): two signatures (Tokenize(void) and the nsString form),
* two loops and unbalanced braces appear in this span; it looks like the
* old and new revisions of the routine shown together -- confirm which
* lines are live.
* NOTE(review): aSourceBuffer and appendTokens are not referenced by
* any of the lines visible here.
*
* @update gess 3/25/98
* @return error code
* @return TRUE if it's ok to proceed
*/
PRInt32 CTokenizer::Tokenize(void) {
PRInt32 CTokenizer::Tokenize(nsString& aSourceBuffer,PRBool appendTokens){
CToken* theToken=0;
PRInt32 result=kNoError;
WillTokenize(PR_TRUE);
while(kNoError==result) {
result=GetToken(theToken);
if(theToken && (kNoError==result)) {
//tokenizing only proceeds when WillTokenize() agrees
if(WillTokenize(PR_FALSE)) {
do {
result=GetToken(theToken);
if(theToken) {
#ifdef VERBOSE_DEBUG
theToken->DebugDumpToken(cout);
#endif
//the delegate decides whether the token is queued
if(mDelegate->WillAddToken(*theToken)) {
mTokenDeque.Push(theToken);
}
if(mDelegate->WillAddToken(*theToken)) {
mTokenDeque.Push(theToken);
}
//the loop ends once GetToken() leaves theToken null
} while(0!=theToken);
result=DidTokenize(PR_FALSE);
}
}
}
//running out of input is not reported to the caller as an error
if(kEOF==result)
result=kNoError;
DidTokenize(PR_TRUE);
return result;
}
@ -141,20 +183,33 @@ PRInt32 CTokenizer::Tokenize(void) {
* @update gess 3/25/98
* @return error code
*/
PRInt32 CTokenizer::TokenizeAvailable(int anIteration) {
PRInt32 CTokenizer::Tokenize(int anIteration) {
CToken* theToken=0;
PRInt32 result=kNoError;
PRBool done=(0==anIteration) ? (!WillTokenize(PR_TRUE)) : PR_FALSE;
while((PR_FALSE==done) && (kInterrupted!=kInterrupted)) {
while((PR_FALSE==done) && (kNoError==result)) {
mScanner->Mark();
result=GetToken(theToken);
if(theToken) {
if(mDelegate->WillAddToken(*theToken)) {
mTokenDeque.Push(theToken);
if(kNoError==result) {
if(theToken) {
#ifdef VERBOSE_DEBUG
theToken->DebugDumpToken(cout);
#endif
if(mDelegate->WillAddToken(*theToken)) {
mTokenDeque.Push(theToken);
}
}
}
else {
if(theToken)
delete theToken;
mScanner->RewindToMark();
}
else done=PR_TRUE;
}
if((PR_TRUE==done) && (kInterrupted!=result))
DidTokenize(PR_TRUE);

View File

@ -50,19 +50,13 @@ class nsIURL;
class CTokenizer {
public:
CTokenizer(nsIURL* aURL,ITokenizerDelegate* aDelegate,eParseMode aMode);
CTokenizer(ITokenizerDelegate* aDelegate,eParseMode aMode=eParseMode_navigator);
CTokenizer(const char* aFilename,ITokenizerDelegate* aDelegate,eParseMode aMode=eParseMode_navigator);
CTokenizer(nsIURL* aURL,ITokenizerDelegate* aDelegate,eParseMode aMode=eParseMode_navigator);
~CTokenizer();
/**
* This control routine causes the entire stream to be
* tokenized. You probably want to call TokenizeAvailable()
* instead (for incremental tokenization).
*
* @update gess 3/25/98
* @return TRUE if it's ok to proceed
*/
PRInt32 Tokenize(void);
/**
* This method incrementally tokenizes as much content as
* it can get its hands on.
@ -70,7 +64,14 @@ class CTokenizer {
* @update gess 3/25/98
* @return TRUE if it's ok to proceed
*/
PRInt32 TokenizeAvailable(int anIteration); //your friendly incremental version
PRInt32 Tokenize(int anIteration); //your friendly incremental version
/**
*
* @update gess 3/25/98
* @return TRUE if it's ok to proceed
*/
PRInt32 Tokenize(nsString& aSourceBuffer,PRBool appendTokens=PR_TRUE);
/**
* Cause the tokenizer to consume the next token, and
@ -98,6 +99,23 @@ class CTokenizer {
*/
nsDeque& GetDeque(void);
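/**
* Forwards aBuffer to the scanner's Append().
*
* @update gess 4/20/98
* @param aBuffer is the text handed to the scanner
* @return the scanner's Append() result, or PR_FALSE when there is no scanner
*/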
PRBool Append(nsString& aBuffer);
/**
*
*
* @update gess 5/13/98
* @param
* @return
*/
PRBool SetBuffer(nsString& aBuffer);
/**
* This debug routine is used to cause the tokenizer to
* iterate its token list, asking each token to dump its