r=heikki,sr=waterson,a=choffmann git-svn-id: svn://10.0.0.236/trunk@92885 18797224-902f-48f8-a5cc-f745e15eee43
2654 lines
82 KiB
C++
2654 lines
82 KiB
C++
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/*
|
|
* The contents of this file are subject to the Netscape Public
|
|
* License Version 1.1 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.mozilla.org/NPL/
|
|
*
|
|
* Software distributed under the License is distributed on an "AS
|
|
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
|
* implied. See the License for the specific language governing
|
|
* rights and limitations under the License.
|
|
*
|
|
* The Original Code is mozilla.org code.
|
|
*
|
|
* The Initial Developer of the Original Code is Netscape
|
|
* Communications Corporation. Portions created by Netscape are
|
|
* Copyright (C) 1998 Netscape Communications Corporation. All
|
|
* Rights Reserved.
|
|
*
|
|
* Contributor(s):
|
|
* Pierre Phaneuf <pp@ludusdesign.com>
|
|
*/
|
|
|
|
#define DEBUG_XMLENCODING
|
|
#define XMLENCODING_PEEKBYTES 64
|
|
#define DISABLE_TRANSITIONAL_MODE
|
|
|
|
|
|
|
|
#include "nsParser.h"
|
|
#include "nsIContentSink.h"
|
|
#include "nsString.h"
|
|
#include "nsCRT.h"
|
|
#include "nsScanner.h"
|
|
#include "plstr.h"
|
|
#include "nsIParserFilter.h"
|
|
#include "nshtmlpars.h"
|
|
#include "nsWellFormedDTD.h"
|
|
#include "nsViewSourceHTML.h"
|
|
#include "nsIStringStream.h"
|
|
#include "nsIChannel.h"
|
|
#include "nsIProgressEventSink.h"
|
|
#include "nsIInputStream.h"
|
|
#include "CNavDTD.h"
|
|
#include "COtherDTD.h"
|
|
#include "prenv.h"
|
|
#include "nsParserCIID.h"
|
|
#include "nsCOMPtr.h"
|
|
//#define rickgdebug
|
|
|
|
static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
|
|
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
|
|
static NS_DEFINE_IID(kIParserIID, NS_IPARSER_IID);
|
|
static NS_DEFINE_IID(kIStreamListenerIID, NS_ISTREAMLISTENER_IID);
|
|
|
|
static NS_DEFINE_CID(kWellFormedDTDCID, NS_WELLFORMEDDTD_CID);
|
|
static NS_DEFINE_CID(kNavDTDCID, NS_CNAVDTD_CID);
|
|
static NS_DEFINE_CID(kCOtherDTDCID, NS_COTHER_DTD_CID);
|
|
static NS_DEFINE_CID(kViewSourceDTDCID, NS_VIEWSOURCE_DTD_CID);
|
|
|
|
static const char* kNullURL = "Error: Null URL given";
|
|
static const char* kOnStartNotCalled = "Error: OnStartRequest() must be called before OnDataAvailable()";
|
|
static const char* kBadListenerInit = "Error: Parser's IStreamListener API was not setup correctly in constructor.";
|
|
|
|
//-------------------------------------------------------------------
|
|
|
|
|
|
class CDTDDeallocator: public nsDequeFunctor{
|
|
public:
|
|
virtual void* operator()(void* anObject) {
|
|
nsIDTD* aDTD =(nsIDTD*)anObject;
|
|
NS_RELEASE(aDTD);
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
//-------------------------------------------------------------------
|
|
|
|
class CDTDFinder: public nsDequeFunctor{
|
|
public:
|
|
CDTDFinder(nsIDTD* aDTD) {
|
|
mTargetDTD=aDTD;
|
|
}
|
|
virtual ~CDTDFinder() {
|
|
}
|
|
virtual void* operator()(void* anObject) {
|
|
nsIDTD* theDTD=(nsIDTD*)anObject;
|
|
if(theDTD->GetMostDerivedIID().Equals(mTargetDTD->GetMostDerivedIID()))
|
|
return anObject;
|
|
return 0;
|
|
}
|
|
nsIDTD* mTargetDTD;
|
|
};
|
|
|
|
//-------------------------------------------------------------------
|
|
|
|
class CSharedParserObjects {
|
|
public:
|
|
|
|
CSharedParserObjects()
|
|
:mDTDDeque(0),
|
|
mHasViewSourceDTD(PR_FALSE),
|
|
mHasXMLDTD(PR_FALSE),
|
|
mOtherDTD(nsnull)
|
|
{
|
|
|
|
//Note: To cut down on startup time/overhead, we defer the construction of non-html DTD's.
|
|
|
|
nsIDTD* theDTD;
|
|
|
|
const char* theStrictDTDEnabled=PR_GetEnv("ENABLE_STRICT"); //always false (except rickg's machine)
|
|
|
|
if(theStrictDTDEnabled) {
|
|
NS_NewOtherHTMLDTD(&mOtherDTD); //do this as the default DTD for strict documents...
|
|
mDTDDeque.Push(mOtherDTD);
|
|
}
|
|
|
|
NS_NewNavHTMLDTD(&theDTD); //do this as a default HTML DTD...
|
|
|
|
// please handle allocation failure
|
|
NS_ASSERTION(theDTD, "Failed to create DTD");
|
|
|
|
mDTDDeque.Push(theDTD);
|
|
|
|
mHasViewSourceDTD=PR_FALSE;
|
|
mHasXMLDTD=PR_FALSE;
|
|
}
|
|
|
|
~CSharedParserObjects() {
|
|
CDTDDeallocator theDeallocator;
|
|
mDTDDeque.ForEach(theDeallocator); //release all the DTD's
|
|
}
|
|
|
|
void RegisterDTD(nsIDTD* aDTD){
|
|
if(aDTD) {
|
|
NS_ADDREF(aDTD);
|
|
CDTDFinder theFinder(aDTD);
|
|
if(!mDTDDeque.FirstThat(theFinder)) {
|
|
nsIDTD* theDTD;
|
|
aDTD->CreateNewInstance(&theDTD);
|
|
mDTDDeque.Push(theDTD);
|
|
}
|
|
NS_RELEASE(aDTD);
|
|
}
|
|
}
|
|
|
|
nsDeque mDTDDeque;
|
|
PRBool mHasViewSourceDTD; //this allows us to defer construction of this object.
|
|
PRBool mHasXMLDTD; //also defer XML dtd construction
|
|
nsIDTD *mOtherDTD; //it's ok to leak this; the deque contains a copy too.
|
|
};
|
|
|
|
static CSharedParserObjects* gSharedParserObjects=0;
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
/**********************************************************************************
|
|
This class is used as an interface between an external agent (like the DOM) and
|
|
the parser. It will contain a stack full of tagnames, which is used in our
|
|
parser/paste API's.
|
|
**********************************************************************************/
|
|
|
|
class nsTagStack : public nsITagStack {
|
|
public:
|
|
nsTagStack() : nsITagStack(), mTags(0) {
|
|
}
|
|
|
|
virtual ~nsTagStack() {
|
|
}
|
|
|
|
virtual void Push(PRUnichar* aTag){
|
|
mTags.Push(aTag);
|
|
}
|
|
|
|
virtual PRUnichar* Pop(void){
|
|
PRUnichar* result=(PRUnichar*)mTags.Pop();
|
|
return result;
|
|
}
|
|
|
|
virtual PRUnichar* TagAt(PRUint32 anIndex){
|
|
PRUnichar* result=0;
|
|
if(anIndex<(PRUint32)mTags.GetSize())
|
|
result=(PRUnichar*)mTags.ObjectAt(anIndex);
|
|
return result;
|
|
}
|
|
|
|
virtual PRUint32 GetSize(void){
|
|
return mTags.GetSize();
|
|
}
|
|
|
|
nsDeque mTags; //will hold a deque of prunichars...
|
|
};
|
|
|
|
/**
 * Returns the shared parser objects, creating them on first use.
 * The instance lives until nsParser::FreeSharedObjects() deletes it.
 */
CSharedParserObjects& GetSharedObjects() {
  if (gSharedParserObjects) {
    return *gSharedParserObjects;
  }
  gSharedParserObjects = new CSharedParserObjects();
  return *gSharedParserObjects;
}
|
|
|
|
/**
|
|
* This gets called when the htmlparser module is shutdown.
|
|
*
|
|
* @update gess 01/04/99
|
|
*/
|
|
void nsParser::FreeSharedObjects(void) {
|
|
if (gSharedParserObjects) {
|
|
delete gSharedParserObjects;
|
|
gSharedParserObjects=0;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
static PRBool gDumpContent=PR_FALSE;
|
|
|
|
/**
|
|
* default constructor
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param
|
|
* @return
|
|
*/
|
|
// Puts a new parser into its idle state: no sink, filter, observer or
// context yet, ISO-8859-1 as the charset, and normal-view as the command.
// Only the token observer comes from the caller.
nsParser::nsParser(nsITokenObserver* anObserver) {
  NS_INIT_REFCNT();

#ifdef NS_DEBUG
  //Switch content dumping on once the environment asks for it.
  if((!gDumpContent) && PR_GetEnv("PARSER_DUMP_CONTENT")) {
    gDumpContent=PR_TRUE;
  }
#endif

  //collaborating objects...
  mTokenObserver=anObserver;
  mObserver = 0;
  mParserFilter = 0;
  mProgressEventSink = nsnull;
  mSink=0;
  mBundle=nsnull;
  mParserContext=0;

  //charset...
  mCharset.AssignWithConversion("ISO-8859-1");
  mCharsetSource=kCharsetUninitialized;

  //state and switches...
  mCommand=eViewNormal;
  mInternalState=NS_OK;
  mStreamStatus=0;
  mDTDVerification=PR_FALSE;
  mObserversEnabled=PR_TRUE;
  mParserEnabled=PR_TRUE;

  MOZ_TIMER_DEBUGLOG(("Reset: Parse Time: nsParser::nsParser(), this=%p\n", this));
  MOZ_TIMER_RESET(mParseTime);
  MOZ_TIMER_RESET(mDTDTime);
  MOZ_TIMER_RESET(mTokenizeTime);
}
|
|
|
|
/**
|
|
* Default destructor
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param
|
|
* @return
|
|
*/
|
|
// Releases every object the parser holds a reference on and deletes the
// parser context. In debug builds, when content dumping is enabled, the
// sink is first asked to dump its content model.
nsParser::~nsParser() {

#ifdef NS_DEBUG
  if(gDumpContent && mSink) {
    // Sink ( HTMLContentSink at this time) supports nsIDebugDumpContent
    // interface. We can get to the content model through the sink.
    nsresult theStatus=NS_OK;
    nsCOMPtr<nsIDebugDumpContent> theDumper=do_QueryInterface(mSink,&theStatus);
    if(NS_SUCCEEDED(theStatus)) {
      theDumper->DumpContentModel();
    }
  }
#endif

  NS_IF_RELEASE(mObserver);
  NS_IF_RELEASE(mProgressEventSink);
  NS_IF_RELEASE(mSink);
  NS_IF_RELEASE(mParserFilter);
  NS_IF_RELEASE(mBundle);

  //don't forget to add code here to delete
  //what may be several contexts...
  delete mParserContext;
}
|
|
|
|
|
|
NS_IMPL_ADDREF(nsParser)
|
|
NS_IMPL_RELEASE(nsParser)
|
|
//NS_IMPL_ISUPPORTS(nsParser,NS_IHTML_HTMLPARSER_IID)
|
|
|
|
|
|
/**
|
|
* This method gets called as part of our COM-like interfaces.
|
|
* Its purpose is to create an interface to parser object
|
|
* of some type.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param nsIID id of object to discover
|
|
* @param aInstancePtr ptr to newly discovered interface
|
|
* @return NS_xxx result code
|
|
*/
|
|
// Hands out the interface identified by aIID, adding a reference on success.
//   returns NS_ERROR_NULL_POINTER when aInstancePtr is null,
//           NS_NOINTERFACE (with *aInstancePtr cleared) for an unknown IID,
//           NS_OK otherwise.
nsresult nsParser::QueryInterface(const nsIID& aIID, void** aInstancePtr)
{
  if (NULL == aInstancePtr) {
    return NS_ERROR_NULL_POINTER;
  }

  void* theInterface=0;

  if(aIID.Equals(kISupportsIID) || aIID.Equals(kIParserIID)) {
    //IUnknown and the IParser base class share one pointer...
    theInterface=(nsIParser*)(this);
  }
  else if(aIID.Equals(NS_GET_IID(nsIProgressEventSink)) ||
          aIID.Equals(NS_GET_IID(nsIStreamListener))) {
    // NOTE(review): a request for nsIProgressEventSink is answered with the
    // nsIStreamListener pointer. That looks wrong, but the class declaration
    // is not visible here -- confirm whether this branch should exist at all.
    theInterface=(nsIStreamListener*)(this);
  }
  else if(aIID.Equals(NS_GET_IID(nsIRequestObserver))) {
    theInterface=(nsIRequestObserver*)(this);
  }
  else if(aIID.Equals(kCParserCID)) {   //do this class...
    theInterface=(nsParser*)(this);
  }
  else if(aIID.Equals(NS_GET_IID(nsISupportsParserBundle))) {
    theInterface=(nsISupportsParserBundle*)(this);
  }

  *aInstancePtr=theInterface;
  if(!theInterface) {
    return NS_NOINTERFACE;
  }

  NS_ADDREF_THIS();
  return NS_OK;
}
|
|
|
|
|
|
/**
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param
|
|
* @return
|
|
*/
|
|
// Installs aFilter as the parser filter, replacing (and releasing) any
// previous one. Passing null removes the current filter.
//
// The new filter is referenced before the old one is released, so setting
// the filter that is already installed is safe. mParserFilter is always
// updated, so it never points at a filter we no longer hold a reference on.
//
// Returns the local |old| after NS_IF_RELEASE. The XPCOM release macros
// clear the pointer they release, so callers get null rather than a
// pointer they do not own.
nsIParserFilter * nsParser::SetParserFilter(nsIParserFilter * aFilter)
{
  nsIParserFilter* old=mParserFilter;

  NS_IF_ADDREF(aFilter);
  mParserFilter=aFilter;
  NS_IF_RELEASE(old);

  return old;
}
|
|
|
|
|
|
// Copies the stored command string into aCommand. The string is the one
// recorded by SetCommand(const char*).
void nsParser::GetCommand(nsString& aCommand)
{
  aCommand = mCommandStr;
}
|
|
|
|
/**
|
|
* Call this method once you've created a parser, and want to instruct it
|
|
* about the command which caused the parser to be constructed. For example,
|
|
* this allows us to select a DTD which can do, say, view-source.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param aCommand -- the command string that caused the parser to be constructed
|
|
* @return nothing
|
|
*/
|
|
void nsParser::SetCommand(const char* aCommand){
|
|
nsCAutoString theCommand(aCommand);
|
|
if(theCommand.Equals(kViewSourceCommand))
|
|
mCommand=eViewSource;
|
|
else mCommand=eViewNormal;
|
|
mCommandStr.AssignWithConversion(aCommand);
|
|
}
|
|
|
|
/**
|
|
* Call this method once you've created a parser, and want to instruct it
|
|
* about the command which caused the parser to be constructed. For example,
|
|
* this allows us to select a DTD which can do, say, view-source.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param aParserCommand -- the command that caused the parser to be constructed
|
|
* @return nothing
|
|
*/
|
|
// Records the command as an enum value.
// NOTE(review): unlike the string overload this leaves mCommandStr alone,
// so GetCommand() may report a stale string -- confirm this is intended.
void nsParser::SetCommand(eParserCommands aParserCommand){
  mCommand = aParserCommand;
}
|
|
|
|
|
|
/**
|
|
* Call this method once you've created a parser, and want to instruct it
|
|
* about what charset to load
|
|
*
|
|
* @update ftang 4/23/99
|
|
* @param aCharset- the charset of a document
|
|
* @param aCharsetSource- the source of the charset
|
|
* @return nada
|
|
*/
|
|
// Remembers the document charset and where it came from, and passes both
// on to the scanner of the current parser context when there is one.
void nsParser::SetDocumentCharset(nsString& aCharset, nsCharsetSource aCharsetSource){
  mCharset = aCharset;
  mCharsetSource = aCharsetSource;

  if(!mParserContext) {
    return;
  }
  if(mParserContext->mScanner) {
    mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
  }
}
|
|
|
|
/**
|
|
* This method gets called in order to set the content
|
|
* sink for this parser to dump nodes to.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param nsIContentSink interface for node receiver
|
|
* @return
|
|
*/
|
|
// Installs aSink as the content sink, replacing (and releasing) any
// previous one, and tells the new sink which parser it belongs to.
//
// A null aSink trips the precondition in debug builds. In other builds it
// removes the current sink, so mSink never dangles.
// The new sink is referenced before the old one is released, so
// re-installing the current sink is safe.
//
// Returns the local |old| after NS_IF_RELEASE. The XPCOM release macros
// clear the pointer they release, so callers get null rather than a
// pointer they do not own.
nsIContentSink* nsParser::SetContentSink(nsIContentSink* aSink) {
  NS_PRECONDITION(0!=aSink,"sink cannot be null!");
  nsIContentSink* old=mSink;

  NS_IF_ADDREF(aSink);
  mSink=aSink;
  NS_IF_RELEASE(old);

  if(mSink) {
    mSink->SetParser(this);
  }
  return old;
}
|
|
|
|
/**
|
|
* retrieve the sink set into the parser
|
|
* @update gess5/11/98
|
|
* @param aSink is the new sink to be used by parser
|
|
* @return old sink, or NULL
|
|
*/
|
|
// Returns the current content sink (null if none was set). No reference
// is added for the caller.
nsIContentSink* nsParser::GetContentSink(void){
  return mSink;
}
|
|
|
|
/**
|
|
* Call this method when you want to
|
|
* register your dynamic DTD's with the parser.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param aDTD is the object to be registered.
|
|
* @return nothing.
|
|
*/
|
|
// Registers aDTD with the shared DTD list, which is created on demand.
void nsParser::RegisterDTD(nsIDTD* aDTD){
  GetSharedObjects().RegisterDTD(aDTD);
}
|
|
|
|
/**
|
|
* Retrieve scanner from topmost parsecontext
|
|
*
|
|
* @update gess 01/04/99
|
|
* @return ptr to internal scanner
|
|
*/
|
|
// Returns the scanner of the current parser context, or 0 when the parser
// has no context.
nsScanner* nsParser::GetScanner(void){
  if(!mParserContext) {
    return 0;
  }
  return mParserContext->mScanner;
}
|
|
|
|
|
|
/**
|
|
* Retrieve parsemode from topmost parser context
|
|
*
|
|
* @update gess 01/04/99
|
|
* @return parsemode
|
|
*/
|
|
// Returns the DTD mode of the current parser context, or eDTDMode_unknown
// when the parser has no context.
nsDTDMode nsParser::GetParseMode(void){
  if(!mParserContext) {
    return eDTDMode_unknown;
  }
  return mParserContext->mDTDMode;
}
|
|
|
|
|
|
|
|
class CWordTokenizer {
|
|
public:
|
|
CWordTokenizer(nsString& aString,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
|
|
mLength=0;
|
|
mOffset=aStartOffset;
|
|
mMaxOffset=aMaxOffset;
|
|
mBuffer=aString.GetUnicode();
|
|
mEndBuffer=mBuffer+mMaxOffset;
|
|
}
|
|
|
|
//********************************************************************************
|
|
// Get offset of nth word in string.
|
|
// We define words as:
|
|
// 1) sequence of alphanum;
|
|
// 2) quoted substring
|
|
// 3) SGML comment -- ... --
|
|
// Returns offset of nth word, or -1 (if out of words).
|
|
//********************************************************************************
|
|
|
|
PRInt32 GetNextWord() {
|
|
|
|
const PRUnichar *cp=mBuffer+mOffset+mLength; //skip last word
|
|
|
|
mLength=0; //reset this
|
|
mOffset=-1; //reset this
|
|
|
|
//now skip whitespace...
|
|
|
|
PRUnichar target=0;
|
|
PRBool done=PR_FALSE;
|
|
|
|
while((!done) && (cp++<mEndBuffer)) {
|
|
switch(*cp) {
|
|
case kSpace: case kNewLine:
|
|
case kCR: case kTab:
|
|
continue;
|
|
|
|
case kQuote:
|
|
case kMinus:
|
|
target=*cp;
|
|
done=PR_TRUE;
|
|
break;
|
|
|
|
default:
|
|
done=PR_TRUE;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(cp<mEndBuffer) {
|
|
|
|
const PRUnichar *firstcp=cp; //hang onto this...
|
|
PRInt32 theDashCount=2;
|
|
|
|
cp++; //just skip first letter to simplify processing...
|
|
|
|
//ok, now find end of this word
|
|
while(cp++<mEndBuffer) {
|
|
if(kQuote==target) {
|
|
if(kQuote==*cp) {
|
|
cp++;
|
|
break; //we found our end...
|
|
}
|
|
}
|
|
else if(kMinus==target) {
|
|
//then let's look for SGML comments
|
|
if(kMinus==*cp) {
|
|
if(4==++theDashCount) {
|
|
cp++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
if((kSpace==*cp) ||
|
|
(kNewLine==*cp) ||
|
|
(kGreaterThan==*cp) ||
|
|
(kQuote==*cp) ||
|
|
(kCR==*cp) ||
|
|
(kTab==*cp)) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
mLength=cp-firstcp;
|
|
mOffset = (0<mLength) ? firstcp-mBuffer : -1;
|
|
|
|
}
|
|
|
|
return mOffset;
|
|
}
|
|
|
|
PRInt32 mOffset;
|
|
PRInt32 mMaxOffset;
|
|
PRInt32 mLength;
|
|
const PRUnichar* mBuffer;
|
|
const PRUnichar* mEndBuffer;
|
|
};
|
|
|
|
|
|
/*************************************************************************************************
|
|
First, let's define our modalities:
|
|
|
|
1. compatibility-mode: behave as much like nav4 as possible (unless it's too broken to bother)
|
|
2. standard-mode: do html as well as you can per spec, and throw out navigator quirks
|
|
3. strict-mode: adhere to the strict DTD specification to the highest degree possible
|
|
|
|
Assume the doctype is in the following form:
|
|
<!DOCTYPE [Top Level Element] [Availability] "[Registration]// [Owner-ID] // [Type] [desc-text] // [Language]" "URI|text-identifier">
|
|
[HTML] [PUBLIC|SYSTEM] [+|-] [W3C|IETF|...] [DTD] "..." [EN]|...] "..."
|
|
|
|
|
|
Here are the new rules for DTD handling; comments welcome:
|
|
|
|
XHTML and XML documents are always strict-mode:
|
|
example: <!DOCTYPE \"-//W3C//DTD XHTML 1.0 Strict//EN\">
|
|
|
|
HTML strict dtd's enable strict-mode:
|
|
example: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
|
|
example: <!DOCTYPE \"ISO/IEC 15445:1999//DTD HTML//EN\">
|
|
|
|
HTML 4.0 (or greater) transitional, frameset, (etc), without URI enables compatibility-mode:
|
|
example: <!DOCTYPE \"-//W3C//DTD HTML 4.01 Transitional//EN\">
|
|
|
|
HTML 4.0 (or greater) transitional, frameset, (etc), with a URI that points to the strict.dtd will become strict:
|
|
example: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
|
|
"http://www.w3.org/TR/REC-html40/strict.dtd">
|
|
|
|
doctypes with systemID's or internal subset are handled in strict-mode:
|
|
example: <!DOCTYPE HTML PUBLIC PublicID SystemID>
|
|
example: <!DOCTYPE HTML SYSTEM SystemID>
|
|
example: <!DOCTYPE HTML (PUBLIC PublicID SystemID? | SYSTEM SystemID) [ Internal-SS ]>
|
|
|
|
All other doctypes (<4.0), and documents without a doctype are handled in compatibility-mode.
|
|
|
|
*****************************************************************************************************/
|
|
|
|
static
|
|
PRBool IsLoosePI(nsString& aBuffer,PRInt32 anOffset,PRInt32 aCount) {
|
|
PRBool result=PR_FALSE;
|
|
|
|
if((aBuffer.Find("TRANSITIONAL",PR_TRUE,anOffset,aCount)>kNotFound)||
|
|
(aBuffer.Find("LOOSE",PR_TRUE,anOffset,aCount)>kNotFound) ||
|
|
(aBuffer.Find("FRAMESET",PR_TRUE,anOffset,aCount)>kNotFound) ||
|
|
(aBuffer.Find("LATIN1", PR_TRUE,anOffset,aCount) >kNotFound) ||
|
|
(aBuffer.Find("SYMBOLS",PR_TRUE,anOffset,aCount) >kNotFound) ||
|
|
(aBuffer.Find("SPECIAL",PR_TRUE,anOffset,aCount) >kNotFound)) {
|
|
|
|
result=PR_TRUE;
|
|
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
* This is called when it's time to find out
|
|
* what mode the parser/DTD should run for this document.
|
|
* (Each parsercontext can have its own mode).
|
|
*
|
|
* @update gess 06/24/00
|
|
* @return parsermode (define in nsIParser.h)
|
|
*/
|
|
static
|
|
void DetermineParseMode(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType& aDocType,const nsString& aMimeType) {
|
|
const char* theModeStr= PR_GetEnv("PARSE_MODE");
|
|
|
|
aParseMode=eDTDMode_quirks;
|
|
aDocType=eHTML3Text;
|
|
|
|
nsDTDMode thePublicID=eDTDMode_quirks;
|
|
nsDTDMode theSystemID=eDTDMode_unknown;
|
|
|
|
PRBool theMimeTypeIsHTML=aMimeType.EqualsWithConversion(kHTMLTextContentType);
|
|
|
|
//let's eliminate non-HTML as quickly as possible...
|
|
|
|
PRInt32 theIndex=aBuffer.Find("?XML",PR_TRUE,0,128);
|
|
if(kNotFound!=theIndex) {
|
|
aParseMode=eDTDMode_strict;
|
|
if(theMimeTypeIsHTML) {
|
|
//this is here to prevent a crash if someone gives us an XML document,
|
|
//but necko tells us it's a text/html mimetype.
|
|
aDocType=eHTML4Text;
|
|
aParseMode=eDTDMode_strict;
|
|
}
|
|
else {
|
|
if(!aMimeType.EqualsWithConversion(kPlainTextContentType)) {
|
|
aDocType=eXMLText;
|
|
aParseMode=eDTDMode_strict;
|
|
theSystemID=thePublicID=eDTDMode_strict;
|
|
}
|
|
else aDocType=ePlainText;
|
|
}
|
|
}
|
|
else if(aMimeType.EqualsWithConversion(kPlainTextContentType)) {
|
|
aDocType=ePlainText;
|
|
aParseMode=eDTDMode_quirks;
|
|
return;
|
|
}
|
|
else if(aMimeType.EqualsWithConversion(kTextCSSContentType)) {
|
|
aDocType=ePlainText;
|
|
aParseMode=eDTDMode_quirks;
|
|
return;
|
|
}
|
|
|
|
|
|
//now let's see if we have HTML or XHTML...
|
|
|
|
PRInt32 theOffset=0;
|
|
PRInt32 theDocTypePos=aBuffer.Find("!DOCTYPE",PR_TRUE,0,500); //find doctype
|
|
if(kNotFound!=theDocTypePos){
|
|
theOffset=theDocTypePos-2;
|
|
}
|
|
|
|
PRInt32 theLTPos=aBuffer.FindChar(kLessThan,PR_FALSE,theOffset);
|
|
PRInt32 theGTPos=aBuffer.FindChar(kGreaterThan,PR_FALSE,theOffset);
|
|
|
|
PRInt32 theMajorVersion=3;
|
|
PRInt32 theMinorVersion=0;
|
|
|
|
if((kNotFound!=theGTPos) && (kNotFound!=theLTPos)) {
|
|
|
|
const PRUnichar* theBuffer=aBuffer.GetUnicode();
|
|
CWordTokenizer theTokenizer(aBuffer,theLTPos,theGTPos);
|
|
theOffset=theTokenizer.GetNextWord(); //try to find ?xml, !doctype, etc...
|
|
|
|
if((kNotFound!=theOffset) && (kNotFound!=theDocTypePos)) {
|
|
|
|
//Ok -- so assume it's (X)HTML; now figure out the flavor...
|
|
|
|
PRInt32 theIter=0; //prevent infinite loops...
|
|
PRBool done=PR_FALSE; //use this to quit if we find garbage...
|
|
PRBool readSystemID=PR_FALSE;
|
|
|
|
theOffset=theTokenizer.GetNextWord();
|
|
|
|
while((kNotFound!=theOffset) && (!done)) {
|
|
|
|
PRUnichar theChar=*(theBuffer+theTokenizer.mOffset);
|
|
if(kQuote==theChar) {
|
|
|
|
if(readSystemID) {
|
|
|
|
PRInt32 thePrefix=aBuffer.Find("http://www.w3.org/tr/",PR_TRUE,theOffset,5); //find the prefix
|
|
|
|
if(kNotFound!=thePrefix) {
|
|
thePrefix+=20;
|
|
if(IsLoosePI(aBuffer,thePrefix,25)) { //find loose.dtd
|
|
theSystemID=eDTDMode_transitional;
|
|
}
|
|
else if(kNotFound!=aBuffer.Find("strict.dtd",PR_TRUE,thePrefix,25)) { //find strict.dtd
|
|
theSystemID=eDTDMode_strict;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
else { //the public ID...
|
|
|
|
readSystemID=PR_TRUE;
|
|
|
|
PRInt32 theDTDPos=aBuffer.Find("//DTD",PR_TRUE,theOffset,theTokenizer.mLength);
|
|
if(theDTDPos) {
|
|
|
|
//first, let's see if it's XHML...
|
|
PRInt32 theMLTagPos=aBuffer.Find("XHTML",PR_TRUE,theOffset,theTokenizer.mLength);
|
|
if(kNotFound!=theMLTagPos) {
|
|
aDocType=eXHTMLText;
|
|
if(IsLoosePI(aBuffer,theMLTagPos+4,20))
|
|
thePublicID=eDTDMode_transitional;
|
|
else thePublicID=eDTDMode_strict;
|
|
}
|
|
|
|
else {
|
|
|
|
//now check for strict ISO/IEC OWNER...
|
|
if(kNotFound!=aBuffer.Find("15445:1999",PR_FALSE,theOffset,theDTDPos-theTokenizer.mOffset)) {
|
|
thePublicID=eDTDMode_strict; //this ISO/IEC DTD is always strict.
|
|
aDocType=eHTML4Text;
|
|
}
|
|
|
|
else {
|
|
|
|
//for W3C DTD's, let's make sure it's HTML...
|
|
PRInt32 theMLTagPos=aBuffer.Find("HTML",PR_TRUE,theOffset,theTokenizer.mLength);
|
|
if(kNotFound==theMLTagPos) {
|
|
theMLTagPos=aBuffer.Find("HYPERTEXT MARKUP",PR_TRUE,theOffset,theTokenizer.mLength);
|
|
}
|
|
|
|
if(kNotFound!=theMLTagPos) {
|
|
//and now check the version number...
|
|
|
|
PRInt32 theVersionPos=aBuffer.FindCharInSet("1234567890",theMLTagPos);
|
|
|
|
if((0<=theVersionPos) && (theVersionPos<theGTPos)) {
|
|
nsAutoString theNum;
|
|
PRInt32 theTerminal=aBuffer.FindCharInSet(" />",theVersionPos+1);
|
|
if(theTerminal) {
|
|
aBuffer.Mid(theNum,theVersionPos,theTerminal-theVersionPos);
|
|
}
|
|
else aBuffer.Mid(theNum,theVersionPos,4);
|
|
PRInt32 theErr=0;
|
|
theMajorVersion=theNum.ToInteger(&theErr);
|
|
if('.'==theNum[1]) {
|
|
theNum.Cut(0,2);
|
|
theMinorVersion=theNum.ToInteger(&theErr);
|
|
}
|
|
|
|
if((0==theErr) && (3<theMajorVersion) && (theMajorVersion<100)) {
|
|
if(IsLoosePI(aBuffer,theVersionPos+2,20))
|
|
thePublicID=eDTDMode_transitional;
|
|
else thePublicID=eDTDMode_strict;
|
|
aDocType=eHTML4Text;
|
|
}
|
|
} //if
|
|
} //if
|
|
} //else
|
|
|
|
} //else
|
|
}
|
|
|
|
} //if publicID
|
|
|
|
} //if quote
|
|
|
|
else if(kMinus==theChar) {
|
|
//explicitly skip comments...
|
|
}
|
|
|
|
else { //handle an id
|
|
if(0==nsCRT::strncasecmp(theBuffer+theOffset,"SYSTEM",theTokenizer.mLength))
|
|
readSystemID=PR_TRUE;
|
|
else if(0==nsCRT::strncasecmp(theBuffer+theOffset,"HTML",theTokenizer.mLength))
|
|
readSystemID=PR_FALSE;
|
|
}
|
|
|
|
theOffset=theTokenizer.GetNextWord();
|
|
if(++theIter>10) done=PR_TRUE; //prevent infinite loops...
|
|
} //while
|
|
|
|
|
|
if(theSystemID==thePublicID)
|
|
aParseMode=thePublicID;
|
|
else if(eDTDMode_unknown==theSystemID){
|
|
aParseMode=thePublicID;
|
|
if(eHTML4Text==aDocType) {
|
|
if (eDTDMode_transitional==thePublicID)
|
|
aParseMode=eDTDMode_quirks; //degrade because the systemID is missing.
|
|
}
|
|
}
|
|
else if((eDTDMode_transitional==thePublicID) && (eDTDMode_strict==theSystemID)) {
|
|
aParseMode=(4<=theMajorVersion) ? eDTDMode_strict : eDTDMode_quirks;
|
|
}
|
|
else {
|
|
//ack! The doctype is badly formed (system and public ID's contradict).
|
|
//let's switch back to default compatibility mode...
|
|
aParseMode=eDTDMode_unknown;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(eDTDMode_unknown==aParseMode) {
|
|
//nothing left to do but fail gracefully...
|
|
if(eXHTMLText==aDocType) {
|
|
aParseMode=eDTDMode_transitional;
|
|
}
|
|
if(eHTML4Text==aDocType) {
|
|
aDocType=eHTML3Text;
|
|
aParseMode=eDTDMode_quirks;
|
|
}
|
|
}
|
|
|
|
if(eXHTMLText==aDocType) {
|
|
aParseMode=eDTDMode_strict;
|
|
if(theMimeTypeIsHTML){
|
|
aDocType=eHTML4Text;
|
|
}
|
|
}
|
|
|
|
#ifdef DISABLE_TRANSITIONAL_MODE
|
|
|
|
/********************************************************************************************
|
|
The following code is here because to deal with a nasty backward compatibility problem.
|
|
The composer product emits <doctype HTML 4.0 Transitional> for the documents it creates,
|
|
but the documents aren't really compliant. To prevent lots of pages from breaking, well
|
|
disable proper handling of Transitional doctypes and use quirks mode instead. If lucky,
|
|
we'll get to add a pref to allow power users to get the right answer.
|
|
********************************************************************************************/
|
|
|
|
if(eDTDMode_transitional==aParseMode) {
|
|
if(eHTML4Text==aDocType)
|
|
aParseMode=(0==theMinorVersion) ? eDTDMode_quirks: eDTDMode_strict;
|
|
// else if(eXHTMLText==aDocType)
|
|
// aParseMode=eDTDMode_strict;
|
|
}
|
|
#endif
|
|
|
|
|
|
}
|
|
|
|
|
|
/**
|
|
* This is called when it's time to find out
|
|
* what mode the parser/DTD should run for this document.
|
|
* (Each parsercontext can have its own mode).
|
|
*
|
|
* @update gess 02/17/00
|
|
* @return parsermode (define in nsIParser.h)
|
|
*/
|
|
static
|
|
void DetermineParseMode2(nsString& aBuffer,nsDTDMode& aParseMode,eParserDocType& aDocType,const nsString& aMimeType) {
|
|
const char* theModeStr= PR_GetEnv("PARSE_MODE");
|
|
|
|
aParseMode = eDTDMode_unknown;
|
|
|
|
PRInt32 theGTPos=aBuffer.FindChar(kGreaterThan);
|
|
|
|
PRInt32 theIndex=aBuffer.Find("DOCTYPE",PR_TRUE,0,100);
|
|
if(kNotFound<theIndex) {
|
|
|
|
//good, we found "DOCTYPE" -- now go find it's end delimiter '>'
|
|
PRInt32 theEnd=(kNotFound==theGTPos) ? 512 : MinInt(512,theGTPos);
|
|
PRInt32 theSubIndex=aBuffer.Find("//DTD",PR_TRUE,theIndex+8,theEnd-(theIndex+8)); //skip to the type and desc-text...
|
|
PRInt32 theErr=0;
|
|
PRInt32 theMajorVersion=3;
|
|
|
|
//note that if we don't find '>', then we just scan the first 512 bytes.
|
|
|
|
if(kNotFound!=theSubIndex) {
|
|
|
|
//if you're here then we found the //DTD identifier type...
|
|
|
|
PRInt32 theStartPos=theSubIndex+5;
|
|
PRInt32 theCount=theEnd-theStartPos;
|
|
|
|
if(kNotFound!=aBuffer.Find("ISO/IEC 15445:1999",PR_TRUE,theIndex,theEnd-theIndex)) {
|
|
//per spec, this DTD is always strict...
|
|
aParseMode=eDTDMode_strict;
|
|
aDocType=eHTML4Text;
|
|
theMajorVersion=4;
|
|
return;
|
|
}
|
|
|
|
if (kNotFound!=aBuffer.Find("XHTML",PR_TRUE,theStartPos,theCount)) {
|
|
aDocType=eXHTMLText;
|
|
aParseMode=eDTDMode_strict;
|
|
return;
|
|
}
|
|
|
|
if(kNotFound<theSubIndex) {
|
|
|
|
//grab the next word
|
|
|
|
PRInt32 theHTMLTagPos=aBuffer.Find("HTML",PR_TRUE,theStartPos,theCount);
|
|
if(kNotFound==theHTMLTagPos) {
|
|
theHTMLTagPos=aBuffer.Find("HYPERTEXT MARKUP",PR_TRUE,theStartPos,theCount);
|
|
}
|
|
|
|
if(kNotFound!=theHTMLTagPos) {
|
|
|
|
//get the next substring from the buffer, which should be a number.
|
|
//now see what the version number is...
|
|
|
|
PRInt32 theVersionPos=aBuffer.FindCharInSet("1234567890",theHTMLTagPos);
|
|
if((0<=theVersionPos) && (theVersionPos<theEnd)) {
|
|
nsAutoString theNum;
|
|
PRInt32 theTerminal=aBuffer.FindCharInSet(" />",theVersionPos+1);
|
|
if(theTerminal) {
|
|
aBuffer.Mid(theNum,theVersionPos,theTerminal-theVersionPos);
|
|
}
|
|
else aBuffer.Mid(theNum,theVersionPos,3);
|
|
theMajorVersion=theNum.ToInteger(&theErr);
|
|
if(theMajorVersion>10) {
|
|
theMajorVersion=3; //assume that's an error for now.
|
|
}
|
|
}
|
|
|
|
//now let's see if we have descriptive text (providing strictness id)...
|
|
|
|
theStartPos=theVersionPos+2;
|
|
theCount=theEnd-theStartPos;
|
|
if((aBuffer.Find("TRANSITIONAL",PR_TRUE,theVersionPos+2,theCount)>kNotFound)||
|
|
(aBuffer.Find("LOOSE",PR_TRUE,theStartPos,theCount)>kNotFound) ||
|
|
(aBuffer.Find("FRAMESET",PR_TRUE,theStartPos,theCount)>kNotFound) ||
|
|
(aBuffer.Find("LATIN1", PR_TRUE,theStartPos,theCount) >kNotFound) ||
|
|
(aBuffer.Find("SYMBOLS",PR_TRUE,theStartPos,theCount) >kNotFound) ||
|
|
(aBuffer.Find("SPECIAL",PR_TRUE,theStartPos,theCount) >kNotFound)) {
|
|
|
|
//for now TRANSITIONAL 4.0 documents are handled in the compatibility DTD.
|
|
//Soon, they'll be handled in the strictDTD in transitional mode.
|
|
aParseMode=eDTDMode_transitional;
|
|
|
|
}
|
|
else {
|
|
//since we didn't find descriptive text, let's check out the URI...
|
|
//to see whether that specifies the strict.dtd...
|
|
theStartPos+=6;
|
|
theCount=theEnd-theStartPos;
|
|
theSubIndex=aBuffer.Find("STRICT.DTD",PR_TRUE,theStartPos,theCount);
|
|
if(0<theSubIndex) {
|
|
//Since we found it, regardless of what's in the descr-text, kick into strict mode.
|
|
aParseMode=eDTDMode_strict;
|
|
aDocType=eHTML4Text;
|
|
}
|
|
}
|
|
|
|
switch(theMajorVersion) {
|
|
case 1:
|
|
case 2:
|
|
case 3:
|
|
aParseMode=eDTDMode_quirks;
|
|
aDocType=eHTML4Text;
|
|
break;
|
|
|
|
default:
|
|
if(eDTDMode_unknown==aParseMode) {
|
|
aParseMode=eDTDMode_strict;
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
} //if
|
|
}
|
|
else if(kNotFound<(theIndex=aBuffer.Find("?XML",PR_TRUE,0,128))) {
|
|
aParseMode=eDTDMode_strict;
|
|
if(aMimeType.EqualsWithConversion(kHTMLTextContentType)) {
|
|
//this is here to prevent a crash if someone gives us an XML document,
|
|
//but necko tells us it's a text/html mimetype.
|
|
aDocType=eHTML4Text;
|
|
aParseMode=eDTDMode_strict;
|
|
}
|
|
else aDocType=eXMLText;
|
|
}
|
|
else if(aMimeType.EqualsWithConversion(kPlainTextContentType)) {
|
|
aDocType=ePlainText;
|
|
aParseMode=eDTDMode_quirks;
|
|
}
|
|
|
|
if(theModeStr) {
|
|
if(0==nsCRT::strcasecmp(theModeStr,"strict"))
|
|
aParseMode=eDTDMode_strict;
|
|
}
|
|
else {
|
|
if(eDTDMode_unknown==aParseMode) {
|
|
aDocType=eHTML3Text;
|
|
aParseMode=eDTDMode_quirks;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
*
|
|
* @update gess 5/13/98
|
|
* @param
|
|
* @return
|
|
*/
|
|
static
|
|
PRBool FindSuitableDTD( CParserContext& aParserContext,nsString& aBuffer) {
|
|
|
|
//Let's start by trying the defaultDTD, if one exists...
|
|
if(aParserContext.mDTD)
|
|
if(aParserContext.mDTD->CanParse(aParserContext,aBuffer,0))
|
|
return PR_TRUE;
|
|
|
|
CSharedParserObjects& gSharedObjects=GetSharedObjects();
|
|
aParserContext.mValidator=gSharedObjects.mOtherDTD;
|
|
|
|
aParserContext.mAutoDetectStatus=eUnknownDetect;
|
|
PRInt32 theDTDIndex=0;
|
|
nsIDTD* theBestDTD=0;
|
|
nsIDTD* theDTD=0;
|
|
PRBool thePrimaryFound=PR_FALSE;
|
|
|
|
while((theDTDIndex<=gSharedObjects.mDTDDeque.GetSize()) && (aParserContext.mAutoDetectStatus!=ePrimaryDetect)){
|
|
theDTD=(nsIDTD*)gSharedObjects.mDTDDeque.ObjectAt(theDTDIndex++);
|
|
if(theDTD) {
|
|
// Store detect status in temp ( theResult ) to avoid bugs such as
|
|
// 36233, 36754, 36491, 36323. Basically, we should avoid calling DTD's
|
|
// WillBuildModel() multiple times, i.e., we shouldn't leave auto-detect-status
|
|
// unknown.
|
|
eAutoDetectResult theResult=theDTD->CanParse(aParserContext,aBuffer,0);
|
|
if(eValidDetect==theResult){
|
|
aParserContext.mAutoDetectStatus=eValidDetect;
|
|
theBestDTD=theDTD;
|
|
}
|
|
else if(ePrimaryDetect==theResult) {
|
|
theBestDTD=theDTD;
|
|
thePrimaryFound=PR_TRUE;
|
|
aParserContext.mAutoDetectStatus=ePrimaryDetect;
|
|
}
|
|
}
|
|
if((theDTDIndex==gSharedObjects.mDTDDeque.GetSize()) && (!thePrimaryFound)) {
|
|
if(!gSharedObjects.mHasXMLDTD) {
|
|
NS_NewWellFormed_DTD(&theDTD); //do this to view XML files...
|
|
gSharedObjects.mDTDDeque.Push(theDTD);
|
|
gSharedObjects.mHasXMLDTD=PR_TRUE;
|
|
}
|
|
else if(!gSharedObjects.mHasViewSourceDTD) {
|
|
NS_NewViewSourceHTML(&theDTD); //do this so all non-html files can be viewed...
|
|
gSharedObjects.mDTDDeque.Push(theDTD);
|
|
gSharedObjects.mHasViewSourceDTD=PR_TRUE;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(theBestDTD) {
|
|
|
|
//#define FORCE_HTML_THROUGH_STRICT_DTD
|
|
#if FORCE_HTML_THROUGH_STRICT_DTD
|
|
if(theBestDTD==gSharedObjects.mDTDDeque.ObjectAt(0))
|
|
theBestDTD=(nsIDTD*)gSharedObjects.mDTDDeque.ObjectAt(1);
|
|
#endif
|
|
|
|
theBestDTD->CreateNewInstance(&aParserContext.mDTD);
|
|
return PR_TRUE;
|
|
}
|
|
return PR_FALSE;
|
|
}
|
|
|
|
/**
|
|
* Call this method to determine a DTD for a DOCTYPE
|
|
*
|
|
* @update harishd 05/01/00
|
|
* @param aDTD -- Carries the deduced ( from DOCTYPE ) DTD.
|
|
* @param aDocTypeStr -- A doctype for which a DTD is to be selected.
|
|
* @param aMimeType -- A mimetype for which a DTD is to be selected.
|
|
* Note: aParseMode might be required.
|
|
* @param aCommand -- A command for which a DTD is to be selected.
|
|
* @param aParseMode -- Used with aMimeType to choose the correct DTD.
|
|
* @return NS_OK if succeeded else ERROR.
|
|
*/
|
|
NS_IMETHODIMP nsParser::CreateCompatibleDTD(nsIDTD** aDTD,
|
|
nsString* aDocTypeStr,
|
|
eParserCommands aCommand,
|
|
const nsString* aMimeType,
|
|
nsDTDMode aDTDMode)
|
|
{
|
|
nsresult result=NS_OK;
|
|
const nsCID* theDTDClassID=0;
|
|
|
|
/**
|
|
* If the command is eViewNormal then we choose the DTD from
|
|
* either the DOCTYPE or form the MIMETYPE. DOCTYPE is given
|
|
* precedence over MIMETYPE. The passsed in DTD mode takes
|
|
* precedence over the DTD mode figured out from the DOCTYPE string.
|
|
* Ex. Assume the following:
|
|
* aDocTypeStr=<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
|
|
* aCommand=eViewNormal
|
|
* aMimeType=text/html
|
|
* aDTDMode=eDTDMode_strict
|
|
* The above example would invoke DetermineParseMode(). This would figure out
|
|
* a DTD mode ( eDTDMode_quirks ) and the doctype (eHTML4Text). Based on this
|
|
* info. NavDTD would be chosen. However, since the passed in mode (aDTDMode) requests
|
|
* for a strict the COtherDTD ( strict mode ) would get chosen rather than NavDTD.
|
|
* That is, aDTDMode overrides theDTDMode ( configured by the DOCTYPE ).The mime type
|
|
* will be taken into consideration only if a DOCTYPE string is not available.
|
|
*
|
|
* Usage ( a sample ):
|
|
*
|
|
* nsCOMPtr<nsIDTD> theDTD;
|
|
* nsAutoString theMimeType;
|
|
* nsAutoString theDocType;
|
|
*
|
|
* theDocType.AssignWithConversion("<!DOCTYPE>");
|
|
* theMimeType.AssignWithConversion("text/html");
|
|
*
|
|
* result=CreateCompatibleDTD(getter_AddRefs(theDTD),&theDocType,eViewNormal,&theMimeType,eDTDMode_quirks);
|
|
*
|
|
*/
|
|
|
|
if(aCommand==eViewNormal) {
|
|
if(aDocTypeStr) {
|
|
nsDTDMode theDTDMode=eDTDMode_unknown;
|
|
eParserDocType theDocType=ePlainText;
|
|
|
|
if(!aMimeType) {
|
|
nsAutoString temp;
|
|
DetermineParseMode(*aDocTypeStr,theDTDMode,theDocType,temp);
|
|
}
|
|
else DetermineParseMode(*aDocTypeStr,theDTDMode,theDocType,*aMimeType);
|
|
|
|
NS_ASSERTION(aDTDMode==eDTDMode_unknown || aDTDMode==theDTDMode,"aDTDMode overrides the mode selected from the DOCTYPE ");
|
|
|
|
if(aDTDMode!=eDTDMode_unknown) theDTDMode=aDTDMode; // aDTDMode takes precedence over theDTDMode
|
|
|
|
switch(theDocType) {
|
|
case eHTML4Text:
|
|
if((theDTDMode==eDTDMode_strict) || (theDTDMode==eDTDMode_transitional)) {
|
|
theDTDClassID=&kCOtherDTDCID;
|
|
break;
|
|
}
|
|
case eHTML3Text:
|
|
theDTDClassID=&kNavDTDCID;
|
|
break;
|
|
case eXHTMLText:
|
|
case eXMLText:
|
|
theDTDClassID=&kWellFormedDTDCID;
|
|
break;
|
|
default:
|
|
theDTDClassID=&kNavDTDCID;
|
|
break;
|
|
}
|
|
}
|
|
else if(aMimeType) {
|
|
|
|
NS_ASSERTION(aDTDMode!=eDTDMode_unknown,"DTD selection might require a parsemode");
|
|
|
|
if(aMimeType->EqualsWithConversion(kHTMLTextContentType)) {
|
|
if((aDTDMode==eDTDMode_strict) || (aDTDMode==eDTDMode_transitional)) {
|
|
theDTDClassID=&kCOtherDTDCID;
|
|
}
|
|
else {
|
|
theDTDClassID=&kNavDTDCID;
|
|
}
|
|
}
|
|
else if(aMimeType->EqualsWithConversion(kPlainTextContentType)) {
|
|
theDTDClassID=&kNavDTDCID;
|
|
}
|
|
else if(aMimeType->EqualsWithConversion(kXMLTextContentType) ||
|
|
aMimeType->EqualsWithConversion(kXULTextContentType) ||
|
|
aMimeType->EqualsWithConversion(kRDFTextContentType)) {
|
|
theDTDClassID=&kWellFormedDTDCID;
|
|
}
|
|
else {
|
|
theDTDClassID=&kNavDTDCID;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
if(aCommand==eViewSource) {
|
|
theDTDClassID=&kViewSourceDTDCID;
|
|
}
|
|
}
|
|
|
|
result=(theDTDClassID)? nsComponentManager::CreateInstance(*theDTDClassID, nsnull, NS_GET_IID(nsIDTD),(void**)aDTD):NS_OK;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
//#define TEST_DOCTYPES
#ifdef TEST_DOCTYPES
// Null-terminated table of sample DOCTYPE declarations. It is only
// compiled when TEST_DOCTYPES is defined, in which case WillBuildModel()
// feeds each entry through DetermineParseMode() once.
static const char* doctypes[] = {

  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\">",

  // ---- XHTML doctypes, to be treated as such ----

  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Strict//EN\">",
  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Transitional//EN\">",
  "<!DOCTYPE \"-//W3C//DTD XHTML 1.0 Frameset//EN\">",

  // ---- HTML doctypes expected to be treated as strict ----

  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",

  "<!DOCTYPE \"-//W3C//DTD HTML 4.0//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.01//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.0 STRICT//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",

  "<!DOCTYPE \"ISO/IEC 15445:1999//DTD HyperText Markup Language//EN\">",
  "<!DOCTYPE \"ISO/IEC 15445:1999//DTD HTML//EN\">",
  "<!DOCTYPE \"-//SoftQuad Software//DTD HoTMetaL PRO 6.::19990601::extensions to HTML 4.//EN\">",

  "<!DOCTYPE \"-//W3C//DTD HTML 5.0//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 6.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">",

  // ---- doctypes expected to be treated as transitional (unless that mode is disabled) ----

  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.01 Transitional//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.1 Frameset//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.0 Transitional//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 4.0 Frameset//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",

  // ---- doctypes expected to be treated as quirks-compatible (like any other we encounter) ----

  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.02 Transitional//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">",

  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.00 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 6.01 Transitional//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" >",
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML Experimental 19960712//EN\">",
  "<!DOCTYPE \"-//W3O//DTD W3 HTML 3.0//EN//\">",
  "<!DOCTYPE \"-//IETF//DTD HTML//EN//3.\">",
  "<!DOCTYPE \"-//W3C//DTD W3 HTML 3.0//EN//\">",
  "<!DOCTYPE \"-//W3C//DTD W3 HTML 3.0//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 3.0 1995-03-24//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 3.0//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 3.0//EN//\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Level 3//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Level 3//EN//3.0\">",
  "<!DOCTYPE \"-//AS//DTD HTML 3 asWedit + extensions//EN\">",
  "<!DOCTYPE \"-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict//EN//3.0\">",
  "<!DOCTYPE \"-//W3C//DTD W3 HTML Strict 3//EN//\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 3//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 3//EN//3.0\">",

  "<!DOCTYPE \"HTML\">",
  "<!DOCTYPE \"-//IETF//DTD HTML//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML//EN//2\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 2.0//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Level 2//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Level 2//EN//2.0\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 2.0 Level 2//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Level 1//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Level 1//EN//2.0\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 2.0 Level 1//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Level 0//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Level 0//EN//2.0\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict//EN//2\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 2//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 2//EN//2.0\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 2.0 Strict//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 2.0 Strict Level 2//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 1//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 1//EN//2\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 2.0 Strict Level 1//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 0//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML Strict Level 0//EN//2.0\">",
  "<!DOCTYPE \"-//WebTechs//DTD Mozilla HTML//EN\">",
  "<!DOCTYPE \"-//WebTechs//DTD Mozilla HTML 2//EN\">",
  "<!DOCTYPE \"-//Netscape Comm Corp //DTD HTML//EN\">",
  "<!DOCTYPE \"-//Netscape Comm Corp //DTD Strict HTML//EN\">",
  "<!DOCTYPE \"-//Microsoft//DTD Internet Explorer 2.0 HTML//EN\">",
  "<!DOCTYPE \"-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//EN\">",
  "<!DOCTYPE \"-//Microsoft//DTD Internet Explorer 2.0 Tables//EN\">",
  "<!DOCTYPE \"-//Microsoft//DTD Internet Explorer 3.0 HTML//EN\">",
  "<!DOCTYPE \"-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//EN\">",
  "<!DOCTYPE \"-//Microsoft//DTD Internet Explorer 3.0 Tables//EN\">",
  "<!DOCTYPE \"-//Sun Microsystems Corp DTD HotJava HTML//EN\">",
  "<!DOCTYPE \"-//Sun Microsystems Corp //DTD HotJava Strict HTML//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML 2.1E//EN\">",
  "<!DOCTYPE \"-//O'Reilly and Associates//DTD HTML Extended 1.0//EN\">",
  "<!DOCTYPE \"-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//EN\">",
  "<!DOCTYPE \"-//O'Reilly and Associates//DTD HTML 2.0//EN\">",
  "<!DOCTYPE \"-//SQ//DTD HTML 2. HoTMetaL + extensions//EN\">",
  "<!DOCTYPE \"-//Spyglass//DTD HTML 2.0 Extended//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 3.2//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 3.2 Final//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 3.2 Draft//EN\">",
  "<!DOCTYPE \"-//W3C//DTD HTML 3.2S Draft//EN\">",
  "<!DOCTYPE \"-//IETF//DTD HTML i18n//EN\">",
  "<!DOCTYPE HTML PUBLIC \"-//SQ//DTD HTML 2.0 + all extensions//EN\" \"hmpro3.dtd\">",

  0  // end-of-table marker
};
#endif
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
/**
|
|
* This gets called just prior to the model actually
|
|
* being constructed. It's important to make this the
|
|
* last thing that happens right before parsing, so we
|
|
* can delay until the last moment the resolution of
|
|
* which DTD to use (unless of course we're assigned one).
|
|
*
|
|
* @update gess5/18/98
|
|
* @param
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::WillBuildModel(nsString& aFilename){
|
|
|
|
nsresult result=NS_OK;
|
|
|
|
|
|
#ifdef TEST_DOCTYPES
|
|
|
|
static PRBool tested=PR_FALSE;
|
|
const char** theDocType=doctypes;
|
|
|
|
if(!tested) {
|
|
tested=PR_TRUE;
|
|
nsDTDMode theMode=eDTDMode_unknown;
|
|
eParserDocType theDocumentType=ePlainText;
|
|
|
|
while(*theDocType) {
|
|
nsAutoString theType;
|
|
theType.AssignWithConversion(*theDocType);
|
|
DetermineParseMode(theType,theMode,theDocumentType,mParserContext->mMimeType);
|
|
theDocType++;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if(mParserContext){
|
|
if(eUnknownDetect==mParserContext->mAutoDetectStatus) {
|
|
mMajorIteration=-1;
|
|
mMinorIteration=-1;
|
|
|
|
nsAutoString theBuffer;
|
|
// XXXVidur Make a copy and only check in the first 1k
|
|
mParserContext->mScanner->Peek(theBuffer, 1024);
|
|
|
|
DetermineParseMode(theBuffer,mParserContext->mDTDMode,mParserContext->mDocType,mParserContext->mMimeType);
|
|
|
|
if(PR_TRUE==FindSuitableDTD(*mParserContext,theBuffer)) {
|
|
mParserContext->mDTD->WillBuildModel( *mParserContext,mSink);
|
|
}//if
|
|
}//if
|
|
}
|
|
else result=kInvalidParserContext;
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* This gets called when the parser is done with its input.
|
|
* Note that the parser may have been called recursively, so we
|
|
* have to check for a prev. context before closing out the DTD/sink.
|
|
* @update gess5/18/98
|
|
* @param
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::DidBuildModel(nsresult anErrorCode) {
|
|
//One last thing...close any open containers.
|
|
nsresult result=anErrorCode;
|
|
|
|
if(mParserContext && !mParserContext->mPrevContext) {
|
|
if(mParserContext->mDTD) {
|
|
result=mParserContext->mDTD->DidBuildModel(anErrorCode,PRBool(0==mParserContext->mPrevContext),this,mSink);
|
|
}
|
|
//Ref. to bug 61462.
|
|
NS_IF_RELEASE(mBundle);
|
|
}//if
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
* This method adds a new parser context to the list,
|
|
* pushing the current one to the next position.
|
|
* @update gess7/22/98
|
|
* @param ptr to new context
|
|
* @return nada
|
|
*/
|
|
void nsParser::PushContext(CParserContext& aContext) {
|
|
aContext.mPrevContext=mParserContext;
|
|
mParserContext=&aContext;
|
|
}
|
|
|
|
/**
|
|
* This method pops the topmost context off the stack,
|
|
* returning it to the user. The next context (if any)
|
|
* becomes the current context.
|
|
* @update gess7/22/98
|
|
* @return prev. context
|
|
*/
|
|
CParserContext* nsParser::PopContext() {
|
|
CParserContext* oldContext=mParserContext;
|
|
if(oldContext) {
|
|
mParserContext=oldContext->mPrevContext;
|
|
// If the old context was blocked, propogate the blocked state
|
|
// back to the new one. Also, propagate the stream listener state
|
|
// but don't override onStop state to guarantee the call to DidBuildModel().
|
|
if (mParserContext) {
|
|
if(mParserContext->mStreamListenerState!=eOnStop) {
|
|
mParserContext->mStreamListenerState = oldContext->mStreamListenerState;
|
|
}
|
|
}
|
|
}
|
|
return oldContext;
|
|
}
|
|
|
|
/**
|
|
* Call this when you want control whether or not the parser will parse
|
|
* and tokenize input (TRUE), or whether it just caches input to be
|
|
* parsed later (FALSE).
|
|
*
|
|
* @update gess 1/29/99
|
|
* @param aState determines whether we parse/tokenize or just cache.
|
|
* @return current state
|
|
*/
|
|
void nsParser::SetUnusedInput(nsString& aBuffer) {
|
|
mUnusedInput=aBuffer;
|
|
}
|
|
|
|
/**
|
|
* Call this when you want to *force* the parser to terminate the
|
|
* parsing process altogether. This is binary -- so once you terminate
|
|
* you can't resume without restarting altogether.
|
|
*
|
|
* @update gess 7/4/99
|
|
* @return should return NS_OK once implemented
|
|
*/
|
|
nsresult nsParser::Terminate(void){
|
|
nsresult result=NS_OK;
|
|
if(mParserContext && mParserContext->mDTD) {
|
|
result=mParserContext->mDTD->Terminate(this);
|
|
if(result==NS_ERROR_HTMLPARSER_STOPPARSING) {
|
|
// XXX - [ until we figure out a way to break parser-sink circularity ]
|
|
// Hack - Hold a reference until we are completely done...
|
|
nsCOMPtr<nsIParser> kungFuDeathGrip(this);
|
|
mInternalState=result;
|
|
DidBuildModel(result);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
*
|
|
* @update gess 1/29/99
|
|
* @param aState determines whether we parse/tokenize or just cache.
|
|
* @return current state
|
|
*/
|
|
nsresult nsParser::ContinueParsing(){
|
|
|
|
// If the stream has already finished, there's a good chance
|
|
// that we might start closing things down when the parser
|
|
// is reenabled. To make sure that we're not deleted across
|
|
// the reenabling process, hold a reference to ourselves.
|
|
nsresult result=NS_OK;
|
|
nsCOMPtr<nsIParser> kungFuDeathGrip(this);
|
|
|
|
mParserEnabled=PR_TRUE;
|
|
|
|
PRBool isFinalChunk=(mParserContext && mParserContext->mStreamListenerState==eOnStop)? PR_TRUE:PR_FALSE;
|
|
|
|
result=ResumeParse(PR_TRUE,isFinalChunk); // Ref. bug 57999
|
|
|
|
if(result!=NS_OK)
|
|
result=mInternalState;
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Stops parsing temporarily. That's it will prevent the
|
|
* parser from building up content model.
|
|
*
|
|
* @update
|
|
* @return
|
|
*/
|
|
void nsParser::BlockParser() {
|
|
mParserEnabled=PR_FALSE;
|
|
MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::BlockParser(), this=%p\n", this));
|
|
MOZ_TIMER_STOP(mParseTime);
|
|
}
|
|
|
|
/**
|
|
* Open up the parser for tokenization, building up content
|
|
* model..etc. However, this method does not resume parsing
|
|
* automatically. It's the callers' responsibility to restart
|
|
* the parsing engine.
|
|
*
|
|
* @update
|
|
* @return
|
|
*/
|
|
void nsParser::UnblockParser() {
|
|
mParserEnabled=PR_TRUE;
|
|
MOZ_TIMER_DEBUGLOG(("Start: Parse Time: nsParser::UnblockParser(), this=%p\n", this));
|
|
MOZ_TIMER_START(mParseTime);
|
|
}
|
|
|
|
/**
|
|
* Call this to query whether the parser is enabled or not.
|
|
*
|
|
* @update vidur 4/12/99
|
|
* @return current state
|
|
*/
|
|
PRBool nsParser::IsParserEnabled() {
|
|
return mParserEnabled;
|
|
}
|
|
|
|
|
|
/**
|
|
* This is the main controlling routine in the parsing process.
|
|
* Note that it may get called multiple times for the same scanner,
|
|
* since this is a pushed based system, and all the tokens may
|
|
* not have been consumed by the scanner during a given invocation
|
|
* of this method.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param aFilename -- const char* containing file to be parsed.
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::Parse(nsIURI* aURL,nsIRequestObserver* aListener,PRBool aVerifyEnabled, void* aKey,nsDTDMode aMode) {
|
|
|
|
NS_PRECONDITION(0!=aURL,kNullURL);
|
|
|
|
nsresult result=kBadURL;
|
|
mObserver = aListener;
|
|
NS_IF_ADDREF(mObserver);
|
|
mDTDVerification=aVerifyEnabled;
|
|
if(aURL) {
|
|
char* spec;
|
|
nsresult rv = aURL->GetSpec(&spec);
|
|
if (rv != NS_OK) {
|
|
return rv;
|
|
}
|
|
nsAutoString theName; theName.AssignWithConversion(spec);
|
|
nsCRT::free(spec);
|
|
|
|
nsScanner* theScanner=new nsScanner(theName,PR_FALSE,mCharset,mCharsetSource);
|
|
CParserContext* pc=new CParserContext(theScanner,aKey,mCommand,aListener);
|
|
if(pc && theScanner) {
|
|
pc->mMultipart=PR_TRUE;
|
|
pc->mContextType=CParserContext::eCTURL;
|
|
PushContext(*pc);
|
|
result=NS_OK;
|
|
}
|
|
else{
|
|
result=mInternalState=NS_ERROR_HTMLPARSER_BADCONTEXT;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
* Cause parser to parse input from given stream
|
|
* @update vidur 12/11/98
|
|
* @param aStream is the i/o source
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::Parse(nsIInputStream& aStream,const nsString& aMimeType,PRBool aVerifyEnabled, void* aKey,nsDTDMode aMode){
|
|
|
|
mDTDVerification=aVerifyEnabled;
|
|
nsresult result=NS_ERROR_OUT_OF_MEMORY;
|
|
|
|
//ok, time to create our tokenizer and begin the process
|
|
nsAutoString theUnknownFilename; theUnknownFilename.AssignWithConversion("unknown");
|
|
|
|
nsInputStream input(&aStream);
|
|
|
|
nsScanner* theScanner=new nsScanner(theUnknownFilename,input,mCharset,mCharsetSource);
|
|
CParserContext* pc=new CParserContext(theScanner,aKey,mCommand,0);
|
|
if(pc && theScanner) {
|
|
PushContext(*pc);
|
|
pc->SetMimeType(aMimeType);
|
|
pc->mStreamListenerState=eOnStart;
|
|
pc->mMultipart=PR_FALSE;
|
|
pc->mContextType=CParserContext::eCTStream;
|
|
mParserContext->mScanner->Eof();
|
|
result=ResumeParse();
|
|
pc=PopContext();
|
|
delete pc;
|
|
}
|
|
else{
|
|
result=mInternalState=NS_ERROR_HTMLPARSER_BADCONTEXT;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
* Call this method if all you want to do is parse 1 string full of HTML text.
|
|
* In particular, this method should be called by the DOM when it has an HTML
|
|
* string to feed to the parser in real-time.
|
|
*
|
|
* @update gess5/11/98
|
|
* @param aSourceBuffer contains a string-full of real content
|
|
* @param aMimeType tells us what type of content to expect in the given string
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::Parse(const nsAReadableString& aSourceBuffer,void* aKey,const nsString&
|
|
aMimeType,PRBool aVerifyEnabled,PRBool aLastCall,nsDTDMode aMode){
|
|
|
|
//NOTE: Make sure that updates to this method don't cause
|
|
// bug #2361 to break again!
|
|
|
|
nsresult result=NS_OK;
|
|
nsParser* me = this;
|
|
// Maintain a reference to ourselves so we don't go away
|
|
// till we're completely done.
|
|
NS_ADDREF(me);
|
|
|
|
if(aSourceBuffer.Length() || mUnusedInput.Length()) {
|
|
mDTDVerification=aVerifyEnabled;
|
|
CParserContext* pc=0;
|
|
|
|
if((!mParserContext) || (mParserContext->mKey!=aKey)) {
|
|
//only make a new context if we dont have one, OR if we do, but has a different context key...
|
|
|
|
nsScanner* theScanner=new nsScanner(mUnusedInput,mCharset,mCharsetSource);
|
|
nsIDTD *theDTD=0;
|
|
eAutoDetectResult theStatus=eUnknownDetect;
|
|
|
|
if(mParserContext && (mParserContext->mMimeType==aMimeType)) {
|
|
mParserContext->mDTD->CreateNewInstance(&theDTD); // To fix 32263
|
|
theStatus=mParserContext->mAutoDetectStatus;
|
|
|
|
//added this to fix bug 32022.
|
|
}
|
|
|
|
pc=new CParserContext(theScanner,aKey, mCommand,0,theDTD,theStatus,aLastCall);
|
|
|
|
if(pc && theScanner) {
|
|
PushContext(*pc);
|
|
|
|
pc->mMultipart=!aLastCall; //by default
|
|
if (pc->mPrevContext) {
|
|
pc->mMultipart |= pc->mPrevContext->mMultipart; //if available
|
|
}
|
|
|
|
// start fix bug 40143
|
|
if(pc->mMultipart) {
|
|
pc->mStreamListenerState=eOnDataAvail;
|
|
if(pc->mScanner) pc->mScanner->SetIncremental(PR_TRUE);
|
|
}
|
|
else {
|
|
pc->mStreamListenerState=eOnStop;
|
|
if(pc->mScanner) pc->mScanner->SetIncremental(PR_FALSE);
|
|
}
|
|
// end fix for 40143
|
|
|
|
pc->mContextType=CParserContext::eCTString;
|
|
pc->SetMimeType(aMimeType);
|
|
mUnusedInput.Truncate(0);
|
|
|
|
//printf("Parse(string) iterate: %i",PR_FALSE);
|
|
pc->mScanner->Append(aSourceBuffer);
|
|
result=ResumeParse(PR_FALSE);
|
|
|
|
}
|
|
else {
|
|
NS_RELEASE(me);
|
|
return NS_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
NS_IF_RELEASE(theDTD);
|
|
}
|
|
else {
|
|
mParserContext->mScanner->Append(aSourceBuffer);
|
|
if(!mParserContext->mPrevContext) {
|
|
// Set stream listener state to eOnStop, on the final context - Fix 68160,
|
|
// to guarantee DidBuildModel() call - Fix 36148
|
|
if(aLastCall) {
|
|
mParserContext->mStreamListenerState=eOnStop;
|
|
}
|
|
ResumeParse(PR_FALSE);
|
|
}
|
|
}
|
|
}//if
|
|
NS_RELEASE(me);
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @update gess 04/01/99
|
|
* @param
|
|
* @return
|
|
*/
|
|
nsresult nsParser::ParseFragment(const nsAReadableString& aSourceBuffer,void* aKey,nsITagStack& aStack,PRUint32 anInsertPos,const nsString& aMimeType,nsDTDMode aMode){
|
|
|
|
nsresult result=NS_OK;
|
|
nsAutoString theContext;
|
|
PRUint32 theCount=aStack.GetSize();
|
|
PRUint32 theIndex=0;
|
|
while(theIndex++<theCount){
|
|
theContext.AppendWithConversion("<");
|
|
theContext.Append(aStack.TagAt(theCount-theIndex));
|
|
theContext.AppendWithConversion(">");
|
|
}
|
|
theContext.AppendWithConversion("<endnote>"); //XXXHack! I'll make this better later.
|
|
nsAutoString theBuffer(theContext);
|
|
|
|
#if 0
|
|
//use this to force a buffer-full of content as part of a paste operation...
|
|
theBuffer.Append("<title>title</title><a href=\"one\">link</a>");
|
|
#else
|
|
|
|
//#define USEFILE
|
|
#ifdef USEFILE
|
|
|
|
const char* theFile="c:/temp/rhp.html";
|
|
fstream input(theFile,ios::in);
|
|
char buffer[1024];
|
|
int count=1;
|
|
while(count) {
|
|
input.getline(buffer,sizeof(buffer));
|
|
count=input.gcount();
|
|
if(0<count) {
|
|
buffer[count-1]=0;
|
|
theBuffer.Append(buffer,count-1);
|
|
}
|
|
}
|
|
|
|
#else
|
|
//this is the normal code path for paste...
|
|
theBuffer.Append(aSourceBuffer);
|
|
#endif
|
|
|
|
#endif
|
|
|
|
if(theBuffer.Length()){
|
|
//now it's time to try to build the model from this fragment
|
|
|
|
mObserversEnabled=PR_FALSE; //disable observers for fragments
|
|
result=Parse(theBuffer,(void*)&theBuffer,aMimeType,PR_FALSE,PR_TRUE);
|
|
mObserversEnabled=PR_TRUE; //now reenable.
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
* This routine is called to cause the parser to continue parsing it's underlying stream.
|
|
* This call allows the parse process to happen in chunks, such as when the content is push
|
|
* based, and we need to parse in pieces.
|
|
*
|
|
* An interesting change in how the parser gets used has led us to add extra processing to this method.
|
|
* The case occurs when the parser is blocked in one context, and gets a parse(string) call in another context.
|
|
* In this case, the parserContexts are linked. No problem.
|
|
*
|
|
* The problem is that Parse(string) assumes that it can proceed unabated, but if the parser is already
|
|
* blocked that assumption is false. So we needed to add a mechanism here to allow the parser to continue
|
|
* to process (the pop and free) contexts until 1) it get's blocked again; 2) it runs out of contexts.
|
|
*
|
|
*
|
|
* @update rickg 03.10.2000
|
|
* @param allowItertion : set to true if non-script resumption is requested
|
|
* @param aIsFinalChunk : tells us when the last chunk of data is provided.
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::ResumeParse(PRBool allowIteration, PRBool aIsFinalChunk) {
|
|
|
|
//printf(" Resume %i, prev-context: %p\n",allowIteration,mParserContext->mPrevContext);
|
|
|
|
|
|
nsresult result=NS_OK;
|
|
|
|
if(mParserEnabled && mInternalState!=NS_ERROR_HTMLPARSER_STOPPARSING) {
|
|
|
|
|
|
MOZ_TIMER_DEBUGLOG(("Start: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
|
|
MOZ_TIMER_START(mParseTime);
|
|
|
|
result=WillBuildModel(mParserContext->mScanner->GetFilename());
|
|
if(mParserContext->mDTD) {
|
|
|
|
mParserContext->mDTD->WillResumeParse();
|
|
PRBool theFirstTime=PR_TRUE;
|
|
PRBool theIterationIsOk=(theFirstTime || allowIteration||(!mParserContext->mPrevContext));
|
|
|
|
while((result==NS_OK) && (theIterationIsOk)) {
|
|
theFirstTime=PR_FALSE;
|
|
if(mUnusedInput.Length()>0) {
|
|
if(mParserContext->mScanner) {
|
|
// -- Ref: Bug# 22485 --
|
|
// Insert the unused input into the source buffer
|
|
// as if it was read from the input stream.
|
|
// Adding UngetReadable() per vidur!!
|
|
mParserContext->mScanner->UngetReadable(mUnusedInput);
|
|
mUnusedInput.Truncate(0);
|
|
}
|
|
}
|
|
|
|
nsresult theTokenizerResult=Tokenize(aIsFinalChunk); // kEOF==2152596456
|
|
result=BuildModel();
|
|
|
|
theIterationIsOk=PRBool(kEOF!=theTokenizerResult);
|
|
|
|
// Make sure not to stop parsing too early. Therefore, before shutting down the
|
|
// parser, it's important to check whether the input buffer has been scanned to
|
|
// completion ( theTokenizerResult should be kEOF ). kEOF -> End of buffer.
|
|
|
|
// If we're told to block the parser, we disable all further parsing
|
|
// (and cache any data coming in) until the parser is re-enabled.
|
|
|
|
if(NS_ERROR_HTMLPARSER_BLOCK==result) {
|
|
//BLOCK == 2152596464
|
|
mParserContext->mDTD->WillInterruptParse();
|
|
BlockParser();
|
|
return NS_OK;
|
|
}
|
|
|
|
else if (NS_ERROR_HTMLPARSER_STOPPARSING==result) {
|
|
mInternalState=result;
|
|
// Note: Parser Terminate() calls DidBuildModel.
|
|
if(NS_ERROR_HTMLPARSER_STOPPARSING!=theTokenizerResult) {
|
|
DidBuildModel(mStreamStatus);
|
|
}
|
|
break;
|
|
}
|
|
|
|
else if((NS_OK==result) && (theTokenizerResult==kEOF)){
|
|
|
|
PRBool theContextIsStringBased=PRBool(CParserContext::eCTString==mParserContext->mContextType);
|
|
if( (eOnStop==mParserContext->mStreamListenerState) ||
|
|
(!mParserContext->mMultipart) || theContextIsStringBased) {
|
|
|
|
if(!mParserContext->mPrevContext) {
|
|
if(eOnStop==mParserContext->mStreamListenerState) {
|
|
|
|
DidBuildModel(mStreamStatus);
|
|
|
|
MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
|
|
MOZ_TIMER_STOP(mParseTime);
|
|
|
|
MOZ_TIMER_LOG(("Parse Time (this=%p): ", this));
|
|
MOZ_TIMER_PRINT(mParseTime);
|
|
|
|
MOZ_TIMER_LOG(("DTD Time: "));
|
|
MOZ_TIMER_PRINT(mDTDTime);
|
|
|
|
MOZ_TIMER_LOG(("Tokenize Time: "));
|
|
MOZ_TIMER_PRINT(mTokenizeTime);
|
|
|
|
return result;
|
|
}
|
|
|
|
}
|
|
else {
|
|
|
|
CParserContext* theContext=PopContext();
|
|
if(theContext) {
|
|
theIterationIsOk=PRBool(allowIteration && theContextIsStringBased);
|
|
if(theContext->mCopyUnused) {
|
|
theContext->mScanner->CopyUnusedData(mUnusedInput);
|
|
}
|
|
delete theContext;
|
|
}
|
|
result = mInternalState;
|
|
//...then intentionally fall through to WillInterruptParse()...
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(kEOF==theTokenizerResult) {
|
|
mParserContext->mDTD->WillInterruptParse();
|
|
}
|
|
|
|
}//while
|
|
}//if
|
|
else {
|
|
mInternalState=result=NS_ERROR_HTMLPARSER_UNRESOLVEDDTD;
|
|
}
|
|
}//if
|
|
|
|
MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
|
|
MOZ_TIMER_STOP(mParseTime);
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* This is where we loop over the tokens created in the
|
|
* tokenization phase, and try to make sense out of them.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::BuildModel() {
|
|
|
|
//nsDequeIterator e=mParserContext->mTokenDeque.End();
|
|
|
|
// if(!mParserContext->mCurrentPos)
|
|
// mParserContext->mCurrentPos=new nsDequeIterator(mParserContext->mTokenDeque.Begin());
|
|
|
|
//Get the root DTD for use in model building...
|
|
|
|
CParserContext* theRootContext=mParserContext;
|
|
nsITokenizer* theTokenizer=0;
|
|
|
|
nsresult result=mParserContext->mDTD->GetTokenizer(theTokenizer);
|
|
if(theTokenizer){
|
|
|
|
while(theRootContext->mPrevContext) {
|
|
theRootContext=theRootContext->mPrevContext;
|
|
}
|
|
|
|
nsIDTD* theRootDTD=theRootContext->mDTD;
|
|
if(theRootDTD) {
|
|
MOZ_TIMER_START(mDTDTime);
|
|
result=theRootDTD->BuildModel(this,theTokenizer,mTokenObserver,mSink);
|
|
MOZ_TIMER_STOP(mDTDTime);
|
|
}
|
|
}
|
|
else{
|
|
mInternalState=result=NS_ERROR_HTMLPARSER_BADTOKENIZER;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
*
|
|
* @update gess1/22/99
|
|
* @param
|
|
* @return
|
|
*/
|
|
nsITokenizer* nsParser::GetTokenizer(void) {
|
|
nsITokenizer* theTokenizer=0;
|
|
if(mParserContext && mParserContext->mDTD) {
|
|
mParserContext->mDTD->GetTokenizer(theTokenizer);
|
|
}
|
|
return theTokenizer;
|
|
}
|
|
|
|
/*******************************************************************
|
|
These methods are used to talk to the netlib system...
|
|
*******************************************************************/
|
|
|
|
/**
|
|
*
|
|
*
|
|
* @update gess 5/12/98
|
|
* @param
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult
|
|
nsParser::OnProgress(nsIRequest *request, nsISupports* aContext, PRUint32 aProgress, PRUint32 aProgressMax)
|
|
{
|
|
nsresult result=0;
|
|
if (nsnull != mProgressEventSink) {
|
|
mProgressEventSink->OnProgress(request, aContext, aProgress, aProgressMax);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
*
|
|
*
|
|
* @update gess 5/12/98
|
|
* @param
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult
|
|
nsParser::OnStatus(nsIRequest *request, nsISupports* aContext,
|
|
nsresult aStatus, const PRUnichar* aStatusArg)
|
|
{
|
|
nsresult rv;
|
|
if (nsnull != mProgressEventSink) {
|
|
rv = mProgressEventSink->OnStatus(request, aContext, aStatus, aStatusArg);
|
|
NS_ASSERTION(NS_SUCCEEDED(rv), "dropping error result");
|
|
}
|
|
return NS_OK;
|
|
}
|
|
|
|
// Debug-only output stream; it exists only when rickgdebug is defined.
// OnStartRequest() creates it and OnStopRequest() closes and deletes it.
#if defined(rickgdebug)
#include <fstream.h>
fstream* gOutFile;
#endif
|
|
|
|
/**
|
|
*
|
|
*
|
|
* @update gess 5/12/98
|
|
* @param
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext) {
|
|
|
|
NS_PRECONDITION((eNone==mParserContext->mStreamListenerState),kBadListenerInit);
|
|
|
|
if (nsnull != mObserver) {
|
|
mObserver->OnStartRequest(request, aContext);
|
|
}
|
|
mParserContext->mStreamListenerState=eOnStart;
|
|
mParserContext->mAutoDetectStatus=eUnknownDetect;
|
|
mParserContext->mRequest=request;
|
|
mParserContext->mDTD=0;
|
|
nsresult rv;
|
|
char* contentType = nsnull;
|
|
nsCOMPtr<nsIChannel> channel = do_QueryInterface(request);
|
|
NS_ASSERTION(channel, "parser needs a channel to find a dtd");
|
|
|
|
rv = channel->GetContentType(&contentType);
|
|
if (NS_SUCCEEDED(rv))
|
|
{
|
|
mParserContext->SetMimeType( NS_ConvertASCIItoUCS2(contentType) );
|
|
nsCRT::free(contentType);
|
|
}
|
|
else
|
|
NS_ASSERTION(contentType, "parser needs a content type to find a dtd");
|
|
|
|
#ifdef rickgdebug
|
|
gOutFile= new fstream("c:/temp/out.file",ios::trunc);
|
|
#endif
|
|
|
|
return NS_OK;
|
|
}
|
|
|
|
|
|
// Charset labels assigned by detectByteOrderMark().

// 16-bit encodings
#define UCS2_BE     "UTF-16BE"
#define UCS2_LE     "UTF-16LE"

// 32-bit encodings, including the two unusual octet orders
#define UCS4_BE     "UTF-32BE"
#define UCS4_LE     "UTF-32LE"
#define UCS4_2143   "X-ISO-10646-UCS-4-2143"
#define UCS4_3412   "X-ISO-10646-UCS-4-3412"

// 8-bit encoding
#define UTF8        "UTF-8"
|
|
|
|
/**
 *  Guesses the character encoding of a document from its leading bytes:
 *  either a byte order mark, the byte pattern of a leading '<' / "<?",
 *  or the encoding label inside an "<?xml version ... ?>" declaration.
 *
 *  See http://www.w3.org/TR/1998/REC-xml-19980210#sec-guessing for the
 *  byte patterns. MS Win2K notepad also writes a three byte UTF-8
 *  signature (UCS2 BOM FEFF == UTF-8 EF BB BF), which is recognised too.
 *
 *  @param   aBytes         -- start of the raw (undecoded) data
 *  @param   aLen           -- number of bytes available at aBytes
 *  @param   oCharset       -- receives the detected charset name; emptied first
 *  @param   oCharsetSource -- set to kCharsetFromAutoDetection on entry, then
 *                             to kCharsetFromByteOrderMark for a BOM match or
 *                             kCharsetFromMetaTag for a label taken from the
 *                             XML declaration
 *  @return  non-zero when a charset name was stored in oCharset
 */
static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsString& oCharset, nsCharsetSource& oCharsetSource) {
  oCharsetSource= kCharsetFromAutoDetection;
  oCharset.SetLength(0);

  // Every pattern below looks at up to the first four bytes, so there
  // is nothing to detect (and nothing safe to read) in a shorter buffer.
  if((!aBytes) || (aLen < 4)) {
    return PR_FALSE;
  }

  switch(aBytes[0])
  {
    case 0x00:
      if(0x00==aBytes[1]) {
        // 00 00
        if((0x00==aBytes[2]) && (0x3C==aBytes[3])) {
          // 00 00 00 3C UCS-4, big-endian machine (1234 order)
          oCharset.AssignWithConversion(UCS4_BE);
        } else if((0x3C==aBytes[2]) && (0x00==aBytes[3])) {
          // 00 00 3C 00 UCS-4, unusual octet order (2143)
          oCharset.AssignWithConversion(UCS4_2143);
        }
      } else if(0x3C==aBytes[1]) {
        // 00 3C
        if((0x00==aBytes[2]) && (0x00==aBytes[3])) {
          // 00 3C 00 00 UCS-4, unusual octet order (3412)
          oCharset.AssignWithConversion(UCS4_3412);
        } else if((0x00==aBytes[2]) && (0x3F==aBytes[3])) {
          // 00 3C 00 3F UTF-16, big-endian, no Byte Order Mark
          // (the third byte must be 00, not 3C, for "<?" in UTF-16BE)
          oCharset.AssignWithConversion(UCS2_BE);
        }
      }
      break;

    case 0x3C:
      if(0x00==aBytes[1]) {
        // 3C 00
        if((0x00==aBytes[2]) && (0x00==aBytes[3])) {
          // 3C 00 00 00 UCS-4, little-endian machine (4321 order)
          oCharset.AssignWithConversion(UCS4_LE);
        } else if((0x3F==aBytes[2]) && (0x00==aBytes[3])) {
          // 3C 00 3F 00 UTF-16, little-endian, no Byte Order Mark
          oCharset.AssignWithConversion(UCS2_LE);
        }
      } else if((0x3F==aBytes[1]) &&
                (0x78==aBytes[2]) && (0x6D==aBytes[3]) &&
                (aLen >= 13) && // 13 == strlen("<?xml version"); don't compare past the buffer
                (0 == PL_strncmp("<?xml version", (const char*)aBytes, 13 ))) {
        // 3C 3F 78 6D -- an XML declaration in an ASCII compatible encoding.
        // Only the first XMLENCODING_PEEKBYTES bytes are searched for the label.
        nsAutoString firstXbytes;
        firstXbytes.AppendWithConversion((const char*)aBytes, (PRInt32)
                                         ((aLen > XMLENCODING_PEEKBYTES)?
                                          XMLENCODING_PEEKBYTES:
                                          aLen));
        PRInt32 xmlDeclEnd = firstXbytes.Find("?>", PR_FALSE, 13);
        // 27 == strlen("<?xml version="1" encoding=");
        if((kNotFound != xmlDeclEnd) &&(xmlDeclEnd > 27 )){
          // Drop everything from "?>" on so that the searches below stay
          // inside the declaration.
          firstXbytes.Cut(xmlDeclEnd, firstXbytes.Length()-xmlDeclEnd);
          PRInt32 encStart = firstXbytes.Find("encoding", PR_FALSE,13);
          if(kNotFound != encStart) {
            // 8 == strlen("encoding")
            encStart = firstXbytes.FindCharInSet("\"'", encStart+8);
            if(kNotFound != encStart) {
              // The label ends at the matching quote character.
              PRUnichar q = firstXbytes.CharAt(encStart);
              PRInt32 encEnd = firstXbytes.FindChar(q, PR_FALSE, encStart+1);
              if(kNotFound != encEnd) {
                PRInt32 count = encEnd - encStart -1;
                if(count >0) {
                  const PRUnichar *u = firstXbytes.GetUnicode();
                  // if UTF-16, it should have been detected by now
                  // otherwise, the label must be invalid
                  // NOTE(review): only `count` characters are compared, so a
                  // label that is a proper prefix of "UTF-16" is presumably
                  // rejected as well -- confirm against nsCRT::strncasecmp.
                  if (nsCRT::strncasecmp(&u[encStart+1], NS_LITERAL_STRING("UTF-16").get(), count)) {
                    firstXbytes.Mid(oCharset,(encStart+1), count);
                    oCharsetSource= kCharsetFromMetaTag;
                  }
                }
              }
            }
          }
        }
      }
      break;

    case 0xEF:
      if((0xBB==aBytes[1]) && (0xBF==aBytes[2])) {
        // EF BB BF
        // Win2K UTF-8 BOM
        oCharset.AssignWithConversion(UTF8);
        oCharsetSource= kCharsetFromByteOrderMark;
      }
      break;

    case 0xFE:
      if(0xFF==aBytes[1]) {
        // FE FF
        // UTF-16, big-endian
        oCharset.AssignWithConversion(UCS2_BE);
        oCharsetSource= kCharsetFromByteOrderMark;
      }
      break;

    case 0xFF:
      if(0xFE==aBytes[1]) {
        // FF FE
        // UTF-16, little-endian
        oCharset.AssignWithConversion(UCS2_LE);
        oCharsetSource= kCharsetFromByteOrderMark;
      }
      break;

    // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
    //   We do not care about EBCDIC here....
    // }
    // break;
  } // switch

  return oCharset.Length() > 0;
}
|
|
|
|
typedef struct {
|
|
PRBool mNeedCheckFirst4Bytes;
|
|
nsParser* mParser;
|
|
nsIParserFilter* mParserFilter;
|
|
nsScanner* mScanner;
|
|
} ParserWriteStruct;
|
|
|
|
/*
|
|
* This function is invoked as a result of a call to a stream's
|
|
* ReadSegments() method. It is called for each contiguous buffer
|
|
* of data in the underlying stream or pipe. Using ReadSegments
|
|
* allows us to avoid copying data to read out of the stream.
|
|
*/
|
|
static NS_METHOD
|
|
ParserWriteFunc(nsIInputStream* in,
|
|
void* closure,
|
|
const char* fromRawSegment,
|
|
PRUint32 toOffset,
|
|
PRUint32 count,
|
|
PRUint32 *writeCount)
|
|
{
|
|
nsresult result;
|
|
ParserWriteStruct* pws = NS_STATIC_CAST(ParserWriteStruct*, closure);
|
|
const char* buf = fromRawSegment;
|
|
PRUint32 theNumRead = count;
|
|
|
|
if (!pws) {
|
|
return NS_ERROR_FAILURE;
|
|
}
|
|
|
|
if(pws->mNeedCheckFirst4Bytes && (count >= 4)) {
|
|
nsCharsetSource guessSource;
|
|
nsAutoString guess;
|
|
|
|
pws->mNeedCheckFirst4Bytes = PR_FALSE;
|
|
if(detectByteOrderMark((const unsigned char*)buf,
|
|
theNumRead, guess, guessSource))
|
|
{
|
|
#ifdef DEBUG_XMLENCODING
|
|
printf("xmlencoding detect- %s\n", guess.ToNewCString());
|
|
#endif
|
|
pws->mParser->SetDocumentCharset(guess, guessSource);
|
|
}
|
|
}
|
|
|
|
if(pws->mParserFilter)
|
|
pws->mParserFilter->RawBuffer(buf, &theNumRead);
|
|
|
|
result = pws->mScanner->Append(buf, theNumRead);
|
|
if (NS_SUCCEEDED(result)) {
|
|
*writeCount = count;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
*
|
|
*
|
|
* @update gess 1/4/99
|
|
* @param pIStream contains the input chars
|
|
* @param length is the number of bytes waiting input
|
|
* @return error code (usually 0)
|
|
*/
|
|
|
|
nsresult nsParser::OnDataAvailable(nsIRequest *request, nsISupports* aContext,
|
|
nsIInputStream *pIStream, PRUint32 sourceOffset, PRUint32 aLength)
|
|
{
|
|
|
|
|
|
NS_PRECONDITION(((eOnStart==mParserContext->mStreamListenerState)||(eOnDataAvail==mParserContext->mStreamListenerState)),kOnStartNotCalled);
|
|
|
|
nsresult result=NS_OK;
|
|
|
|
CParserContext *theContext=mParserContext;
|
|
|
|
while(theContext) {
|
|
if(theContext->mRequest!=request && theContext->mPrevContext)
|
|
theContext=theContext->mPrevContext;
|
|
else break;
|
|
}
|
|
|
|
if(theContext && theContext->mRequest==request) {
|
|
|
|
theContext->mStreamListenerState=eOnDataAvail;
|
|
|
|
if(eInvalidDetect==theContext->mAutoDetectStatus) {
|
|
if(theContext->mScanner) {
|
|
nsReadingIterator<PRUnichar> iter;
|
|
theContext->mScanner->EndReading(iter);
|
|
theContext->mScanner->SetPosition(iter, PR_TRUE);
|
|
}
|
|
}
|
|
|
|
PRUint32 totalRead;
|
|
ParserWriteStruct pws;
|
|
pws.mNeedCheckFirst4Bytes =
|
|
((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection));
|
|
pws.mParser = this;
|
|
pws.mParserFilter = mParserFilter;
|
|
pws.mScanner = theContext->mScanner;
|
|
|
|
result = pIStream->ReadSegments(ParserWriteFunc, (void*)&pws, aLength, &totalRead);
|
|
if (NS_FAILED(result)) {
|
|
return result;
|
|
}
|
|
|
|
result=ResumeParse();
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* This is called by the networking library once the last block of data
|
|
* has been collected from the net.
|
|
*
|
|
* @update gess 04/01/99
|
|
* @param
|
|
* @return
|
|
*/
|
|
nsresult nsParser::OnStopRequest(nsIRequest *request, nsISupports* aContext,
|
|
nsresult status)
|
|
{
|
|
|
|
nsresult result=NS_OK;
|
|
|
|
if(eOnStart==mParserContext->mStreamListenerState) {
|
|
|
|
//If you're here, then OnDataAvailable() never got called.
|
|
//Prior to necko, we never dealt with this case, but the problem may have existed.
|
|
//What we'll do (for now at least) is construct a blank HTML document.
|
|
nsAutoString temp; temp.AssignWithConversion("<html><body></body></html>");
|
|
mParserContext->mScanner->Append(temp);
|
|
result=ResumeParse(PR_TRUE,PR_TRUE);
|
|
}
|
|
|
|
mParserContext->mStreamListenerState=eOnStop;
|
|
mStreamStatus=status;
|
|
|
|
if(mParserFilter)
|
|
mParserFilter->Finish();
|
|
|
|
mParserContext->mScanner->SetIncremental(PR_FALSE);
|
|
result=ResumeParse(PR_TRUE,PR_TRUE);
|
|
|
|
// If the parser isn't enabled, we don't finish parsing till
|
|
// it is reenabled.
|
|
|
|
|
|
// XXX Should we wait to notify our observers as well if the
|
|
// parser isn't yet enabled?
|
|
if (nsnull != mObserver) {
|
|
mObserver->OnStopRequest(request, aContext, status);
|
|
}
|
|
|
|
#ifdef rickgdebug
|
|
if(gOutFile){
|
|
gOutFile->close();
|
|
delete gOutFile;
|
|
gOutFile=0;
|
|
}
|
|
#endif
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/*******************************************************************
|
|
Here come the tokenization methods...
|
|
*******************************************************************/
|
|
|
|
|
|
/**
|
|
* Part of the code sandwich, this gets called right before
|
|
* the tokenization process begins. The main reason for
|
|
* this call is to allow the delegate to do initialization.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param
|
|
* @return TRUE if it's ok to proceed
|
|
*/
|
|
PRBool nsParser::WillTokenize(PRBool aIsFinalChunk){
|
|
nsITokenizer* theTokenizer=0;
|
|
nsresult result=mParserContext->mDTD->GetTokenizer(theTokenizer);
|
|
if (theTokenizer) {
|
|
result = theTokenizer->WillTokenize(aIsFinalChunk,&mTokenAllocator);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
|
|
* This is the primary control routine to consume tokens.
|
|
* It iteratively consumes tokens until an error occurs or
|
|
* you run out of data.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @return error code -- 0 if ok, non-zero if error.
|
|
*/
|
|
nsresult nsParser::Tokenize(PRBool aIsFinalChunk){
|
|
|
|
++mMajorIteration;
|
|
|
|
nsITokenizer* theTokenizer=0;
|
|
nsresult result=mParserContext->mDTD->GetTokenizer(theTokenizer);
|
|
|
|
if(theTokenizer){
|
|
PRBool flushTokens=PR_FALSE;
|
|
|
|
MOZ_TIMER_START(mTokenizeTime);
|
|
|
|
WillTokenize(aIsFinalChunk);
|
|
while(NS_SUCCEEDED(result)) {
|
|
mParserContext->mScanner->Mark();
|
|
++mMinorIteration;
|
|
result=theTokenizer->ConsumeToken(*mParserContext->mScanner,flushTokens);
|
|
if(NS_FAILED(result)) {
|
|
mParserContext->mScanner->RewindToMark();
|
|
if(kEOF==result){
|
|
break;
|
|
}
|
|
else if(NS_ERROR_HTMLPARSER_STOPPARSING==result) {
|
|
result=Terminate();
|
|
break;
|
|
}
|
|
}
|
|
else if(flushTokens && mObserversEnabled) {
|
|
// I added the extra test of mObserversEnabled to fix Bug# 23931.
|
|
// Flush tokens on seeing </SCRIPT> -- Ref: Bug# 22485 --
|
|
// Also remember to update the marked position.
|
|
mParserContext->mScanner->Mark();
|
|
break;
|
|
}
|
|
}
|
|
DidTokenize(aIsFinalChunk);
|
|
|
|
MOZ_TIMER_STOP(mTokenizeTime);
|
|
}
|
|
else{
|
|
result=mInternalState=NS_ERROR_HTMLPARSER_BADTOKENIZER;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* This is the tail-end of the code sandwich for the
|
|
* tokenization process. It gets called once tokenziation
|
|
* has completed for each phase.
|
|
*
|
|
* @update gess 01/04/99
|
|
* @param
|
|
* @return TRUE if all went well
|
|
*/
|
|
PRBool nsParser::DidTokenize(PRBool aIsFinalChunk){
|
|
PRBool result=PR_TRUE;
|
|
|
|
nsITokenizer* theTokenizer=0;
|
|
nsresult rv=mParserContext->mDTD->GetTokenizer(theTokenizer);
|
|
|
|
if (NS_SUCCEEDED(rv) && theTokenizer) {
|
|
result = theTokenizer->DidTokenize(aIsFinalChunk);
|
|
if(mTokenObserver) {
|
|
PRInt32 theCount=theTokenizer->GetCount();
|
|
PRInt32 theIndex;
|
|
for(theIndex=0;theIndex<theCount;theIndex++){
|
|
if((*mTokenObserver)(theTokenizer->GetTokenAt(theIndex))){
|
|
//add code here to pull unwanted tokens out of the stack...
|
|
}
|
|
}//for
|
|
}//if
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
 *  Debug helper: asks every token currently held by the tokenizer to
 *  dump its source to the given stream, in index order, stopping at
 *  the first index for which the tokenizer returns no token.
 *
 *  Nothing is written when there is no parser context, no DTD, or the
 *  DTD does not hand back a tokenizer.
 *
 *  @param   aStream -- the stream handed to each token's DebugDumpSource()
 */
void nsParser::DebugDumpSource(nsOutputStream& aStream) {
  PRInt32 theIndex=-1;

  nsITokenizer* theTokenizer=0;
  // A successful result code alone does not guarantee a tokenizer, so the
  // pointer is checked as well (as DidTokenize() does).
  if(mParserContext && mParserContext->mDTD &&
     NS_SUCCEEDED(mParserContext->mDTD->GetTokenizer(theTokenizer)) &&
     theTokenizer){
    CToken* theToken;
    while(nsnull != (theToken=theTokenizer->GetTokenAt(++theIndex))) {
      // theToken->DebugDumpToken(out);
      theToken->DebugDumpSource(aStream);
    }
  }
}
|
|
|
|
|
|
/**
|
|
* Call this to get a newly constructed tagstack
|
|
* @update gess 5/05/99
|
|
* @param aTagStack is an out parm that will contain your result
|
|
* @return NS_OK if successful, or NS_HTMLPARSER_MEMORY_ERROR on error
|
|
*/
|
|
nsresult nsParser::CreateTagStack(nsITagStack** aTagStack){
|
|
*aTagStack=new nsTagStack();
|
|
if(*aTagStack)
|
|
return NS_OK;
|
|
return NS_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
|
|
/**
|
|
* Get the DTD associated with this parser
|
|
* @update vidur 9/29/99
|
|
* @param aDTD out param that will contain the result
|
|
* @return NS_OK if successful, NS_ERROR_FAILURE for runtime error
|
|
*/
|
|
NS_IMETHODIMP
|
|
nsParser::GetDTD(nsIDTD** aDTD)
|
|
{
|
|
if (mParserContext) {
|
|
*aDTD = mParserContext->mDTD;
|
|
NS_IF_ADDREF(mParserContext->mDTD);
|
|
}
|
|
|
|
return NS_OK;
|
|
}
|
|
|
|
/**
|
|
* Get the observer service
|
|
*
|
|
* @update rickg 11/22/99
|
|
* @return ptr to server or NULL
|
|
*/
|
|
CObserverService* nsParser::GetObserverService(void) {
|
|
//XXX Hack! this should be XPCOM based!
|
|
if(mObserversEnabled)
|
|
return &mObserverService;
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* Store data into the bundle.
|
|
*
|
|
* @update harishd 05/10/00
|
|
* @param aData - The data to be stored.
|
|
* @return NS_OK if all went well else ERROR.
|
|
*/
|
|
NS_IMETHODIMP
|
|
nsParser::SetDataIntoBundle(const nsString& aKey,nsISupports* anObject) {
|
|
nsresult result=NS_OK;
|
|
if(!mBundle) {
|
|
mBundle = new nsParserBundle();
|
|
if(mBundle==nsnull) return NS_ERROR_OUT_OF_MEMORY;
|
|
NS_ADDREF(mBundle);
|
|
}
|
|
result=mBundle->SetDataIntoBundle(aKey,anObject);
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Retrieve data from the bundle by IID.
|
|
* NOTE: The object retireved should not be released
|
|
*
|
|
* @update harishd 05/10/00
|
|
* @param aIID - The ID to identify the correct object in the bundle
|
|
* @return Return object if found in bundle else return NULL.
|
|
*/
|
|
NS_IMETHODIMP
|
|
nsParser::GetDataFromBundle(const nsString& aKey,nsISupports** anObject) {
|
|
nsresult result=NS_OK;
|
|
result=mBundle->GetDataFromBundle(aKey,anObject);
|
|
return result;
|
|
}
|
|
|
|
|
|
// nsISupports implementation for nsParserBundle, exposing nsISupportsParserBundle.
NS_IMPL_ISUPPORTS1(nsParserBundle, nsISupportsParserBundle);
|
|
|
|
|
|
/**
|
|
* Release data from the Hash table
|
|
*
|
|
* @update harishd 05/10/00
|
|
*/
|
|
static PRBool PR_CALLBACK ReleaseData(nsHashKey* aKey, void* aData, void* aClosure) {
|
|
nsISupports* object = (nsISupports*)aData;
|
|
NS_RELEASE(object);
|
|
return PR_TRUE;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @update harishd 05/10/00
|
|
*/
|
|
nsParserBundle::nsParserBundle (){
|
|
NS_INIT_REFCNT();
|
|
mData=new nsHashtable(5);
|
|
}
|
|
|
|
/**
|
|
* Release objects from the bundle.
|
|
*
|
|
* @update harishd 05/10/00
|
|
*/
|
|
nsParserBundle::~nsParserBundle () {
|
|
mData->Enumerate(ReleaseData);
|
|
delete mData;
|
|
}
|
|
|
|
/**
|
|
* Store data into the bundle.
|
|
*
|
|
* @update harishd 05/10/00
|
|
* @param aData - The data to be stored.
|
|
* @return NS_OK if all went well else ERROR.
|
|
*/
|
|
NS_IMETHODIMP
|
|
nsParserBundle::SetDataIntoBundle(const nsString& aKey,nsISupports* anObject) {
|
|
nsresult result=NS_OK;
|
|
if(anObject) {
|
|
nsStringKey key(aKey);
|
|
PRBool found=mData->Exists(&key);
|
|
if(!found) {
|
|
NS_ADDREF(anObject);
|
|
mData->Put(&key,anObject);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Retrieve data from the bundle by IID.
|
|
* NOTE: The object retrieved should not be released.
|
|
*
|
|
* @update harishd 05/10/00
|
|
* @param aIID - The ID to identify the correct object in the bundle
|
|
* @return Return object if found in bundle else return NULL.
|
|
*/
|
|
NS_IMETHODIMP
|
|
nsParserBundle::GetDataFromBundle(const nsString& aKey,nsISupports** anObject) {
|
|
nsresult result=NS_OK;
|
|
|
|
nsStringKey key(aKey);
|
|
*anObject=(mData)? (nsISupports*)mData->Get(&key):nsnull;
|
|
|
|
if(*anObject) {
|
|
NS_ADDREF(*anObject);
|
|
}
|
|
else{
|
|
result=NS_ERROR_NULL_POINTER;
|
|
}
|
|
|
|
return result;
|
|
}
|