Mozilla/mozilla/parser/htmlparser/src/CParserContext.cpp
rickg%netscape.com 0d6131fe26 enabling Strict DTD; r=harishd.
git-svn-id: svn://10.0.0.236/trunk@66897 18797224-902f-48f8-a5cc-f745e15eee43
2000-04-23 03:28:18 +00:00

328 lines
11 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
#include "CParserContext.h"
#include "nsToken.h"
#include "prenv.h"
MOZ_DECL_CTOR_COUNTER(CParserContext);
/**
 * Primary constructor; the parser is the only intended caller.
 *
 * The listener and the DTD are reference counted here (NS_IF_ADDREF).
 * The scanner pointer is stored as-is and is deleted by the destructor,
 * so the context takes ownership of it.
 *
 * @update  gess7/23/98
 * @param   aScanner    -- scanner for this context; deleted with the context
 * @param   aKey        -- opaque key, stored in mKey
 * @param   aCommand    -- parser command, stored in mParserCommand
 * @param   aListener   -- stream observer (may be null); AddRef'd if non-null
 * @param   aDTD        -- DTD (may be null); AddRef'd if non-null
 * @param   aStatus     -- initial autodetect status
 * @param   aCopyUnused -- stored in mCopyUnused
 */
CParserContext::CParserContext(nsScanner* aScanner,
                               void *aKey,
                               eParserCommands aCommand,
                               nsIStreamObserver* aListener,
                               nsIDTD *aDTD,
                               eAutoDetectResult aStatus,
                               PRBool aCopyUnused)
{
  MOZ_COUNT_CTOR(CParserContext);

  mScanner=aScanner;
  mKey=aKey;
  mPrevContext=0;

  mListener=aListener;
  NS_IF_ADDREF(mListener);

  mDTD=aDTD;
  NS_IF_ADDREF(mDTD);

  mParseMode=eParseMode_unknown;
  mAutoDetectStatus=aStatus;

  //No mimetype has been set yet, so start from the same default that
  //SetMimeType() falls back to. Without this mDocType is indeterminate
  //until SetMimeType() runs, because DetermineParseMode() assigns it on
  //some paths only.
  mDocType=ePlainText;

  mTransferBuffer=0;
  mTransferBufferSize=eTransferBufferSize;

  mParserEnabled=PR_TRUE;
  mStreamListenerState=eNone;
  mMultipart=PR_TRUE;
  mContextType=eCTNone;
  mCopyUnused=aCopyUnused;
  mParserCommand=aCommand;
  mChannel=0;
}
/**
 * Copy constructor.
 *
 * Copies the state of aContext, with these exceptions:
 *   - mPrevContext is reset to 0 (the copy is not linked to a previous context);
 *   - mTransferBufferSize is reset to eTransferBufferSize;
 *   - the mimetype is re-applied through SetMimeType(), which also
 *     recomputes mDocType.
 * The listener and the DTD each gain a reference.
 *
 * NOTE(review): mScanner and mTransferBuffer are copied as raw pointers,
 * while the destructor deletes both. If the source context and the copy are
 * both destroyed that would free them twice -- confirm that callers never
 * let that happen.
 *
 * @update  gess7/23/98
 * @param   aContext -- the context to copy from
 */
CParserContext::CParserContext(const CParserContext &aContext) : mMimeType() {
  MOZ_COUNT_CTOR(CParserContext);

  mScanner=aContext.mScanner;
  mKey=aContext.mKey;
  mPrevContext=0;

  mListener=aContext.mListener;
  NS_IF_ADDREF(mListener);

  mDTD=aContext.mDTD;
  NS_IF_ADDREF(mDTD);

  mParseMode=aContext.mParseMode;
  mAutoDetectStatus=aContext.mAutoDetectStatus;

  mTransferBuffer=aContext.mTransferBuffer;
  mTransferBufferSize=eTransferBufferSize;

  mParserEnabled=aContext.mParserEnabled;
  mStreamListenerState=aContext.mStreamListenerState;
  mMultipart=aContext.mMultipart;
  mContextType=aContext.mContextType;
  mCopyUnused=aContext.mCopyUnused;  //was left uninitialized in the copy
  mChannel=aContext.mChannel;
  mParserCommand=aContext.mParserCommand;

  SetMimeType(aContext.mMimeType);
}
/**
 * Destructor for parser context.
 *
 * Deletes the scanner and the transfer buffer, and drops the references the
 * constructors took on the DTD and on the listener.
 *
 * NOTE: the DTD is reference counted -- never delete it directly here,
 *       only release this context's reference.
 * @update  gess7/11/98
 */
CParserContext::~CParserContext(){
  MOZ_COUNT_DTOR(CParserContext);

  //delete/delete[] on a null pointer is a no-op, so no guards are needed.
  delete mScanner;
  delete [] mTransferBuffer;

  NS_IF_RELEASE(mDTD);

  //Both constructors AddRef the listener; balance that here so the
  //observer is not leaked.
  NS_IF_RELEASE(mListener);

  //Remember that it's ok to simply ignore the PrevContext.
}
/**
 * Records the mimetype for this context and derives mDocType from it:
 * the HTML content type maps to eHTML4Text; the XML, XUL, RDF and XIF
 * content types all map to eXMLText; anything else is ePlainText.
 *
 * @update  rickg 03.18.2000
 * @param   aMimeType -- the mimetype to remember
 */
void CParserContext::SetMimeType(const nsString& aMimeType){
  mMimeType.Assign(aMimeType);

  if(mMimeType.EqualsWithConversion(kHTMLTextContentType)) {
    mDocType=eHTML4Text;
    return;
  }

  //Every XML-flavored content type is treated the same way.
  if(mMimeType.EqualsWithConversion(kXMLTextContentType) ||
     mMimeType.EqualsWithConversion(kXULTextContentType) ||
     mMimeType.EqualsWithConversion(kRDFTextContentType) ||
     mMimeType.EqualsWithConversion(kXIFTextContentType)) {
    mDocType=eXMLText;
  }
  else {
    mDocType=ePlainText;
  }
}
/*************************************************************************************************
First, let's define our modalities:
1. compatibility-mode: behave as much like nav4 as possible (unless it's too broken to bother)
2. standard-mode: do html as well as you can per spec, and throw out navigator quirks
3. strict-mode: adhere to the strict DTD specification to the highest degree possible
Assume the doctype is in the following form:
<!DOCTYPE [Top Level Element] [Availability] "[Registration]// [Owner-ID] // [Type] [desc-text] // [Language]" "URI|text-identifier">
[HTML] [PUBLIC|...] [+|-] [W3C|IETF|...] [DTD] "..." [EN|...] "..."
Here are the new rules for DTD handling; comments welcome:
XHTML and XML documents are always strict-mode:
example: <!DOCTYPE \"-//W3C//DTD XHTML 1.0 Strict//EN\">
HTML strict dtd's enable strict-mode:
example: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
example: <!DOCTYPE \"ISO/IEC 15445:1999//DTD HTML//EN\">
HTML 4.0 (or greater) transitional, frameset, (etc), without URI enables compatibility-mode:
example: <!DOCTYPE \"-//W3C//DTD HTML 4.01 Transitional//EN\">
HTML 4.0 (or greater) transitional, frameset, (etc), with a URI that points to the strict.dtd will become strict:
example: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/strict.dtd">
doctypes with systemID's or internal subset are handled in strict-mode:
example: <!DOCTYPE HTML PUBLIC PublicID SystemID>
example: <!DOCTYPE HTML SYSTEM SystemID>
example: <!DOCTYPE HTML (PUBLIC PublicID SystemID? | SYSTEM SystemID) [ Internal-SS ]>
All other doctypes (<4.0), and documents without a doctype are handled in compatibility-mode.
*****************************************************************************************************/
/**
 * This is called when it's time to find out what mode the parser/DTD
 * should run in for this document. (Each parser context can have its
 * own mode.)
 *
 * The decision is made by sniffing the doctype declaration near the start
 * of the buffer. As a side effect mParseMode is always updated, and
 * mDocType is updated on the paths that recognize the document flavor.
 * Setting the PARSE_MODE environment variable to "strict" (compared
 * case-insensitively) forces strict mode.
 *
 * @update  gess 02/17/00
 * @param   theBuffer -- text from the start of the document
 * @return  parsermode (defined in nsIParser.h); eParseMode_quirks when
 *          nothing in the buffer or environment selects another mode
 */
eParseMode CParserContext::DetermineParseMode(const nsString& theBuffer) {
  const char* theModeStr= PR_GetEnv("PARSE_MODE");

  mParseMode = eParseMode_unknown;

  PRInt32 theIndex=theBuffer.Find("DOCTYPE",PR_TRUE,0,10);
  if(kNotFound<theIndex) {

    //good, we found "DOCTYPE" -- now go find its end delimiter '>'
    //note that if we don't find '>', then we just scan the first 512 bytes.
    PRInt32 theGTPos=theBuffer.FindChar(kGreaterThan,theIndex+1);
    PRInt32 theEnd=(kNotFound==theGTPos) ? 512 : MinInt(512,theGTPos);

    //skip to the type and desc-text...
    PRInt32 theSubIndex=theBuffer.Find("//DTD",PR_TRUE,theIndex+8,theEnd-(theIndex+8));
    PRInt32 theErr=0;
    PRInt32 theMajorVersion=3;

    if(0<=theSubIndex) {
      PRInt32 theStartPos=theSubIndex+5;   //just past "//DTD"
      PRInt32 theCount=theEnd-theStartPos;

      theSubIndex=theBuffer.Find("XHTML",PR_TRUE,theStartPos,theCount);
      if(0<=theSubIndex) {
        //XHTML is always strict; nothing else to look at.
        mDocType=eXHTMLText;
        mParseMode=eParseMode_strict;
        return mParseMode;
      }

      theSubIndex=theBuffer.Find("ISO/IEC 15445:",PR_TRUE,theIndex+8,theEnd-(theIndex+8));
      if(0<=theSubIndex) {
        mDocType=eHTML4Text;
        mParseMode=eParseMode_strict;
        theMajorVersion=4;
        theSubIndex+=15;
      }
      else {
        theSubIndex=theBuffer.Find("HTML",PR_TRUE,theStartPos,theCount);
        if(0<=theSubIndex) {
          mDocType=eHTML4Text;
          mParseMode=eParseMode_strict;
          theMajorVersion=3;
        }
        else {
          theSubIndex=theBuffer.Find("HYPERTEXT MARKUP",PR_TRUE,theStartPos,theCount);
          if(0<=theSubIndex) {
            mDocType=eHTML3Text;
            mParseMode=eParseMode_quirks;
            theSubIndex+=20;
          }
        }
      }

      theStartPos=theSubIndex+5;

      //get the next substring from the buffer, which should be a number.
      //now see what the version number is...
      nsAutoString theNum;
      PRInt32 theVersionPos=theBuffer.FindCharInSet("123456789",theStartPos);

      //Only trust a digit that sits inside the doctype itself; a digit
      //found past theEnd belongs to the document body, not to the
      //version, and must not override theMajorVersion chosen above.
      if((0<=theVersionPos) && (theVersionPos<theEnd)) {
        PRInt32 theTerminal=theBuffer.FindCharInSet(" />",theVersionPos+1);
        if(kNotFound<theTerminal) {
          theBuffer.Mid(theNum,theVersionPos,theTerminal-theVersionPos);
        }
        else {
          //no terminator at all: just grab a few characters.
          theBuffer.Mid(theNum,theVersionPos,3);
        }
        theMajorVersion=theNum.ToInteger(&theErr);
        theStartPos=theVersionPos+theNum.Length();
      }
      //else: no version present; keep scanning from where we already were
      //rather than from a "not found" position.

      //now see what the flavor of the DTD is...
      theCount=theEnd-theStartPos;
      if((theBuffer.Find("TRANSITIONAL",PR_TRUE,theStartPos,theCount)>kNotFound)||
         (theBuffer.Find("LOOSE",PR_TRUE,theStartPos,theCount)>kNotFound)       ||
         (theBuffer.Find("FRAMESET",PR_TRUE,theStartPos,theCount)>kNotFound)    ||
         (theBuffer.Find("LATIN1", PR_TRUE,theStartPos,theCount) >kNotFound)    ||
         (theBuffer.Find("SYMBOLS",PR_TRUE,theStartPos,theCount) >kNotFound)    ||
         (theBuffer.Find("SPECIAL",PR_TRUE,theStartPos,theCount) >kNotFound)) {
        mParseMode=eParseMode_quirks;
      }

      //one last thing: look for a URI that specifies the strict.dtd
      theStartPos+=6;
      theCount=theEnd-theStartPos;
      theSubIndex=theBuffer.Find("STRICT.DTD",PR_TRUE,theStartPos,theCount);
      if(0<theSubIndex) {
        //Since we found it, regardless of what's in the descr-text, kick into strict mode.
        mParseMode=eParseMode_strict;
        mDocType=eHTML4Text;
      }

      if (0==theErr){
        switch(theMajorVersion) {
          case 0: case 1: case 2: case 3:
            if(mDocType!=eXHTMLText){
              mParseMode=eParseMode_quirks; //be as backward compatible as possible
              mDocType=eHTML3Text;
            }
            break;

          default:
            //XXX hack -- someday, the next line of code will be criticized
            //for its lack of vision...
            if(theMajorVersion>20) {
              mParseMode=eParseMode_noquirks;
            }
            break;
        } //switch
      }

    } //if
    else {
      //No "//DTD" in the doctype: fall back to looking for ID keywords.
      PRInt32 thePos=theBuffer.Find("HTML",PR_TRUE,1,50);
      if(kNotFound!=thePos) {
        mDocType=eHTML4Text;

        //Find() takes (string, ignoreCase, offset, count) -- see the calls
        //above -- so the offset has to go in the third slot. Passing it
        //second made it the ignore-case flag and dropped the offset.
        PRInt32 theIDPos=theBuffer.Find("PublicID",PR_TRUE,thePos);
        if(kNotFound==theIDPos)
          theIDPos=theBuffer.Find("SystemID",PR_TRUE,thePos);
        mParseMode=(kNotFound==theIDPos) ? eParseMode_quirks : eParseMode_strict;
      }
    }
  }
  else if(kNotFound<theBuffer.Find("?XML",PR_TRUE,0,128)) {
    mParseMode=eParseMode_strict;
  }

  //The environment can force strict mode...
  if(theModeStr && (0==nsCRT::strcasecmp(theModeStr,"strict"))) {
    mParseMode=eParseMode_strict;
  }

  //...and whatever is still undecided becomes quirks. This default is
  //applied even when PARSE_MODE is set to something other than "strict",
  //so the caller never gets eParseMode_unknown back.
  if(eParseMode_unknown==mParseMode) {
    mParseMode=eParseMode_quirks;
  }

  return mParseMode;
}