Mozilla/mozilla/htmlparser/src/nsExpatTokenizer.cpp
harishd%netscape.com f8e87704a2 Propagating NS_ERROR_HTMLPARSER_STOPPARSING on XML error.
git-svn-id: svn://10.0.0.236/trunk@34681 18797224-902f-48f8-a5cc-f745e15eee43
1999-06-10 23:23:50 +00:00

460 lines
16 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
* The contents of this file are subject to the Netscape Public License
* Version 1.0 (the "NPL"); you may not use this file except in
* compliance with the NPL. You may obtain a copy of the NPL at
* http://www.mozilla.org/NPL/
*
* Software distributed under the NPL is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
* for the specific language governing rights and limitations under the
* NPL.
*
* The Initial Developer of this code under the NPL is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
* Reserved.
*/
/**
* MODULE NOTES:
* @update gess 4/1/98
*
*/
#include "nsExpatTokenizer.h"
#include "nsScanner.h"
#include "nsDTDUtils.h"
#include "nsParserError.h"
#include "nsIParser.h"
#include "prlog.h"
/************************************************************************
And now for the main class -- nsExpatTokenizer...
************************************************************************/
static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
static NS_DEFINE_IID(kITokenizerIID, NS_ITOKENIZER_IID);
static NS_DEFINE_IID(kHTMLTokenizerIID, NS_HTMLTOKENIZER_IID);
static NS_DEFINE_IID(kClassIID, NS_EXPATTOKENIZER_IID);
static CTokenRecycler* gTokenRecycler=0;
static nsDeque* gTokenDeque=0;
static XML_Parser gExpatParser=0;
/**
* This method gets called as part of our COM-like interfaces.
* Its purpose is to create an interface to parser object
* of some type.
*
* @update gess 4/8/98
* @param nsIID id of object to discover
* @param aInstancePtr ptr to newly discovered interface
* @return NS_xxx result code
*/
nsresult nsExpatTokenizer::QueryInterface(const nsIID& aIID, void** aInstancePtr)
{
if (NULL == aInstancePtr) {
return NS_ERROR_NULL_POINTER;
}
if(aIID.Equals(kISupportsIID)) { //do IUnknown...
*aInstancePtr = (nsExpatTokenizer*)(this);
}
else if(aIID.Equals(kITokenizerIID)) { //do ITokenizer base class...
*aInstancePtr = (nsITokenizer*)(this);
}
else if(aIID.Equals(kHTMLTokenizerIID)) { //do nsHTMLTokenizer base class...
*aInstancePtr = (nsHTMLTokenizer*)(this);
}
else if(aIID.Equals(kClassIID)) { //do this class...
*aInstancePtr = (nsExpatTokenizer*)(this);
}
else {
*aInstancePtr=0;
return NS_NOINTERFACE;
}
NS_ADDREF_THIS();
return NS_OK;
}
/**
* This method is defined in nsIParser. It is used to
* cause the COM-like construction of an nsParser.
*
* @update gess 4/8/98
* @param nsIParser** ptr to newly instantiated parser
* @return NS_xxx error result
*/
NS_HTMLPARS nsresult NS_New_Expat_Tokenizer(nsIDTD** aInstancePtrResult) {
nsExpatTokenizer* it = new nsExpatTokenizer();
if (it == 0) {
return NS_ERROR_OUT_OF_MEMORY;
}
return it->QueryInterface(kClassIID, (void **) aInstancePtrResult);
}
NS_IMPL_ADDREF(nsExpatTokenizer)
NS_IMPL_RELEASE(nsExpatTokenizer)
/**
* Sets up the callbacks for the expat parser
* @update nra 2/24/99
* @param none
* @return none
*/
void nsExpatTokenizer::SetupExpatCallbacks(void) {
if (mExpatParser) {
XML_SetElementHandler(mExpatParser, HandleStartElement, HandleEndElement);
XML_SetCharacterDataHandler(mExpatParser, HandleCharacterData);
XML_SetProcessingInstructionHandler(mExpatParser, HandleProcessingInstruction);
XML_SetDefaultHandler(mExpatParser, nsnull);
XML_SetUnparsedEntityDeclHandler(mExpatParser, HandleUnparsedEntityDecl);
XML_SetNotationDeclHandler(mExpatParser, HandleNotationDecl);
XML_SetExternalEntityRefHandler(mExpatParser, HandleExternalEntityRef);
XML_SetUnknownEncodingHandler(mExpatParser, HandleUnknownEncoding, NULL);
}
}
/**
* Default constructor
*
* @update gess 4/9/98
* @param
* @return
*/
nsExpatTokenizer::nsExpatTokenizer() : nsHTMLTokenizer() {
NS_INIT_REFCNT();
mBytesParsed = 0;
mSeenError = PR_FALSE;
nsAutoString buffer("UTF-16");
const PRUnichar* encoding = buffer.GetUnicode();
if (encoding) {
mExpatParser = XML_ParserCreate((const XML_Char*) encoding);
gTokenRecycler=(CTokenRecycler*)GetTokenRecycler();
if (mExpatParser) {
SetupExpatCallbacks();
}
}
}
/**
* Destructor
*
* @update gess 4/9/98
* @param
* @return
*/
nsExpatTokenizer::~nsExpatTokenizer(){
if (mExpatParser) {
XML_ParserFree(mExpatParser);
mExpatParser = nsnull;
}
}
/*******************************************************************
Here begins the real working methods for the tokenizer.
*******************************************************************/
void nsExpatTokenizer::SetErrorContextInfo(nsParserError* aError, PRUint32 aByteIndex,
const char* aSourceBuffer, PRUint32 aLength)
{
/* Figure out the substring inside aSourceBuffer that contains the line on which the error
occurred. Copy the line into aError->sourceLine */
PR_ASSERT(aByteIndex > 0 && aByteIndex < aLength);
/* Assert that the byteIndex and the length of the buffer is even */
PR_ASSERT(aByteIndex % 2 == 0 && aLength % 2 == 0);
PRUnichar* start = (PRUnichar* ) &aSourceBuffer[aByteIndex]; /* Will try to find the start of the line */
PRUnichar* end = (PRUnichar* ) &aSourceBuffer[aByteIndex]; /* Will try to find the end of the line */
PRUint32 startIndex = aByteIndex / 2; /* Track the position of the 'start' pointer into the buffer */
PRUint32 endIndex = aByteIndex / 2; /* Track the position of the 'end' pointer into the buffer */
PRUint32 numCharsInBuffer = aLength / 2;
PRBool reachedStart;
PRBool reachedEnd;
/* Use start to find the first new line before the error position and
end to find the first new line after the error position */
reachedStart = ('\n' == *start || '\r' == *start || startIndex <= 0);
reachedEnd = ('\n' == *end || '\r' == *end || endIndex >= numCharsInBuffer);
while (!reachedStart || !reachedEnd) {
if (!reachedStart) {
start--;
startIndex--;
reachedStart = ('\n' == *start || '\r' == *start || startIndex <= 0);
}
if (!reachedEnd) {
end++;
endIndex++;
reachedEnd = ('\n' == *end || '\r' == *end || endIndex >= numCharsInBuffer);
}
}
if (startIndex == endIndex) {
/* Special case if the error is on a line where the only character is a newline */
aError->sourceLine.Append("");
}
else {
PR_ASSERT(endIndex - startIndex >= 2);
/* At this point, there are two cases. Either the error is on the first line or
on subsequent lines. If the error is on the first line, startIndex will decrement
all the way to zero. If not, startIndex will decrement to the position of the
newline character on the previous line. So, in the first case, the start position
of the error line = startIndex (== 0). In the second case, the start position of the
error line = startIndex + 1. In both cases, the end position of the error line will be
(endIndex - 1). */
PRUint32 startPosn = (startIndex <= 0) ? startIndex : startIndex + 1;
/* At this point, the substring starting at startPosn and ending at (endIndex - 1),
is the line on which the error occurred. Copy that substring into the error structure. */
const PRUnichar* unicodeBuffer = (const PRUnichar*) aSourceBuffer;
aError->sourceLine.Append(&unicodeBuffer[startPosn], endIndex - startPosn);
}
}
/*
* Called immediately after an error has occurred in expat. Creates
* an error token and pushes it onto the token queue.
*
*/
void nsExpatTokenizer::PushXMLErrorToken(const char *aBuffer, PRUint32 aLength)
{
CErrorToken* token= (CErrorToken *) gTokenRecycler->CreateTokenOfType(eToken_error, eHTMLTag_unknown);
nsParserError *error = new nsParserError;
PRUint32 byteIndexRelativeToFile = 0;
if(error){
error->code = XML_GetErrorCode(mExpatParser);
error->lineNumber = XML_GetCurrentLineNumber(mExpatParser);
error->colNumber = XML_GetCurrentColumnNumber(mExpatParser);
error->description = XML_ErrorString(error->code);
byteIndexRelativeToFile = XML_GetCurrentByteIndex(mExpatParser);
SetErrorContextInfo(error, (byteIndexRelativeToFile - mBytesParsed), aBuffer, aLength);
token->SetError(error);
CToken* theToken = (CToken* )token;
AddToken(theToken, NS_OK, *gTokenDeque,gTokenRecycler);
}
}
nsresult nsExpatTokenizer::ParseXMLBuffer(const char* aBuffer, PRUint32 aLength){
nsresult result=NS_OK;
if (mExpatParser) {
if (!XML_Parse(mExpatParser, aBuffer, aLength, PR_FALSE)) {
PushXMLErrorToken(aBuffer, aLength);
result=NS_ERROR_HTMLPARSER_STOPPARSING;
}
mBytesParsed += aLength;
}
else {
result = NS_ERROR_FAILURE;
}
return result;
}
/**
* This method repeatedly called by the tokenizer.
* Each time, we determine the kind of token were about to
* read, and then we call the appropriate method to handle
* that token type.
*
* @update gess 3/25/98
* @param aChar: last char read
* @param aScanner: see nsScanner.h
* @param anErrorCode: arg that will hold error condition
* @return new token or null
*/
nsresult nsExpatTokenizer::ConsumeToken(nsScanner& aScanner) {
// return nsHTMLTokenizer::ConsumeToken(aScanner);
// Ask the scanner to send us all the data it has
// scanned and pass that data to expat.
nsresult result = NS_OK;
nsString& theBuffer = aScanner.GetBuffer();
PRInt32 length = theBuffer.Length();
if(0 < length) {
const PRUnichar* expatBuffer = theBuffer.GetUnicode();
PRUint32 bufLength = theBuffer.Length() * 2;
if (expatBuffer) {
gTokenDeque=&mTokenDeque;
gExpatParser = mExpatParser;
result = ParseXMLBuffer((const char *)expatBuffer, bufLength);
}
theBuffer.Truncate(0);
}
if(NS_OK==result)
result=aScanner.Eof();
return result;
}
/**
*
* @update gess12/29/98
* @param
* @return
*/
void nsExpatTokenizer::FrontloadMisplacedContent(nsDeque& aDeque){
}
/***************************************/
/* Expat Callback Functions start here */
/***************************************/
void nsExpatTokenizer::HandleStartElement(void *userData, const XML_Char *name, const XML_Char **atts){
CToken* theToken=gTokenRecycler->CreateTokenOfType(eToken_start,eHTMLTag_unknown);
if(theToken) {
nsString& theString=theToken->GetStringValueXXX();
theString.SetString((PRUnichar *) name);
AddToken(theToken,NS_OK,*gTokenDeque,gTokenRecycler);
int theAttrCount=0;
while(*atts){
theAttrCount++;
CAttributeToken* theAttrToken= (CAttributeToken*)gTokenRecycler->CreateTokenOfType(eToken_attribute,eHTMLTag_unknown);
if(theAttrToken){
nsString& theKey=theAttrToken->GetKey();
theKey.SetString((PRUnichar *) (*atts++));
nsString& theValue=theAttrToken->GetStringValueXXX();
theValue.SetString((PRUnichar *) (*atts++));
}
CToken* theTok=(CToken*)theAttrToken;
AddToken(theTok,NS_OK,*gTokenDeque,gTokenRecycler);
}
theToken->SetAttributeCount(theAttrCount);
}
else{
//THROW A HUGE ERROR IF WE CANT CREATE A TOKEN!
}
}
void nsExpatTokenizer::HandleEndElement(void *userData, const XML_Char *name) {
CToken* theToken=gTokenRecycler->CreateTokenOfType(eToken_end,eHTMLTag_unknown);
if(theToken) {
nsString& theString=theToken->GetStringValueXXX();
theString.SetString((PRUnichar *) name);
AddToken(theToken,NS_OK,*gTokenDeque,gTokenRecycler);
}
else{
//THROW A HUGE ERROR IF WE CANT CREATE A TOKEN!
}
}
void nsExpatTokenizer::HandleCharacterData(void *userData, const XML_Char *s, int len) {
CCDATASectionToken* currentCDataToken = (CCDATASectionToken*) userData;
PRBool StartOfCDataSection = (!currentCDataToken && len == 0);
PRBool EndOfCDataSection = (currentCDataToken && len == 0);
// Either create a new token (if not currently within a CDATA section) or add the
// current string from expat to the current CDATA token.
if (StartOfCDataSection) {
// Set up state so that we know that we are within a CDATA section.
currentCDataToken = (CCDATASectionToken*) gTokenRecycler->CreateTokenOfType(eToken_cdatasection,eHTMLTag_unknown);
XML_SetUserData(gExpatParser, (void *) currentCDataToken);
}
else if (EndOfCDataSection) {
// We've reached the end of the current CDATA section. Push the current CDATA token
// onto the token queue and reset state to being outside a CDATA section.
CToken* tempCDATAToken = (CToken*) currentCDataToken;
AddToken(tempCDATAToken,NS_OK,*gTokenDeque,gTokenRecycler);
currentCDataToken = 0;
XML_SetUserData(gExpatParser, 0);
}
else if (currentCDataToken) {
// While there exists a current CDATA token, keep appending all strings from expat into it.
nsString& theString = currentCDataToken->GetStringValueXXX();
theString.Append((PRUnichar *) s,len);
}
else {
CToken* newToken = 0;
switch(s[0]){
case kNewLine:
case CR:
newToken=gTokenRecycler->CreateTokenOfType(eToken_newline,eHTMLTag_unknown); break;
case kSpace:
case kTab:
newToken=gTokenRecycler->CreateTokenOfType(eToken_whitespace,eHTMLTag_unknown); break;
default:
newToken=gTokenRecycler->CreateTokenOfType(eToken_text,eHTMLTag_unknown);
}
if(newToken) {
if ((s[0] != kNewLine) && (s[0] != CR)) {
nsString& theString=newToken->GetStringValueXXX();
theString.Append((PRUnichar *) s,len);
}
AddToken(newToken,NS_OK,*gTokenDeque,gTokenRecycler);
}
else {
//THROW A HUGE ERROR IF WE CANT CREATE A TOKEN!
}
}
}
void nsExpatTokenizer::HandleProcessingInstruction(void *userData, const XML_Char *target, const XML_Char *data){
CToken* theToken=gTokenRecycler->CreateTokenOfType(eToken_instruction,eHTMLTag_unknown);
if(theToken) {
nsString& theString=theToken->GetStringValueXXX();
theString.Append("<?");
theString.Append((PRUnichar *) target);
if(data) {
theString.Append(" ");
theString.Append((PRUnichar *) data);
}
theString.Append("?>");
AddToken(theToken,NS_OK,*gTokenDeque,gTokenRecycler);
}
else{
//THROW A HUGE ERROR IF WE CANT CREATE A TOKEN!
}
}
void nsExpatTokenizer::HandleDefault(void *userData, const XML_Char *s, int len) {
// NS_NOTYETIMPLEMENTED("Error: nsExpatTokenizer::HandleDefault() not yet implemented.");
}
void nsExpatTokenizer::HandleUnparsedEntityDecl(void *userData,
const XML_Char *entityName,
const XML_Char *base,
const XML_Char *systemId,
const XML_Char *publicId,
const XML_Char *notationName) {
NS_NOTYETIMPLEMENTED("Error: nsExpatTokenizer::HandleUnparsedEntityDecl() not yet implemented.");
}
void nsExpatTokenizer::HandleNotationDecl(void *userData,
const XML_Char *notationName,
const XML_Char *base,
const XML_Char *systemId,
const XML_Char *publicId){
NS_NOTYETIMPLEMENTED("Error: nsExpatTokenizer::HandleNotationDecl() not yet implemented.");
}
int nsExpatTokenizer::HandleExternalEntityRef(XML_Parser parser,
const XML_Char *openEntityNames,
const XML_Char *base,
const XML_Char *systemId,
const XML_Char *publicId){
NS_NOTYETIMPLEMENTED("Error: nsExpatTokenizer::HandleExternalEntityRef() not yet implemented.");
int result=0;
return result;
}
int nsExpatTokenizer::HandleUnknownEncoding(void *encodingHandlerData,
const XML_Char *name,
XML_Encoding *info) {
NS_NOTYETIMPLEMENTED("Error: nsExpatTokenizer::HandleUnknownEncoding() not yet implemented.");
int result=0;
return result;
}