diff --git a/mozilla/htmlparser/src/nsParser.cpp b/mozilla/htmlparser/src/nsParser.cpp
index 15d063832d6..3c1cd105a04 100644
--- a/mozilla/htmlparser/src/nsParser.cpp
+++ b/mozilla/htmlparser/src/nsParser.cpp
@@ -107,6 +107,67 @@ public:
CSharedParserObjects gSharedParserObjects;
+//----------------------------------------
+
+#define NOT_USED 0xfffd
+
+static PRUint16 PA_HackTable[] = {
+ NOT_USED,
+ NOT_USED,
+ 0x201a, /* SINGLE LOW-9 QUOTATION MARK */
+ 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
+ 0x201e, /* DOUBLE LOW-9 QUOTATION MARK */
+ 0x2026, /* HORIZONTAL ELLIPSIS */
+ 0x2020, /* DAGGER */
+ 0x2021, /* DOUBLE DAGGER */
+ 0x02c6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
+ 0x2030, /* PER MILLE SIGN */
+ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
+ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
+ 0x0152, /* LATIN CAPITAL LIGATURE OE */
+ NOT_USED,
+ NOT_USED,
+ NOT_USED,
+
+ NOT_USED,
+ 0x2018, /* LEFT SINGLE QUOTATION MARK */
+ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
+ 0x201c, /* LEFT DOUBLE QUOTATION MARK */
+ 0x201d, /* RIGHT DOUBLE QUOTATION MARK */
+ 0x2022, /* BULLET */
+ 0x2013, /* EN DASH */
+ 0x2014, /* EM DASH */
+ 0x02dc, /* SMALL TILDE */
+ 0x2122, /* TRADE MARK SIGN */
+ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
+ 0x203a, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
+ 0x0153, /* LATIN SMALL LIGATURE OE */
+ NOT_USED,
+ NOT_USED,
+ 0x0178 /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
+};
+
+static PRUnichar gToUCS2[256];
+
+static void
+MakeConversionTable()
+{
+ static PRBool firstTime = PR_TRUE;
+ if (firstTime) {
+ firstTime = PR_FALSE;
+ PRUnichar* cp = gToUCS2;
+ PRInt32 i;
+ for (i = 0; i < 256; i++) {
+ *cp++ = PRUnichar(i);
+ }
+ cp = gToUCS2;
+ for (i = 0; i < 32; i++) {
+ cp[0x80 + i] = PA_HackTable[i];
+ }
+ }
+}
+
+//----------------------------------------
/**
* default constructor
@@ -122,6 +183,7 @@ nsParser::nsParser() {
mSink=0;
mParserContext=0;
mDTDVerification=PR_FALSE;
+ MakeConversionTable();
}
@@ -775,13 +837,31 @@ nsresult nsParser::OnDataAvailable(nsIURL* aURL, nsIInputStream *pIStream, PRInt
mParserContext->mTransferBuffer = new char[CParserContext::eTransferBufferSize+1];
while (len > 0) {
- nsresult rv = pIStream->Read(mParserContext->mTransferBuffer, 0, mParserContext->eTransferBufferSize, &len);
+ nsresult rv = pIStream->Read(mParserContext->mTransferBuffer, 0,
+ mParserContext->eTransferBufferSize, &len);
if((rv == NS_OK) && (len>0)) {
-
if(mParserFilter)
mParserFilter->RawBuffer(mParserContext->mTransferBuffer, &len);
- mParserContext->mScanner->Append(mParserContext->mTransferBuffer,len);
+ // XXX kipp was here: this is a temporary piece of code that
+ // fixes up the data in the transfer buffer so that the 8 bit
+ // ascii is mapped to ucs2 properly. The problem is that for the
+ // default character set, some web pages use illegal codes (0x80
+ // to 0x9f, inclusive); we already have code to map entities
+ // properly in this range. This code maps raw stream data the
+ // same way.
+ PRUnichar buf[CParserContext::eTransferBufferSize];
+ PRUnichar* dst = buf;
+ const PRUnichar* table = gToUCS2;
+ const char* src = mParserContext->mTransferBuffer;
+ const char* end = src + len;
+ while (src < end) {
+ unsigned char ch = *(unsigned char*)src;
+ *dst++ = table[ch];
+ src++;
+ }
+
+ mParserContext->mScanner->Append(buf, len);
if(eUnknownDetect==mParserContext->mAutoDetectStatus) {
if(eValidDetect==AutoDetectContentType(mParserContext->mScanner->GetBuffer(),mParserContext->mSourceType)) {
diff --git a/mozilla/htmlparser/src/nsScanner.cpp b/mozilla/htmlparser/src/nsScanner.cpp
index d177926054d..b3fc55b97b8 100644
--- a/mozilla/htmlparser/src/nsScanner.cpp
+++ b/mozilla/htmlparser/src/nsScanner.cpp
@@ -179,6 +179,12 @@ PRBool CScanner::Append(const char* aBuffer, PRInt32 aLen){
return PR_TRUE;
}
+PRBool CScanner::Append(const PRUnichar* aBuffer, PRInt32 aLen){
+ mBuffer.Append(aBuffer,aLen);
+ mTotalRead+=aLen;
+ return PR_TRUE;
+}
+
/**
* Grab data from underlying stream.
*
diff --git a/mozilla/htmlparser/src/nsScanner.h b/mozilla/htmlparser/src/nsScanner.h
index b1196da698b..5950f5122a1 100644
--- a/mozilla/htmlparser/src/nsScanner.h
+++ b/mozilla/htmlparser/src/nsScanner.h
@@ -232,6 +232,8 @@ class CScanner {
*/
PRBool Append(const char* aBuffer, PRInt32 aLen);
+ PRBool Append(const PRUnichar* aBuffer, PRInt32 aLen);
+
/**
*
*
diff --git a/mozilla/parser/htmlparser/src/nsParser.cpp b/mozilla/parser/htmlparser/src/nsParser.cpp
index 15d063832d6..3c1cd105a04 100644
--- a/mozilla/parser/htmlparser/src/nsParser.cpp
+++ b/mozilla/parser/htmlparser/src/nsParser.cpp
@@ -107,6 +107,67 @@ public:
CSharedParserObjects gSharedParserObjects;
+//----------------------------------------
+
+#define NOT_USED 0xfffd
+
+static PRUint16 PA_HackTable[] = {
+ NOT_USED,
+ NOT_USED,
+ 0x201a, /* SINGLE LOW-9 QUOTATION MARK */
+ 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
+ 0x201e, /* DOUBLE LOW-9 QUOTATION MARK */
+ 0x2026, /* HORIZONTAL ELLIPSIS */
+ 0x2020, /* DAGGER */
+ 0x2021, /* DOUBLE DAGGER */
+ 0x02c6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
+ 0x2030, /* PER MILLE SIGN */
+ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
+ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
+ 0x0152, /* LATIN CAPITAL LIGATURE OE */
+ NOT_USED,
+ NOT_USED,
+ NOT_USED,
+
+ NOT_USED,
+ 0x2018, /* LEFT SINGLE QUOTATION MARK */
+ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
+ 0x201c, /* LEFT DOUBLE QUOTATION MARK */
+ 0x201d, /* RIGHT DOUBLE QUOTATION MARK */
+ 0x2022, /* BULLET */
+ 0x2013, /* EN DASH */
+ 0x2014, /* EM DASH */
+ 0x02dc, /* SMALL TILDE */
+ 0x2122, /* TRADE MARK SIGN */
+ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
+ 0x203a, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
+ 0x0153, /* LATIN SMALL LIGATURE OE */
+ NOT_USED,
+ NOT_USED,
+ 0x0178 /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
+};
+
+static PRUnichar gToUCS2[256];
+
+static void
+MakeConversionTable()
+{
+ static PRBool firstTime = PR_TRUE;
+ if (firstTime) {
+ firstTime = PR_FALSE;
+ PRUnichar* cp = gToUCS2;
+ PRInt32 i;
+ for (i = 0; i < 256; i++) {
+ *cp++ = PRUnichar(i);
+ }
+ cp = gToUCS2;
+ for (i = 0; i < 32; i++) {
+ cp[0x80 + i] = PA_HackTable[i];
+ }
+ }
+}
+
+//----------------------------------------
/**
* default constructor
@@ -122,6 +183,7 @@ nsParser::nsParser() {
mSink=0;
mParserContext=0;
mDTDVerification=PR_FALSE;
+ MakeConversionTable();
}
@@ -775,13 +837,31 @@ nsresult nsParser::OnDataAvailable(nsIURL* aURL, nsIInputStream *pIStream, PRInt
mParserContext->mTransferBuffer = new char[CParserContext::eTransferBufferSize+1];
while (len > 0) {
- nsresult rv = pIStream->Read(mParserContext->mTransferBuffer, 0, mParserContext->eTransferBufferSize, &len);
+ nsresult rv = pIStream->Read(mParserContext->mTransferBuffer, 0,
+ mParserContext->eTransferBufferSize, &len);
if((rv == NS_OK) && (len>0)) {
-
if(mParserFilter)
mParserFilter->RawBuffer(mParserContext->mTransferBuffer, &len);
- mParserContext->mScanner->Append(mParserContext->mTransferBuffer,len);
+ // XXX kipp was here: this is a temporary piece of code that
+ // fixes up the data in the transfer buffer so that the 8 bit
+ // ascii is mapped to ucs2 properly. The problem is that for the
+ // default character set, some web pages use illegal codes (0x80
+ // to 0x9f, inclusive); we already have code to map entities
+ // properly in this range. This code maps raw stream data the
+ // same way.
+ PRUnichar buf[CParserContext::eTransferBufferSize];
+ PRUnichar* dst = buf;
+ const PRUnichar* table = gToUCS2;
+ const char* src = mParserContext->mTransferBuffer;
+ const char* end = src + len;
+ while (src < end) {
+ unsigned char ch = *(unsigned char*)src;
+ *dst++ = table[ch];
+ src++;
+ }
+
+ mParserContext->mScanner->Append(buf, len);
if(eUnknownDetect==mParserContext->mAutoDetectStatus) {
if(eValidDetect==AutoDetectContentType(mParserContext->mScanner->GetBuffer(),mParserContext->mSourceType)) {
diff --git a/mozilla/parser/htmlparser/src/nsScanner.cpp b/mozilla/parser/htmlparser/src/nsScanner.cpp
index d177926054d..b3fc55b97b8 100644
--- a/mozilla/parser/htmlparser/src/nsScanner.cpp
+++ b/mozilla/parser/htmlparser/src/nsScanner.cpp
@@ -179,6 +179,12 @@ PRBool CScanner::Append(const char* aBuffer, PRInt32 aLen){
return PR_TRUE;
}
+PRBool CScanner::Append(const PRUnichar* aBuffer, PRInt32 aLen){
+ mBuffer.Append(aBuffer,aLen);
+ mTotalRead+=aLen;
+ return PR_TRUE;
+}
+
/**
* Grab data from underlying stream.
*
diff --git a/mozilla/parser/htmlparser/src/nsScanner.h b/mozilla/parser/htmlparser/src/nsScanner.h
index b1196da698b..5950f5122a1 100644
--- a/mozilla/parser/htmlparser/src/nsScanner.h
+++ b/mozilla/parser/htmlparser/src/nsScanner.h
@@ -232,6 +232,8 @@ class CScanner {
*/
PRBool Append(const char* aBuffer, PRInt32 aLen);
+ PRBool Append(const PRUnichar* aBuffer, PRInt32 aLen);
+
/**
*
*