From fa357fa3282193ddf7cb6033a84498dd65b51a43 Mon Sep 17 00:00:00 2001
From: "jbetak%netscape.com"
Date: Sat, 5 Feb 2000 02:20:37 +0000
Subject: [PATCH] bug #8702; r=ftang, cata; fixed performance problems in UTF8 Unicode decoder

git-svn-id: svn://10.0.0.236/trunk@59833 18797224-902f-48f8-a5cc-f745e15eee43
---
 .../intl/uconv/ucvlatin/nsUTF8ToUnicode.cpp   | 155 ++++++++++++++++--
 mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.h |  21 ++-
 2 files changed, 160 insertions(+), 16 deletions(-)

diff --git a/mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.cpp b/mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.cpp
index 4209d950de8..ead67c93541 100644
--- a/mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.cpp
+++ b/mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.cpp
@@ -22,29 +22,18 @@
 
 #include "nsUTF8ToUnicode.h"
 
-//----------------------------------------------------------------------
-// Global functions and data [declaration]
-
-static PRUint16 g_UTF8MappingTable[] = {
-  0x0001, 0x0004, 0x0005, 0x0008, 0x0000, 0x0000, 0xFFFF, 0x0000
-};
-
-static PRInt16 g_UTF8ShiftTable[] = {
-  3, uMultibytesCharset,
-  ShiftCell(u1ByteChar, 1, 0x00, 0x7F, 0x00, 0x00, 0x00, 0x7F),
-  ShiftCell(u2BytesUTF8, 2, 0xC0, 0xDF, 0x00, 0x00, 0x07, 0xFF),
-  ShiftCell(u3BytesUTF8, 3, 0xE0, 0xEF, 0x08, 0x00, 0xFF, 0xFF)
-};
 
 //----------------------------------------------------------------------
 // Class nsUTF8ToUnicode [implementation]
 
 nsUTF8ToUnicode::nsUTF8ToUnicode()
-: nsTableDecoderSupport((uShiftTable*) &g_UTF8ShiftTable,
-                        (uMappingTable*) &g_UTF8MappingTable)
+: nsBasicDecoderSupport()
+
 {
+  Reset();
 }
 
+
 nsresult nsUTF8ToUnicode::CreateInstance(nsISupports ** aResult)
 {
   *aResult = new nsUTF8ToUnicode();
@@ -61,3 +50,139 @@ NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
   *aDestLength = aSrcLength;
   return NS_OK;
 }
+
+//----------------------------------------------------------------------
+// Subclassing of nsBasicDecoderSupport class [implementation]
+
+/**
+ * Drops any partially decoded UTF-8 sequence, so that the next byte given
+ * to Convert() is treated as the first byte of a character.
+ */
+NS_IMETHODIMP nsUTF8ToUnicode::Reset()
+{
+  mState = 0;   // number of continuation bytes still expected
+  mUcs4  = 0;   // UCS-4 value assembled so far
+  return NS_OK;
+}
+
+/**
+ * Decodes UTF-8 octets into UTF-16 code units.
+ *
+ * A sequence may be split across calls: the bits collected so far are kept
+ * in mUcs4 and the number of continuation bytes still missing in mState.
+ * Overlong forms and UTF-8 encoded surrogates are not rejected; characters
+ * above U+10FFFF are replaced with U+FFFD.
+ *
+ * @param aSrc        [IN] the UTF-8 input
+ * @param aSrcLength  [IN/OUT] bytes available; on return, bytes consumed
+ * @param aDest       [OUT] the UTF-16 output buffer
+ * @param aDestLength [IN/OUT] room in aDest; on return, units written
+ * @return            NS_OK when all input was converted,
+ *                    NS_OK_UDEC_MOREOUTPUT when aDest is too small for
+ *                    the remaining input,
+ *                    NS_OK_UDEC_MOREINPUT when the input ends inside a
+ *                    multi-byte sequence,
+ *                    NS_ERROR_UNEXPECTED when the first unconsumed byte
+ *                    (offset *aSrcLength on return) is not valid there
+ */
+NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
+                                       PRInt32 * aSrcLength,
+                                       PRUnichar * aDest,
+                                       PRInt32 * aDestLength)
+{
+  const char * in = aSrc;
+  const char * const inend = aSrc + (PRUint32) (*aSrcLength);
+  PRUnichar * out = aDest;
+  PRUnichar * const outend = aDest + (PRUint32) (*aDestLength);
+  nsresult res = NS_OK;
+
+  for (; (in < inend) && (out < outend); in++) {
+    // look at the raw octet, plain char may be signed
+    const PRUint32 c = (PRUint32) (unsigned char) (*in);
+
+    if (0 == mState) {
+      if (0 == (0x80 & c)) {
+        // ASCII
+        *out++ = (PRUnichar) c;
+      } else if (0xC0 == (0xE0 & c)) {
+        // 2 bytes UTF8
+        mUcs4 = (c << 6) & 0x000007C0L;
+        mState = 1;
+      } else if (0xE0 == (0xF0 & c)) {
+        // 3 bytes UTF8
+        mUcs4 = (c << 12) & 0x0000F000L;
+        mState = 2;
+      } else if (0xF0 == (0xF8 & c)) {
+        // 4 bytes UTF8
+        mUcs4 = (c << 18) & 0x001F0000L;
+        mState = 3;
+      } else if (0xF8 == (0xFC & c)) {
+        // 5 bytes UTF8
+        mUcs4 = (c << 24) & 0x03000000L;
+        mState = 4;
+      } else if (0xFC == (0xFE & c)) {
+        // 6 bytes UTF8
+        mUcs4 = (c << 30) & 0x40000000L;
+        mState = 5;
+      } else {
+        // continuation byte without a lead byte, or 0xFE / 0xFF
+        res = NS_ERROR_UNEXPECTED;
+        break;
+      }
+    } else if (0x80 == (0xC0 & c)) {
+      const int shift = (mState - 1) * 6;
+      const PRUint32 ucs4 = mUcs4 | ((c & 0x0000003FL) << shift);
+
+      if (mState > 1) {
+        // more continuation bytes to come
+        mUcs4 = ucs4;
+        --mState;
+        continue;
+      }
+
+      // this byte completes the character
+      if (ucs4 < 0x00010000) {
+        *out++ = (PRUnichar) ucs4;
+      } else if (ucs4 > 0x0010FFFF) {
+        // outside of the range UTF-16 can represent
+        *out++ = 0xFFFD;
+      } else {
+        if ((outend - out) < 2) {
+          // no room for both surrogates: leave this byte unconsumed so
+          // the next call can finish the character
+          res = NS_OK_UDEC_MOREOUTPUT;
+          break;
+        }
+        const PRUint32 v = ucs4 - 0x00010000;
+        *out++ = (PRUnichar) (0xD800 | (0x000003FF & (v >> 10)));
+        *out++ = (PRUnichar) (0xDC00 | (0x000003FF & v));
+      }
+
+      // initialize UTF8 cache
+      Reset();
+    } else {
+      // the sequence was cut short by a byte that is not a continuation
+      // byte; drop the partial character so stale state cannot leak into
+      // the next call
+      Reset();
+      res = NS_ERROR_UNEXPECTED;
+      break;
+    }
+  }
+
+  // do not let the status checks below hide an error or the early exit
+  if (NS_OK == res) {
+    if ((in < inend) && (out >= outend)) {
+      // output not finished, output buffer too short
+      res = NS_OK_UDEC_MOREOUTPUT;
+    } else if (mState != 0) {
+      // last UCS4 is incomplete, the caller has to supply the rest
+      res = NS_OK_UDEC_MOREINPUT;
+    }
+  }
+
+  *aSrcLength = in - aSrc;
+  *aDestLength = out - aDest;
+
+  return res;
+}
diff --git a/mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.h b/mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.h
index 4aab990c4d7..d138516c53c 100644
--- a/mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.h
+++ b/mozilla/intl/uconv/ucvlatin/nsUTF8ToUnicode.h
@@ -28,13 +28,16 @@
 //----------------------------------------------------------------------
 // Class nsUTF8ToUnicode [declaration]
 
+
 /**
  * A character set converter from UTF8 to Unicode.
  *
  * @created         18/Mar/1998
+ * @modified        04/Feb/2000
  * @author  Catalin Rotaru [CATA]
  */
-class nsUTF8ToUnicode : public nsTableDecoderSupport
+
+class nsUTF8ToUnicode : public nsBasicDecoderSupport
 {
 public:
 
@@ -43,6 +46,7 @@ public:
    */
   nsUTF8ToUnicode();
 
+
   /**
    * Static class constructor.
    */
@@ -50,11 +54,26 @@ public:
 
 protected:
 
+  PRUint32 mState;  // continuation bytes still expected for the current UTF8 sequence
+  PRUint32 mUcs4;   // UCS-4 value assembled so far for the current sequence
+
   //--------------------------------------------------------------------
   // Subclassing of nsDecoderSupport class [declaration]
 
   NS_IMETHOD GetMaxLength(const char * aSrc, PRInt32 aSrcLength,
       PRInt32 * aDestLength);
+
+  //--------------------------------------------------------------------
+  // Subclassing of nsBasicDecoderSupport class [declaration]
+
+  NS_IMETHOD Convert(const char * aSrc, PRInt32 * aSrcLength,
+      PRUnichar * aDest, PRInt32 * aDestLength);
+
+  //--------------------------------------------------------------------
+  // Subclassing of nsBasicDecoderSupport class [declaration]
+
+  NS_IMETHOD Reset();
+
 };
 
 #endif /* nsUTF8ToUnicode_h___ */