Move the core of NS_ConvertUCS2toUTF8 into character sinks in nsUTF8Utils.h, and use them to make ToNewUTF8String faster. Fix bug in surrogate handling in the moved code. Fix null-termination bug in UTF8ToNewUnicode. b=206682 r=jag sr=alecf a=brendan

git-svn-id: svn://10.0.0.236/trunk@142764 18797224-902f-48f8-a5cc-f745e15eee43
This commit is contained in:
dbaron%dbaron.org
2003-05-22 21:25:43 +00:00
parent 4013205ecb
commit ec7a16fd6d
10 changed files with 424 additions and 258 deletions

View File

@@ -45,6 +45,7 @@
#include "nsString.h"
#include "nsReadableUtils.h"
#include "nsDebug.h"
#include "nsUTF8Utils.h"
#ifndef nsCharTraits_h___
#include "nsCharTraits.h"
@@ -54,8 +55,10 @@
#include "prdtoa.h"
#endif
#ifdef DEBUG
static const char* kPossibleNull = "Error: possible unintended null in string";
static const char* kNullPointerError = "Error: unexpected null ptr";
#endif
static const char* kWhitespace="\b\t\r\n ";
const nsBufferHandle<char>*
@@ -1084,111 +1087,47 @@ PRBool nsCString::EqualsWithConversion(const char* aCString,PRBool aIgnoreCase,P
//----------------------------------------------------------------------
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const nsAString& aString )
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString )
{
nsAString::const_iterator start; aString.BeginReading(start);
nsAString::const_iterator end; aString.EndReading(end);
while (start != end) {
nsReadableFragment<PRUnichar> frag(start.fragment());
Append(frag.mStart, frag.mEnd - frag.mStart);
start.advance(start.size_forward());
}
if (!aString)
// Leave us as an uninitialized nsCAutoString.
return;
Init(nsDependentString(aString));
}
void
NS_ConvertUCS2toUTF8::Append( const PRUnichar* aString, PRUint32 aLength )
NS_ConvertUCS2toUTF8::NS_ConvertUCS2toUTF8( const PRUnichar* aString, PRUint32 aLength )
{
// Handle null string by just leaving us as a brand-new
// uninitialized nsCAutoString.
if (! aString)
if (!aString)
// Leave us as an uninitialized nsCAutoString.
return;
Init(Substring(aString, aString + aLength));
}
// Calculate how many bytes we need
const PRUnichar* p;
PRInt32 count, utf8len;
for (p = aString, utf8len = 0, count = aLength; 0 != count && 0 != (*p); count--, p++)
{
if (! ((*p) & 0xFF80))
utf8len += 1; // 0000 0000 - 0000 007F
else if (! ((*p) & 0xF800))
utf8len += 2; // 0000 0080 - 0000 07FF
else
utf8len += 3; // 0000 0800 - 0000 FFFF
// Note: Surrogate pair needs 4 bytes, but in this calcuation
// we count it as 6 bytes. It will waste 2 bytes per surrogate pair
void NS_ConvertUCS2toUTF8::Init( const nsAString& aString )
{
// Compute space required: do this once so we don't incur multiple
// allocations. This "optimization" is probably of dubious value...
nsAString::const_iterator start, end;
CalculateUTF8Size calculator;
copy_string(aString.BeginReading(start), aString.EndReading(end), calculator);
PRUint32 count = calculator.Size();
if (count) {
// Grow the buffer if we need to.
SetLength(count);
// All ready? Time to convert
ConvertUCS2toUTF8 converter(mStr);
copy_string(aString.BeginReading(start), aString.EndReading(end), converter);
mLength = converter.Size();
if (mLength != count) {
NS_ERROR("Input invalid or incorrect length was calculated");
Truncate();
}
// Make sure our buffer's big enough, so we don't need to do
// multiple allocations.
if(mLength+PRUint32(utf8len+1) > sizeof(mBuffer))
SetCapacity(mLength+utf8len+1);
// |SetCapacity| normally doesn't guarantee the use we are putting it to here (see its interface comment in nsAString.h),
// we can only use it since our local implementation, |nsCString::SetCapacity|, is known to do what we want
char* out = mStr+mLength;
PRUint32 ucs4=0;
for (p = aString, count = aLength; 0 != count && 0 != (*p); count--, p++)
{
if (0 == ucs4)
{
if (! ((*p) & 0xFF80))
{
*out++ = (char)*p;
}
else if (! ((*p) & 0xF800))
{
*out++ = 0xC0 | (char)((*p) >> 6);
*out++ = 0x80 | (char)(0x003F & (*p));
}
else
{
if (0xD800 == (0xFC00 & (*p)))
{
// D800- DBFF - High Surrogate
// N = (H- D800) *400 + 10000 + ...
ucs4 = 0x10000 | ((0x03FF & (*p)) << 10);
}
else if (0xDC00 == (0xFC00 & (*p)))
{
// DC00- DFFF - Low Surrogate
// error here. We should hit High Surrogate first
// Do not output any thing in this case
}
else
{
*out++ = 0xE0 | (char)((*p) >> 12);
*out++ = 0x80 | (char)(0x003F & (*p >> 6));
*out++ = 0x80 | (char)(0x003F & (*p) );
}
}
}
else
{
if (0xDC00 == (0xFC00 & (*p)))
{
// DC00- DFFF - Low Surrogate
// N += ( L - DC00 )
ucs4 |= (0x03FF & (*p));
// 0001 0000-001F FFFF
*out++ = 0xF0 | (char)(ucs4 >> 18);
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
*out++ = 0x80 | (char)(0x003F & ucs4) ;
}
else
{
// Got a High Surrogate but no low surrogate
// output nothing.
}
ucs4 = 0;
}
}
*out = '\0'; // null terminate
mLength += utf8len;
}
}
NS_LossyConvertUCS2toASCII::NS_LossyConvertUCS2toASCII( const nsAString& aString )