summaryrefslogtreecommitdiff
path: root/cpp
diff options
context:
space:
mode:
author: Bernard Normier <bernard@zeroc.com>, 2016-06-04 16:18:18 -0400
committer: Bernard Normier <bernard@zeroc.com>, 2016-06-04 16:18:18 -0400
commit: a59bb01921429e8d6963d63c22b91c995d1c4631 (patch)
tree: e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp
parent: More UTF tests (diff)
download: ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip
UnicodeWstringConverter performance improvement and cleanup
Diffstat (limited to 'cpp')
-rw-r--r-- cpp/src/IceUtil/StringConverter.cpp | 123
-rw-r--r-- cpp/src/IceUtil/Unicode.cpp | 187
-rw-r--r-- cpp/test/IceUtil/unicode/Client.cpp | 465
3 files changed, 382 insertions, 393 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp
index 8b60c48d53f..cb15037ecfa 100644
--- a/cpp/src/IceUtil/StringConverter.cpp
+++ b/cpp/src/IceUtil/StringConverter.cpp
@@ -43,11 +43,7 @@ struct SelectCodeCvt;
template<>
struct SelectCodeCvt<2>
{
-#ifdef ICE_LITTLE_ENDIAN
- typedef std::codecvt_utf8_utf16<wchar_t, 0x10ffff, little_endian> Type;
-#else
typedef std::codecvt_utf8_utf16<wchar_t> Type;
-#endif
};
template<>
@@ -155,28 +151,35 @@ public:
virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
{
- if(sourceStart == sourceEnd)
+ const size_t sourceSize = sourceEnd - sourceStart; // number of UTF-8 bytes to convert
+
+ if(sourceSize == 0)
{
target = L"";
}
else
{
- //
- // TODO: consider reimplementing without the wstring_convert helper
- // to improve performance
- // Note that wstring_convert is "stateful" and cannot be a shared data member
- //
- wstring_convert<CodeCvt> convert;
-
- try
- {
- target = convert.from_bytes(reinterpret_cast<const char*>(sourceStart),
- reinterpret_cast<const char*>(sourceEnd));
- }
- catch(const std::range_error& ex)
+ target.resize(sourceSize); // provisional size: one wchar_t per input byte, trimmed after the conversion
+ wchar_t* targetStart = const_cast<wchar_t*>(target.data()); // convert directly into target's own storage
+ wchar_t* targetEnd = targetStart + sourceSize;
+ wchar_t* targetNext = targetStart; // output cursor; used below as the end of the converted text
+
+ const char* sourceNext = reinterpret_cast<const char*>(sourceStart); // input cursor handed to in(); not read afterwards
+
+ mbstate_t state = mbstate_t(); // value-initialized conversion state, local to this call
+
+ codecvt_base::result result = _codecvt.in(state,
+ reinterpret_cast<const char*>(sourceStart),
+ reinterpret_cast<const char*>(sourceEnd),
+ sourceNext,
+ targetStart, targetEnd, targetNext);
+
+ if(result != codecvt_base::ok) // every result other than ok is reported as an illegal conversion
{
- throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+ throw IllegalConversionException(__FILE__, __LINE__, "codecvt.in failure");
}
+
+ target.resize(targetNext - targetStart); // keep only the wchar_t range [targetStart, targetNext)
}
}
@@ -215,14 +218,12 @@ public:
targetStart = buffer.getMoreBytes(chunkSize, targetStart);
targetEnd = targetStart + chunkSize;
-
}
while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false);
return targetStart;
}
-
virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
{
if(sourceStart == sourceEnd)
@@ -290,67 +291,38 @@ getUnicodeWstringConverter()
return unicodeWstringConverter;
}
-
class UTF8BufferI : public UTF8Buffer
{
public:
- UTF8BufferI() :
- _buffer(0),
- _offset(0)
- {
- }
-
- ~UTF8BufferI()
- {
- free(_buffer);
- }
-
+ //
+ // Ensures room for howMany bytes after the bytes already used (up to firstUnused)
+ // and returns a pointer to the first unused byte in the (possibly resized) buffer
+ //
Byte* getMoreBytes(size_t howMany, Byte* firstUnused)
{
- if(_buffer == 0)
- {
- _buffer = static_cast<Byte*>(malloc(howMany));
- if(!_buffer)
- {
- throw std::bad_alloc();
- }
- }
- else
+ size_t bytesUsed = 0; // stays 0 when firstUnused is null
+ if(firstUnused != 0)
{
- assert(firstUnused != 0);
- _offset = firstUnused - _buffer;
- Byte* newBuffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany));
- if(!newBuffer)
- {
- reset();
- throw std::bad_alloc();
- }
- else
- {
- _buffer = newBuffer;
- }
+ bytesUsed = firstUnused - reinterpret_cast<const Byte*>(_buffer.data()); // offset of firstUnused from the start of _buffer
}
- return _buffer + _offset;
- }
-
- Byte* getBuffer()
- {
- return _buffer;
+ if(_buffer.size() < howMany + bytesUsed) // resize only when the current size cannot hold bytesUsed + howMany
+ {
+ _buffer.resize(bytesUsed + howMany);
+ }
+
+ return const_cast<Byte*>(reinterpret_cast<const Byte*>(_buffer.data())) + bytesUsed; // start of _buffer plus the used prefix
}
- void reset()
+ void swap(string& other, const Byte* tail) // trims _buffer to end at tail, then exchanges its contents with other
{
- free(_buffer);
- _buffer = 0;
- _offset = 0;
+ assert(tail >= reinterpret_cast<const Byte*>(_buffer.data())); // tail must not point before the start of _buffer
+ _buffer.resize(tail - reinterpret_cast<const Byte*>(_buffer.data())); // new size is the distance from the start of _buffer to tail
+ other.swap(_buffer);
}
private:
-
- Byte* _buffer;
- size_t _offset;
+ string _buffer; // backing storage for the bytes handed out by getMoreBytes
};
#ifdef _WIN32
@@ -516,8 +488,8 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
//
UTF8BufferI buffer;
Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer);
- target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
-
+ buffer.swap(target, last);
+
//
// If narrow string converter is present convert to the native narrow string encoding, otherwise
// native narrow string encoding is UTF8 and we are done.
@@ -534,8 +506,7 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
}
wstring
-IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
- const WstringConverterPtr& wConverter)
+IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter)
{
wstring target;
if(!v.empty())
@@ -549,7 +520,7 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
{
UTF8BufferI buffer;
Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer);
- tmp = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+ buffer.swap(tmp, last);
}
else
{
@@ -577,7 +548,9 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv
}
UTF8BufferI buffer;
Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer);
- return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+ string result;
+ buffer.swap(result, last);
+ return result;
}
string
@@ -620,11 +593,7 @@ IceUtilInternal::toUTF16(const vector<Byte>& source)
#ifdef ICE_HAS_CODECVT_UTF8
assert(sizeof(Char16T) == sizeof(unsigned short));
-#ifdef ICE_LITTLE_ENDIAN
- typedef wstring_convert<codecvt_utf8_utf16<Char16T, 0x10ffff, little_endian>, Char16T> Convert;
-#else
typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert;
-#endif
Convert convert;
diff --git a/cpp/src/IceUtil/Unicode.cpp b/cpp/src/IceUtil/Unicode.cpp
index 22ced7e61b2..4db36d29e9d 100644
--- a/cpp/src/IceUtil/Unicode.cpp
+++ b/cpp/src/IceUtil/Unicode.cpp
@@ -26,90 +26,80 @@ using namespace IceUtilInternal;
namespace
{
- //
- // Helper class, base never defined
- // Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8.
- //
- template<size_t wcharSize>
- struct WstringHelper
- {
- static ConversionResult toUTF8(
- const wchar_t*& sourceStart, const wchar_t* sourceEnd,
- Byte*& targetStart, Byte* targetEnd);
+//
+// Helper class, base never defined
+// Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8.
+//
+template<size_t wcharSize> struct WstringHelper;
- static ConversionResult fromUTF8(
- const Byte*& sourceStart, const Byte* sourceEnd,
- wchar_t*& targetStart, wchar_t* targetEnd);
- };
- template<>
- struct WstringHelper<2>
- {
- static ConversionResult toUTF8(
- const wchar_t*& sourceStart, const wchar_t* sourceEnd,
- Byte*& targetStart, Byte* targetEnd)
- {
- return ConvertUTF16toUTF8(
- reinterpret_cast<const UTF16**>(&sourceStart),
- reinterpret_cast<const UTF16*>(sourceEnd),
- &targetStart, targetEnd, lenientConversion);
- }
-
- static ConversionResult fromUTF8(
- const Byte*& sourceStart, const Byte* sourceEnd,
- wchar_t*& targetStart, wchar_t* targetEnd)
- {
- return ConvertUTF8toUTF16(
- &sourceStart, sourceEnd,
- reinterpret_cast<UTF16**>(&targetStart),
- reinterpret_cast<UTF16*>(targetEnd), lenientConversion);
- }
- };
-
- template<>
- struct WstringHelper<4>
- {
- static ConversionResult toUTF8(
- const wchar_t*& sourceStart, const wchar_t* sourceEnd,
- Byte*& targetStart, Byte* targetEnd)
- {
- return ConvertUTF32toUTF8(
- reinterpret_cast<const UTF32**>(&sourceStart),
- reinterpret_cast<const UTF32*>(sourceEnd),
- &targetStart, targetEnd, lenientConversion);
- }
-
- static ConversionResult fromUTF8(
- const Byte*& sourceStart, const Byte* sourceEnd,
- wchar_t*& targetStart, wchar_t* targetEnd)
- {
- return ConvertUTF8toUTF32(
- &sourceStart, sourceEnd,
- reinterpret_cast<UTF32**>(&targetStart),
- reinterpret_cast<UTF32*>(targetEnd), lenientConversion);
- }
- };
-
- void
- checkResult(ConversionResult result)
+template<>
+struct WstringHelper<2> // specialization for wcharSize == 2: wchar_t buffers are reinterpreted as UTF16
+{
+ static ConversionResult toUTF8( // forwards to ConvertUTF16toUTF8 with lenientConversion
+ const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+ Byte*& targetStart, Byte* targetEnd)
+ {
+ return ConvertUTF16toUTF8(
+ reinterpret_cast<const UTF16**>(&sourceStart), // address of the caller's source cursor
+ reinterpret_cast<const UTF16*>(sourceEnd),
+ &targetStart, targetEnd, lenientConversion);
+ }
+
+ static ConversionResult fromUTF8( // forwards to ConvertUTF8toUTF16 with lenientConversion
+ const Byte*& sourceStart, const Byte* sourceEnd,
+ wchar_t*& targetStart, wchar_t* targetEnd)
+ {
+ return ConvertUTF8toUTF16(
+ &sourceStart, sourceEnd,
+ reinterpret_cast<UTF16**>(&targetStart), // address of the caller's target cursor
+ reinterpret_cast<UTF16*>(targetEnd), lenientConversion);
+ }
+};
+
+template<>
+struct WstringHelper<4> // specialization for wcharSize == 4: wchar_t buffers are reinterpreted as UTF32
+{
+ static ConversionResult toUTF8( // forwards to ConvertUTF32toUTF8 with lenientConversion
+ const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+ Byte*& targetStart, Byte* targetEnd)
+ {
+ return ConvertUTF32toUTF8(
+ reinterpret_cast<const UTF32**>(&sourceStart), // address of the caller's source cursor
+ reinterpret_cast<const UTF32*>(sourceEnd),
+ &targetStart, targetEnd, lenientConversion);
+ }
+
+ static ConversionResult fromUTF8( // forwards to ConvertUTF8toUTF32 with lenientConversion
+ const Byte*& sourceStart, const Byte* sourceEnd,
+ wchar_t*& targetStart, wchar_t* targetEnd)
+ {
+ return ConvertUTF8toUTF32(
+ &sourceStart, sourceEnd,
+ reinterpret_cast<UTF32**>(&targetStart), // address of the caller's target cursor
+ reinterpret_cast<UTF32*>(targetEnd), lenientConversion);
+ }
+};
+
+void checkResult(ConversionResult result) // returns normally for conversionOK; otherwise throws IceUtil::IllegalConversionException
+{
+ switch (result)
+ {
+ case conversionOK:
+ break;
+ case sourceExhausted:
+ throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted");
+ case sourceIllegal:
+ throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
+ case targetExhausted:
+ throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "target exhausted"); // was "source illegal" (copy-paste slip)
+ default:
{
- switch (result)
- {
- case conversionOK:
- break;
- case sourceExhausted:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted");
- case sourceIllegal:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
- case targetExhausted:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
- default:
- {
- assert(0);
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
- }
- }
+ assert(0); // unexpected ConversionResult value: trap in debug builds
+ throw IceUtil::IllegalConversionException(__FILE__, __LINE__); // still reported when assert is compiled out
}
+ }
+}
}
//
@@ -117,9 +107,8 @@ namespace
//
bool
-IceUtilInternal::convertUTFWstringToUTF8(
- const wchar_t*& sourceStart, const wchar_t* sourceEnd,
- Byte*& targetStart, Byte* targetEnd)
+IceUtilInternal::convertUTFWstringToUTF8(const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+ Byte*& targetStart, Byte* targetEnd)
{
ConversionResult result = WstringHelper<sizeof(wchar_t)>::toUTF8(
sourceStart, sourceEnd, targetStart, targetEnd);
@@ -135,30 +124,20 @@ IceUtilInternal::convertUTFWstringToUTF8(
}
}
-
void
-IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd,
- std::wstring& target)
+IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd, std::wstring& target)
{
- //
- // Could be reimplemented without this temporary wchar_t buffer
- //
- size_t size = static_cast<size_t>(sourceEnd - sourceStart);
- wchar_t* outBuf = new wchar_t[size];
- wchar_t* targetStart = outBuf;
- wchar_t* targetEnd = targetStart + size;
-
- ConversionResult result =
- WstringHelper<sizeof(wchar_t)>::fromUTF8(
- sourceStart, sourceEnd, targetStart, targetEnd);
-
- if(result == conversionOK)
- {
- std::wstring s(outBuf, static_cast<size_t>(targetStart - outBuf));
- s.swap(target);
- }
- delete[] outBuf;
+ size_t sourceSize = static_cast<size_t>(sourceEnd - sourceStart); // number of UTF-8 bytes in the input
+
+ target.resize(sourceSize); // provisional size: one wchar_t per input byte; stays at this size if checkResult throws
+ wchar_t* targetStart = const_cast<wchar_t*>(target.data()); // convert straight into target, no temporary buffer
+ wchar_t* targetEnd = targetStart + sourceSize;
+
+ ConversionResult result = WstringHelper<sizeof(wchar_t)>::fromUTF8(sourceStart, sourceEnd,
+ targetStart, targetEnd);
+
checkResult(result);
+ target.resize(targetStart - target.data()); // targetStart is used as the end of the converted text; drop the unused tail
}
void
diff --git a/cpp/test/IceUtil/unicode/Client.cpp b/cpp/test/IceUtil/unicode/Client.cpp
index b3d3912057b..64c8fe5f0dd 100644
--- a/cpp/test/IceUtil/unicode/Client.cpp
+++ b/cpp/test/IceUtil/unicode/Client.cpp
@@ -17,6 +17,9 @@
#endif
#include <fstream>
+// Uncomment to include performance testing
+//#define TEST_PERF
+
using namespace IceUtil;
using namespace std;
@@ -45,14 +48,14 @@ main(int argc, char* argv[])
#ifdef _WIN32
# ifdef __MINGW32__
- dir = argv[1];
+ dir = argv[1];
# else
- dir = IceUtil::wstringToString(argv[1]);
+ dir = IceUtil::wstringToString(argv[1]);
# endif
- dir += "\\";
+ dir += "\\";
#else
- dir = argv[1];
- dir += "/";
+ dir = argv[1];
+ dir += "/";
#endif
}
@@ -67,226 +70,264 @@ main(int argc, char* argv[])
string wcoeurFile = string("coeur.") + wstringEncoding;
{
- cout << "testing UTF-8 to wstring (" << wstringEncoding << ") conversion... ";
- ifstream is((dir + "coeur.utf8").c_str());
- test(is.good());
- ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary);
- test(bis.good());
-
- int lineNumber = 0;
-
- do
- {
- string line;
- getline(is, line, '\n');
- lineNumber++;
- wstring wline = stringToWstring(line);
-
- for(size_t i = 0; i < wline.length(); ++i)
- {
- wchar_t wc = wline[i];
- const char* buffer = reinterpret_cast<char*>(&wc);
- for(size_t j = 0; j < sizeof(wchar_t); ++j)
- {
- test(bis.good());
- char c;
- bis.get(c);
- if(buffer[j] != c)
- {
- cerr << "Error at line " << lineNumber << " column " << i << endl;
- cerr << "buffer[j] == " << hex << (int)static_cast<unsigned char>(buffer[j]) << endl;
- cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl;
- }
- test(buffer[j] == c);
- }
- }
- //
- // Skip newline character (Unix-style newline)
- //
- if(is.good())
- {
- for(size_t j = 0; j < sizeof(wchar_t); ++j)
- {
- test(bis.good());
- char c;
- bis.get(c);
- }
- }
- else
- {
- char c;
- bis.get(c);
- test(bis.eof());
- }
- } while(is.good());
-
- cout << "ok" << endl;
+ cout << "testing UTF-8 to wstring (" << wstringEncoding << ") conversion... ";
+ ifstream is((dir + "coeur.utf8").c_str());
+ test(is.good());
+ ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary);
+ test(bis.good());
+
+ int lineNumber = 0;
+
+ do
+ {
+ string line;
+ getline(is, line, '\n');
+ lineNumber++;
+ wstring wline = stringToWstring(line);
+
+ for(size_t i = 0; i < wline.length(); ++i)
+ {
+ wchar_t wc = wline[i];
+ const char* buffer = reinterpret_cast<char*>(&wc);
+ for(size_t j = 0; j < sizeof(wchar_t); ++j)
+ {
+ test(bis.good());
+ char c;
+ bis.get(c);
+ if(buffer[j] != c)
+ {
+ cerr << "Error at line " << lineNumber << " column " << i << endl;
+ cerr << "buffer[j] == " << hex << (int)static_cast<unsigned char>(buffer[j]) << endl;
+ cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl;
+ }
+ test(buffer[j] == c);
+ }
+ }
+ //
+ // Skip newline character (Unix-style newline)
+ //
+ if(is.good())
+ {
+ for(size_t j = 0; j < sizeof(wchar_t); ++j)
+ {
+ test(bis.good());
+ char c;
+ bis.get(c);
+ }
+ }
+ else
+ {
+ char c;
+ bis.get(c);
+ test(bis.eof());
+ }
+ } while(is.good());
+
+ cout << "ok" << endl;
+ }
+
+ {
+ cout << "testing wstring (" << wstringEncoding << ") to UTF-8 conversion... ";
+
+ ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary);
+ test(bis.good());
+
+ wstring ws;
+ char c;
+
+ do
+ {
+ wchar_t wc;
+ char* buffer = reinterpret_cast<char*>(&wc);
+
+ for(size_t j = 0; j < sizeof(wchar_t); ++j)
+ {
+ if(!bis.good())
+ {
+ break;
+ }
+ bis.get(c);
+ buffer[j] = c;
+ }
+
+ if(bis.good())
+ {
+ ws.push_back(wc);
+ }
+ } while(bis.good());
+
+ string s = wstringToString(ws);
+
+ ifstream nbis((dir + "coeur.utf8").c_str(), ios_base::binary);
+ test(nbis.good());
+
+ for(size_t i = 0; i < s.size(); ++i)
+ {
+ test(nbis.good());
+ nbis.get(c);
+ char ci = s[i];
+
+ if(c != ci)
+ {
+ cerr << "i == " << i << endl;
+ cerr << "ci == " << hex << (int)static_cast<unsigned char>(ci) << endl;
+ cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl;
+ }
+ test(c == s[i]);
+ }
+ test(!nbis.eof());
+ nbis.get(c);
+ test(nbis.eof());
+
+ cout << "ok" << endl;
}
{
- cout << "testing wstring (" << wstringEncoding << ") to UTF-8 conversion... ";
-
- ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary);
- test(bis.good());
-
- wstring ws;
- char c;
-
- do
- {
- wchar_t wc;
- char* buffer = reinterpret_cast<char*>(&wc);
-
- for(size_t j = 0; j < sizeof(wchar_t); ++j)
- {
- if(!bis.good())
- {
- break;
- }
- bis.get(c);
- buffer[j] = c;
- }
-
- if(bis.good())
- {
- ws.push_back(wc);
- }
- } while(bis.good());
-
- string s = wstringToString(ws);
-
- ifstream nbis((dir + "coeur.utf8").c_str(), ios_base::binary);
- test(nbis.good());
-
- for(size_t i = 0; i < s.size(); ++i)
- {
- test(nbis.good());
- nbis.get(c);
- char ci = s[i];
-
- if(c != ci)
- {
- cerr << "i == " << i << endl;
- cerr << "ci == " << hex << (int)static_cast<unsigned char>(ci) << endl;
- cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl;
- }
- test(c == s[i]);
- }
- test(!nbis.eof());
- nbis.get(c);
- test(nbis.eof());
-
- cout << "ok" << endl;
+ cout << "testing wstring with surrogates... ";
+
+ //
+ // Euro sign (U+20AC) is encoded with 1 UTF-16 code unit, and 3 UTF-8 code units
+ // U+10437 is a Deseret character, encoded with 2 UTF-16 code units, and 4 UTF-8 code units
+ //
+ wstring ws = L"\u20ac\u20ac\U00010437";
+
+ if(sizeof(wchar_t) == 2)
+ {
+ test(ws.length() == 4);
+ }
+ else
+ {
+ test(sizeof(wchar_t) == 4);
+ test(ws.length() == 3);
+ }
+
+ //
+ // The Unicode string converter implementation allocates an initial buffer
+ // of size max(2 * (sourceEnd - sourceStart), 4).
+ // With UTF-16 encoding, that's 8, and the first 2 euros will use the first 6
+ // bytes of the initial buffer, leaving too little room for the 4-byte Deseret character.
+
+ string ns = IceUtil::wstringToString(ws);
+
+ const string good = "\xE2\x82\xAC\xE2\x82\xAC\xF0\x90\x90\xB7";
+ test(ns == good);
+ test(ws == IceUtil::stringToWstring(ns));
+
+ cout << "ok" << endl;
+
+ cout << "testing IceUtilInternal::toUTF16, toUTF32 and fromUTF32... ";
+
+ vector<Byte> u8 = vector<Byte>(reinterpret_cast<const Byte*>(ns.data()),
+ reinterpret_cast<const Byte*>(ns.data() + ns.length()));
+
+ vector<unsigned short> u16 = IceUtilInternal::toUTF16(u8);
+ test(u16.size() == 4);
+ test(u16[0] == 0x20ac);
+ test(u16[1] == 0x20ac);
+ test(u16[2] == 0xd801);
+ test(u16[3] == 0xdc37);
+
+ vector<unsigned int> u32 = IceUtilInternal::toUTF32(u8);
+ test(u32.size() == 3);
+ test(u32[0] == 0x20ac);
+ test(u32[1] == 0x20ac);
+ test(u32[2] == 0x10437);
+
+ vector<Byte> nu8 = IceUtilInternal::fromUTF32(u32);
+ test(nu8 == u8);
+
+ cout << "ok" << endl;
}
+#ifdef TEST_PERF
{
- cout << "testing wstring with surrogates... ";
-
- //
- // Euro sign (U+20AC) is encoded with 1 UTF-16 code unit, and 3 UTF-8 code units
- // U+10437 is a Deseret character, encoded with 2 UTF-16 code units, and 4 UTF-8 code units
- //
- wstring ws = L"\u20ac\u20ac\U00010437";
-
- if(sizeof(wchar_t) == 2)
- {
- test(ws.length() == 4);
- }
- else
- {
- test(sizeof(wchar_t) == 4);
- test(ws.length() == 3);
- }
-
- //
- // The Unicode string converter implementation allocates an initial buffer
- // of size max(2 * (sourceEnd - sourceStart), 4).
- // With UTF-16 encoding, that's 8 and the first 2 euros will use the first 6
- // bytes of the initial buffer.
-
- string ns = IceUtil::wstringToString(ws);
-
- const string good = "\xE2\x82\xAC\xE2\x82\xAC\xF0\x90\x90\xB7";
- test(ns == good);
- test(ws == IceUtil::stringToWstring(ns));
-
- vector<Byte> u8 = vector<Byte>(reinterpret_cast<const Byte*>(ns.data()),
- reinterpret_cast<const Byte*>(ns.data() + ns.length()));
-
- vector<unsigned short> u16 = IceUtilInternal::toUTF16(u8);
- test(u16.size() == 4);
- test(u16[0] == 0x20ac);
- test(u16[1] == 0x20ac);
- test(u16[2] == 0xd801);
- test(u16[3] == 0xdc37);
-
- vector<unsigned int> u32 = IceUtilInternal::toUTF32(u8);
- test(u32.size() == 3);
- test(u32[0] == 0x20ac);
- test(u32[1] == 0x20ac);
- test(u32[2] == 0x10437);
-
- vector<Byte> nu8 = IceUtilInternal::fromUTF32(u32);
- test(nu8 == u8);
-
- cout << "ok" << endl;
+ // The only performance-critical code is the UnicodeWstringConverter
+ // that is used whenever we marshal/unmarshal wstrings.
+
+ const long iterations = 5000000;
+ const wstring ws = L"abcdefghijklmnopqrstuvwxyz+\u20ac\u20ac\U00010437";
+ const string ns = IceUtil::wstringToString(ws);
+ test(IceUtil::stringToWstring(ns) == ws);
+
+ cout << "testing performance with " << iterations << " iterations... ";
+
+ IceUtil::Time toU8 = IceUtil::Time::now(IceUtil::Time::Monotonic);
+ for(long i = 0; i < iterations; ++i)
+ {
+ test(IceUtil::wstringToString(ws) == ns);
+ }
+ IceUtil::Time now = IceUtil::Time::now(IceUtil::Time::Monotonic);
+ toU8 = now - toU8;
+
+ IceUtil::Time fromU8 = now;
+ for(long i = 0; i < iterations; ++i)
+ {
+ test(IceUtil::stringToWstring(ns) == ws);
+ }
+ fromU8 = IceUtil::Time::now(IceUtil::Time::Monotonic) - fromU8;
+
+ cout << "toUTF8 = " << toU8 * 1000 << " ms; fromUTF8 = "
+ << fromU8 * 1000 << " ms ok" << endl;
}
+#endif
+
+
{
- cout << "testing error handling... ";
-
- // From http://stackoverflow.com/questions/1301402/example-invalid-utf8-string
-
- string badUTF8[] = {
- "\xc3\x28",
- "\xa0\xa1",
- "\xe2\x28\xa1",
- "\xe2\x82\x28",
- "\xf0\x28\x8c\xbc",
- "\xf0\x90\x28\xbc",
- "\xf0\x28\x8c\x28",
- "\xf8\xa1\xa1\xa1\xa1",
- "\xfc\xa1\xa1\xa1\xa1\xa1",
- ""
- };
-
- for(size_t i = 0; badUTF8[i] != ""; ++i)
- {
- try
- {
- wstring ws = IceUtil::stringToWstring(badUTF8[i]);
- wcerr << L"Unexpected: " << ws << endl;
- test(false);
- }
- catch(const IceUtil::IllegalConversionException&)
- {}
- }
-
- // TODO: need test for bad UTF-32 strings
+ cout << "testing error handling... ";
+
+ // From http://stackoverflow.com/questions/1301402/example-invalid-utf8-string
+
+ string badUTF8[] = {
+ "\xc3\x28",
+ "\xa0\xa1",
+ "\xe2\x28\xa1",
+ "\xe2\x82\x28",
+ "\xf0\x28\x8c\xbc",
+ "\xf0\x90\x28\xbc",
+ "\xf0\x28\x8c\x28",
+ "\xf8\xa1\xa1\xa1\xa1",
+ "\xfc\xa1\xa1\xa1\xa1\xa1",
+ ""
+ };
+
+ for(size_t i = 0; badUTF8[i] != ""; ++i)
+ {
+ try
+ {
+ wstring ws = IceUtil::stringToWstring(badUTF8[i]);
+ wcerr << L"Unexpected: " << ws << endl;
+ test(false);
+ }
+ catch(const IceUtil::IllegalConversionException&)
+ {}
+ }
+
+ // TODO: need test for bad UTF-32 strings
#ifdef _WIN32
- // Note: for an unknown reason, the conversion works without
- // the extra letter (x below) when using codecvt_utf8_utf16.
-
- wstring badWstring[] = {
- wstring(1, wchar_t(0xD800)) + L"x",
- wstring(2, wchar_t(0xDB7F)),
- L""
- };
-
- for(size_t i = 0; badWstring[i] != L""; ++i)
- {
- try
- {
- string s = IceUtil::wstringToString(badWstring[i]);
- test(false);
- }
- catch(const IceUtil::IllegalConversionException&)
- {}
- }
+ // Note: for an unknown reason, the conversion works without
+ // the extra letter (x below) when using codecvt_utf8_utf16.
+
+ wstring badWstring[] = {
+ wstring(1, wchar_t(0xD800)) + L"x",
+ wstring(2, wchar_t(0xDB7F)),
+ L""
+ };
+
+ for(size_t i = 0; badWstring[i] != L""; ++i)
+ {
+ try
+ {
+ string s = IceUtil::wstringToString(badWstring[i]);
+ test(false);
+ }
+ catch(const IceUtil::IllegalConversionException&)
+ {}
+ }
#endif
- cout << "ok" << endl;
+ cout << "ok" << endl;
}
return EXIT_SUCCESS;