UnicodeWstringConverter performance improvement and cleanup

author: Bernard Normier <bernard@zeroc.com> 2016-06-04 16:18:18 -0400
committer: Bernard Normier <bernard@zeroc.com> 2016-06-04 16:18:18 -0400
commit: a59bb01921429e8d6963d63c22b91c995d1c4631 (patch)
tree: e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp
parent: More UTF tests (diff)
download: ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip
3 files changed, 382 insertions, 393 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp
index 8b60c48d53f..cb15037ecfa 100644
--- a/cpp/src/IceUtil/StringConverter.cpp
+++ b/cpp/src/IceUtil/StringConverter.cpp
@@ -43,11 +43,7 @@ struct SelectCodeCvt;
 template<>
 struct SelectCodeCvt<2>
 {
-#ifdef ICE_LITTLE_ENDIAN
-    typedef std::codecvt_utf8_utf16<wchar_t, 0x10ffff, little_endian> Type;
-#else
     typedef std::codecvt_utf8_utf16<wchar_t> Type;
-#endif
 };
 
 template<>
@@ -155,28 +151,35 @@ public:
 
     virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
     {
-        if(sourceStart == sourceEnd)
+        const size_t sourceSize = sourceEnd - sourceStart;
+        // sourceSize is the UTF-8 byte count; it is also used below as the size of the output buffer.
+        if(sourceSize == 0)
         {
             target = L"";
         }
         else
         {
-            //
-            // TODO: consider reimplementing without the wstring_convert helper
-            // to improve performance
-            // Note that wstring_convert is "stateful" and cannot be a shared data member
-            //
-            wstring_convert<CodeCvt> convert;
-
-            try
-            {
-                target = convert.from_bytes(reinterpret_cast<const char*>(sourceStart),
-                                            reinterpret_cast<const char*>(sourceEnd));
-            }
-            catch(const std::range_error& ex)
+            target.resize(sourceSize);
+            wchar_t* targetStart = const_cast<wchar_t*>(target.data());
+            wchar_t* targetEnd = targetStart + sourceSize;
+            wchar_t* targetNext = targetStart;
+
+            const char* sourceNext = reinterpret_cast<const char*>(sourceStart);
+
+            mbstate_t state = mbstate_t();
+            // Decode straight into target's storage; in() reports how far it got through sourceNext/targetNext.
+            codecvt_base::result result = _codecvt.in(state,
+                                                      reinterpret_cast<const char*>(sourceStart),
+                                                      reinterpret_cast<const char*>(sourceEnd),
+                                                      sourceNext,
+                                                      targetStart, targetEnd, targetNext);
+
+            if(result != codecvt_base::ok)
             {
-                throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+                throw IllegalConversionException(__FILE__, __LINE__, "codecvt.in failure");
             }
+            // Trim target down to the wchar_t that in() actually produced.
+            target.resize(targetNext - targetStart);
         }
     }
 
@@ -215,14 +218,12 @@ public:
 
             targetStart = buffer.getMoreBytes(chunkSize, targetStart);
             targetEnd = targetStart + chunkSize;
-
         }
         while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false);
 
         return targetStart;
     }
 
-
     virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
     {
         if(sourceStart == sourceEnd)
@@ -290,67 +291,38 @@ getUnicodeWstringConverter()
     return unicodeWstringConverter;
 }
 
-
 class UTF8BufferI : public UTF8Buffer
 {
 public:
 
-    UTF8BufferI() :
-        _buffer(0),
-        _offset(0)
-    {
-    }
-
-    ~UTF8BufferI()
-    {
-        free(_buffer);
-    }
-
+    //
+    // Makes room for howMany bytes after firstUnused (0 = nothing used yet) and returns the first unused byte
+    //
     Byte* getMoreBytes(size_t howMany, Byte* firstUnused)
     {
-        if(_buffer == 0)
-        {
-            _buffer = static_cast<Byte*>(malloc(howMany));
-            if(!_buffer)
-            {
-                throw std::bad_alloc();
-            }
-        }
-        else
+        size_t bytesUsed = 0;
+        if(firstUnused != 0)
         {
-            assert(firstUnused != 0);
-            _offset = firstUnused - _buffer;
-            Byte* newBuffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany));
-            if(!newBuffer)
-            {
-                reset();
-                throw std::bad_alloc();
-            }
-            else
-            {
-                _buffer = newBuffer;
-            }
+            bytesUsed = firstUnused - reinterpret_cast<const Byte*>(_buffer.data());
         }
 
-        return _buffer + _offset;
-    }
-
-    Byte* getBuffer()
-    {
-        return _buffer;
+        if(_buffer.size() < howMany + bytesUsed)
+        {
+            _buffer.resize(bytesUsed + howMany);
+        } 
+        
+        return const_cast<Byte*>(reinterpret_cast<const Byte*>(_buffer.data())) + bytesUsed;
     }
 
-    void reset()
+    void swap(string& other, const Byte* tail)
     {
-        free(_buffer);
-        _buffer = 0;
-        _offset = 0;
+        assert(tail >= reinterpret_cast<const Byte*>(_buffer.data()));
+        _buffer.resize(tail - reinterpret_cast<const Byte*>(_buffer.data())); // drop the bytes from tail onwards
+        other.swap(_buffer); // exchange contents with other: other receives the trimmed bytes
     }
 
 private:
-
-    Byte* _buffer;
-    size_t _offset;
+    string _buffer; // backing storage for the bytes handed out by getMoreBytes
 };
 
 #ifdef _WIN32
@@ -516,8 +488,8 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
         //
         UTF8BufferI buffer;
         Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer);
-        target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
-
+        buffer.swap(target, last);
+       
         //
         // If narrow string converter is present convert to the native narrow string encoding, otherwise
         // native narrow string encoding is UTF8 and we are done.
@@ -534,8 +506,7 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
 }
 
 wstring
-IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
-                         const WstringConverterPtr& wConverter)
+IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter)
 {
     wstring target;
     if(!v.empty())
@@ -549,7 +520,7 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
         {
             UTF8BufferI buffer;
             Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer);
-            tmp = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+            buffer.swap(tmp, last);
         }
         else
         {
@@ -577,7 +548,9 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv
     }
     UTF8BufferI buffer;
     Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer);
-    return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+    string result;
+    buffer.swap(result, last);
+    return result;
 }
 
 string
@@ -620,11 +593,7 @@ IceUtilInternal::toUTF16(const vector<Byte>& source)
 #ifdef ICE_HAS_CODECVT_UTF8
     assert(sizeof(Char16T) == sizeof(unsigned short));
 
-#ifdef ICE_LITTLE_ENDIAN
-    typedef wstring_convert<codecvt_utf8_utf16<Char16T, 0x10ffff, little_endian>, Char16T> Convert;
-#else
     typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert;
-#endif
 
     Convert convert;
 
diff --git a/cpp/src/IceUtil/Unicode.cpp b/cpp/src/IceUtil/Unicode.cpp
index 22ced7e61b2..4db36d29e9d 100644
--- a/cpp/src/IceUtil/Unicode.cpp
+++ b/cpp/src/IceUtil/Unicode.cpp
@@ -26,90 +26,80 @@ using namespace IceUtilInternal;
 
 namespace
 {
-        //
-        // Helper class, base never defined
-        // Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8.
-        //
-        template<size_t wcharSize>
-        struct WstringHelper
-        {
-                static ConversionResult toUTF8(
-                        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
-                        Byte*& targetStart, Byte* targetEnd);
+//
+// Helper class, base never defined
+// Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8.
+//
+template<size_t wcharSize> struct WstringHelper;
 
-                static ConversionResult fromUTF8(
-                        const Byte*& sourceStart, const Byte* sourceEnd,
-                        wchar_t*& targetStart, wchar_t* targetEnd);
-        };
 
-        template<>
-        struct WstringHelper<2>
-        {
-                static ConversionResult toUTF8(
-                        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
-                        Byte*& targetStart, Byte* targetEnd)
-                {
-                        return ConvertUTF16toUTF8(
-                                reinterpret_cast<const UTF16**>(&sourceStart),
-                                reinterpret_cast<const UTF16*>(sourceEnd),
-                                &targetStart, targetEnd, lenientConversion);
-                }
-
-                static ConversionResult fromUTF8(
-                        const Byte*& sourceStart, const Byte* sourceEnd,
-                        wchar_t*& targetStart, wchar_t* targetEnd)
-                {
-                        return ConvertUTF8toUTF16(
-                                &sourceStart, sourceEnd,
-                                reinterpret_cast<UTF16**>(&targetStart),
-                                reinterpret_cast<UTF16*>(targetEnd), lenientConversion);
-                }
-        };
-
-        template<>
-        struct WstringHelper<4>
-        {
-                static ConversionResult toUTF8(
-                        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
-                        Byte*& targetStart, Byte* targetEnd)
-                {
-                        return ConvertUTF32toUTF8(
-                                reinterpret_cast<const UTF32**>(&sourceStart),
-                                reinterpret_cast<const UTF32*>(sourceEnd),
-                                &targetStart, targetEnd, lenientConversion);
-                }
-
-                static ConversionResult fromUTF8(
-                        const Byte*& sourceStart, const Byte* sourceEnd,
-                        wchar_t*& targetStart, wchar_t* targetEnd)
-                {
-                        return ConvertUTF8toUTF32(
-                                &sourceStart, sourceEnd,
-                                reinterpret_cast<UTF32**>(&targetStart),
-                                reinterpret_cast<UTF32*>(targetEnd), lenientConversion);
-                }
-        };
-
-        void
-                checkResult(ConversionResult result)
+template<>
+struct WstringHelper<2>
+{
+    static ConversionResult toUTF8(
+        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+        Byte*& targetStart, Byte* targetEnd)
+    {
+        return ConvertUTF16toUTF8(
+            reinterpret_cast<const UTF16**>(&sourceStart),
+            reinterpret_cast<const UTF16*>(sourceEnd),
+            &targetStart, targetEnd, lenientConversion);
+    }
+
+    static ConversionResult fromUTF8(
+        const Byte*& sourceStart, const Byte* sourceEnd,
+        wchar_t*& targetStart, wchar_t* targetEnd)
+    {
+        return ConvertUTF8toUTF16(
+            &sourceStart, sourceEnd,
+            reinterpret_cast<UTF16**>(&targetStart),
+            reinterpret_cast<UTF16*>(targetEnd), lenientConversion);
+    }
+};
+
+template<>
+struct WstringHelper<4>
+{
+    static ConversionResult toUTF8(
+        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+        Byte*& targetStart, Byte* targetEnd)
+    {
+        return ConvertUTF32toUTF8(
+            reinterpret_cast<const UTF32**>(&sourceStart),
+            reinterpret_cast<const UTF32*>(sourceEnd),
+            &targetStart, targetEnd, lenientConversion);
+    }
+
+    static ConversionResult fromUTF8(
+        const Byte*& sourceStart, const Byte* sourceEnd,
+        wchar_t*& targetStart, wchar_t* targetEnd)
+    {
+        return ConvertUTF8toUTF32(
+            &sourceStart, sourceEnd,
+            reinterpret_cast<UTF32**>(&targetStart),
+            reinterpret_cast<UTF32*>(targetEnd), lenientConversion);
+    }
+};
+
+void checkResult(ConversionResult result) // throws IllegalConversionException unless result is conversionOK
+{
+    switch (result)
+    {
+        case conversionOK:
+            break;
+        case sourceExhausted:
+            throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted");
+        case sourceIllegal:
+            throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
+        case targetExhausted:
+            throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "target exhausted");
+        default:
         {
-                switch (result)
-                {
-                case conversionOK:
-                        break;
-                case sourceExhausted:
-                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted");
-                case sourceIllegal:
-                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
-                case targetExhausted:
-                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
-                default:
-                {
-                        assert(0);
-                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
-                }
-                }
+            assert(0);
+            throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
         }
+    }
+}
 }
 
 //
@@ -117,9 +107,8 @@ namespace
 //
 
 bool
-IceUtilInternal::convertUTFWstringToUTF8(
-    const wchar_t*& sourceStart, const wchar_t* sourceEnd,
-    Byte*& targetStart, Byte* targetEnd)
+IceUtilInternal::convertUTFWstringToUTF8(const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+                                         Byte*& targetStart, Byte* targetEnd)
 {
     ConversionResult result = WstringHelper<sizeof(wchar_t)>::toUTF8(
         sourceStart, sourceEnd, targetStart, targetEnd);
@@ -135,30 +124,20 @@ IceUtilInternal::convertUTFWstringToUTF8(
     }
 }
 
-
 void
-IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd,
-                                         std::wstring& target)
+IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd, std::wstring& target)
 {
-    //
-    // Could be reimplemented without this temporary wchar_t buffer
-    //
-    size_t size = static_cast<size_t>(sourceEnd - sourceStart);
-    wchar_t* outBuf = new wchar_t[size];
-    wchar_t* targetStart = outBuf;
-    wchar_t* targetEnd = targetStart + size;
-
-    ConversionResult result =
-        WstringHelper<sizeof(wchar_t)>::fromUTF8(
-            sourceStart, sourceEnd, targetStart, targetEnd);
-
-    if(result == conversionOK)
-    {
-        std::wstring s(outBuf, static_cast<size_t>(targetStart - outBuf));
-        s.swap(target);
-    }
-    delete[] outBuf;
+    size_t sourceSize = static_cast<size_t>(sourceEnd - sourceStart);
+    // Decode directly into target, sized to one wchar_t per UTF-8 byte (assumed to be an upper bound).
+    target.resize(sourceSize);
+    wchar_t* targetStart = const_cast<wchar_t*>(target.data());
+    wchar_t* targetEnd = targetStart + sourceSize;
+
+    ConversionResult result = WstringHelper<sizeof(wchar_t)>::fromUTF8(sourceStart, sourceEnd,
+                                                                       targetStart, targetEnd);
+    // checkResult throws unless result is conversionOK, so the trim below only runs on success.
     checkResult(result);
+    target.resize(targetStart - target.data());
 }
 
 void
diff --git a/cpp/test/IceUtil/unicode/Client.cpp b/cpp/test/IceUtil/unicode/Client.cpp
index b3d3912057b..64c8fe5f0dd 100644
--- a/cpp/test/IceUtil/unicode/Client.cpp
+++ b/cpp/test/IceUtil/unicode/Client.cpp
@@ -17,6 +17,9 @@
 #endif
 #include <fstream>
 
+// Uncomment to include performance testing
+//#define TEST_PERF
+
 using namespace IceUtil;
 using namespace std;
 
@@ -45,14 +48,14 @@ main(int argc, char* argv[])
 #ifdef _WIN32
 
 #   ifdef __MINGW32__
-	dir = argv[1];
+        dir = argv[1];
 #   else
-	dir = IceUtil::wstringToString(argv[1]);
+        dir = IceUtil::wstringToString(argv[1]);
 #   endif
-	dir += "\\";
+        dir += "\\";
 #else
-	dir = argv[1];
-	dir += "/";
+        dir = argv[1];
+        dir += "/";
 #endif
     }
 
@@ -67,226 +70,264 @@ main(int argc, char* argv[])
     string wcoeurFile = string("coeur.") + wstringEncoding;
 
     {
-	cout << "testing UTF-8 to wstring (" << wstringEncoding << ") conversion... ";
-	ifstream is((dir + "coeur.utf8").c_str());
-	test(is.good());
-	ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary);
-	test(bis.good());
-
-	int lineNumber = 0;
-
-	do
-	{
-	    string line;
-	    getline(is, line, '\n');
-	    lineNumber++;
-	    wstring wline = stringToWstring(line);
-
-	    for(size_t i = 0; i < wline.length(); ++i)
-	    {
-		wchar_t wc = wline[i];
-		const char* buffer = reinterpret_cast<char*>(&wc);
-		for(size_t j = 0; j < sizeof(wchar_t); ++j)
-		{
-		    test(bis.good());
-		    char c;
-		    bis.get(c);
-		    if(buffer[j] != c)
-		    {
-			cerr << "Error at line " << lineNumber << " column " << i << endl;
-			cerr << "buffer[j] == " << hex << (int)static_cast<unsigned char>(buffer[j]) << endl;
-			cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl;
-		    }
-		    test(buffer[j] == c);
-		}
-	    }
-	    //
-	    // Skip newline character (Unix-style newline)
-	    //
-	    if(is.good())
-	    {
-		for(size_t j = 0; j < sizeof(wchar_t); ++j)
-		{
-		    test(bis.good());
-		    char c;
-		    bis.get(c);
-		}
-	    }
-	    else
-	    {
-		char c;
-		bis.get(c);
-		test(bis.eof());
-	    }
-	} while(is.good());
-
-	cout << "ok" << endl;
+        cout << "testing UTF-8 to wstring (" << wstringEncoding << ") conversion... ";
+        ifstream is((dir + "coeur.utf8").c_str());
+        test(is.good());
+        ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary);
+        test(bis.good());
+
+        int lineNumber = 0;
+
+        do
+        {
+            string line;
+            getline(is, line, '\n');
+            lineNumber++;
+            wstring wline = stringToWstring(line);
+
+            for(size_t i = 0; i < wline.length(); ++i)
+            {
+                wchar_t wc = wline[i];
+                const char* buffer = reinterpret_cast<char*>(&wc);
+                for(size_t j = 0; j < sizeof(wchar_t); ++j)
+                {
+                    test(bis.good());
+                    char c;
+                    bis.get(c);
+                    if(buffer[j] != c)
+                    {
+                        cerr << "Error at line " << lineNumber << " column " << i << endl;
+                        cerr << "buffer[j] == " << hex << (int)static_cast<unsigned char>(buffer[j]) << endl;
+                        cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl;
+                    }
+                    test(buffer[j] == c);
+                }
+            }
+            //
+            // Skip newline character (Unix-style newline)
+            //
+            if(is.good())
+            {
+                for(size_t j = 0; j < sizeof(wchar_t); ++j)
+                {
+                    test(bis.good());
+                    char c;
+                    bis.get(c);
+                }
+            }
+            else
+            {
+                char c;
+                bis.get(c);
+                test(bis.eof());
+            }
+        } while(is.good());
+
+        cout << "ok" << endl;
+    }
+
+    {
+        cout << "testing wstring (" << wstringEncoding << ") to UTF-8 conversion... ";
+
+        ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary);
+        test(bis.good());
+
+        wstring ws;
+        char c;
+
+        do
+        {
+            wchar_t wc;
+            char* buffer = reinterpret_cast<char*>(&wc);
+
+            for(size_t j = 0; j < sizeof(wchar_t); ++j)
+            {
+                if(!bis.good())
+                {
+                    break;
+                }
+                bis.get(c);
+                buffer[j] = c;
+            }
+
+            if(bis.good())
+            {
+                ws.push_back(wc);
+            }
+        } while(bis.good());
+
+        string s = wstringToString(ws);
+
+        ifstream nbis((dir + "coeur.utf8").c_str(), ios_base::binary);
+        test(nbis.good());
+
+        for(size_t i = 0; i < s.size(); ++i)
+        {
+            test(nbis.good());
+            nbis.get(c);
+            char ci = s[i];
+
+            if(c != ci)
+            {
+                cerr << "i == " << i << endl;
+                cerr << "ci == " << hex << (int)static_cast<unsigned char>(ci) << endl;
+                cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl;
+            }
+            test(c == s[i]);
+        }
+        test(!nbis.eof());
+        nbis.get(c);
+        test(nbis.eof());
+
+        cout << "ok" << endl;
     }
 
     {
-	cout << "testing wstring (" << wstringEncoding << ") to UTF-8 conversion... ";
-
-	ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary);
-	test(bis.good());
-
-	wstring ws;
-	char c;
-
-	do
-	{
-	    wchar_t wc;
-	    char* buffer = reinterpret_cast<char*>(&wc);
-
-	    for(size_t j = 0; j < sizeof(wchar_t); ++j)
-	    {
-		if(!bis.good())
-		{
-		    break;
-		}
-		bis.get(c);
-		buffer[j] = c;
-	    }
-
-	    if(bis.good())
-	    {
-		ws.push_back(wc);
-	    }
-	} while(bis.good());
-
-	string s = wstringToString(ws);
-
-	ifstream nbis((dir + "coeur.utf8").c_str(), ios_base::binary);
-	test(nbis.good());
-
-	for(size_t i = 0; i < s.size(); ++i)
-	{
-	    test(nbis.good());
-	    nbis.get(c);
-	    char ci = s[i];
-
-	    if(c != ci)
-	    {
-		cerr << "i == " << i << endl;
-		cerr << "ci == " << hex << (int)static_cast<unsigned char>(ci) << endl;
-		cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl;
-	    }
-	    test(c == s[i]);
-	}
-	test(!nbis.eof());
-	nbis.get(c);
-	test(nbis.eof());
-
-	cout << "ok" << endl;
+        cout << "testing wstring with surrogates... ";
+
+        //
+        // Euro sign (U+20AC) is encoded with 1 UTF-16 code unit, and 3 UTF-8 code units
+        // U+10437 is a Deseret character, encoded with 2 UTF-16 code units, and 4 UTF-8 code units
+        //
+        wstring ws = L"\u20ac\u20ac\U00010437";
+
+        if(sizeof(wchar_t) == 2)
+        {
+            test(ws.length() == 4);
+        }
+        else
+        {
+            test(sizeof(wchar_t) == 4);
+            test(ws.length() == 3);
+        }
+
+        //
+        // The Unicode string converter implementation allocates an initial buffer
+        // of size max(2 * (sourceEnd - sourceStart), 4).
+        // With UTF-16 encoding, that's 8 and the first 2 euros will use the first 6
+        // bytes of the initial buffer.
+
+        string ns = IceUtil::wstringToString(ws);
+
+        const string good = "\xE2\x82\xAC\xE2\x82\xAC\xF0\x90\x90\xB7";
+        test(ns == good);
+        test(ws == IceUtil::stringToWstring(ns));
+
+        cout << "ok" << endl;
+
+        cout << "testing IceUtilInternal::toUTF16, toUTF32 and fromUTF32... ";
+
+        vector<Byte> u8 = vector<Byte>(reinterpret_cast<const Byte*>(ns.data()),
+                                       reinterpret_cast<const Byte*>(ns.data() + ns.length()));
+
+        vector<unsigned short> u16 = IceUtilInternal::toUTF16(u8);
+        test(u16.size() == 4);
+        test(u16[0] == 0x20ac);
+        test(u16[1] == 0x20ac);
+        test(u16[2] == 0xd801);
+        test(u16[3] == 0xdc37);
+
+        vector<unsigned int> u32 = IceUtilInternal::toUTF32(u8);
+        test(u32.size() == 3);
+        test(u32[0] == 0x20ac);
+        test(u32[1] == 0x20ac);
+        test(u32[2] == 0x10437);
+
+        vector<Byte> nu8 = IceUtilInternal::fromUTF32(u32);
+        test(nu8 == u8);
+
+        cout << "ok" << endl;
     }
 
+#ifdef TEST_PERF
     {
-	cout << "testing wstring with surrogates... ";
-
-	//
-	// Euro sign (U+20AC) is encoded with 1 UTF-16 code unit, and 3 UTF-8 code units
-	// U+10437 is a Deseret character, encoded with 2 UTF-16 code units, and 4 UTF-8 code units
-	//
-	wstring ws = L"\u20ac\u20ac\U00010437";
-
-	if(sizeof(wchar_t) == 2)
-	{
-	    test(ws.length() == 4);
-	}
-	else
-	{
-	    test(sizeof(wchar_t) == 4);
-	    test(ws.length() == 3);
-	}
-
-	//
-	// The Unicode string converter implementation allocates an initial buffer
-	// of size max(2 * (sourceEnd - sourceStart), 4).
-	// With UTF-16 encoding, that's 8 and the first 2 euros will use the first 6
-	// bytes of the initial buffer.
-
-	string ns = IceUtil::wstringToString(ws);
-
-	const string good = "\xE2\x82\xAC\xE2\x82\xAC\xF0\x90\x90\xB7";
-	test(ns == good);
-	test(ws == IceUtil::stringToWstring(ns));
-
-	vector<Byte> u8 = vector<Byte>(reinterpret_cast<const Byte*>(ns.data()),
-				       reinterpret_cast<const Byte*>(ns.data() + ns.length()));
-
-	vector<unsigned short> u16 = IceUtilInternal::toUTF16(u8);
-	test(u16.size() == 4);
-	test(u16[0] == 0x20ac);
-	test(u16[1] == 0x20ac);
-	test(u16[2] == 0xd801);
-	test(u16[3] == 0xdc37);
-
-	vector<unsigned int> u32 = IceUtilInternal::toUTF32(u8);
-	test(u32.size() == 3);
-	test(u32[0] == 0x20ac);
-	test(u32[1] == 0x20ac);
-	test(u32[2] == 0x10437);
-
-	vector<Byte> nu8 = IceUtilInternal::fromUTF32(u32);
-	test(nu8 == u8);
-
-	cout << "ok" << endl;
+        // The only performance-critical code is the UnicodeWstringConverter
+        // that is used whenever we marshal/unmarshal wstrings.
+
+        const long iterations = 5000000;
+        const wstring ws = L"abcdefghijklmnopqrstuvwxyz+\u20ac\u20ac\U00010437";
+        const string ns = IceUtil::wstringToString(ws);
+        test(IceUtil::stringToWstring(ns) == ws);
+
+        cout << "testing performance with " << iterations << " iterations... ";
+
+        IceUtil::Time toU8 = IceUtil::Time::now(IceUtil::Time::Monotonic);
+        for(long i = 0; i < iterations; ++i)
+        {
+            test(IceUtil::wstringToString(ws) == ns);
+        }
+        IceUtil::Time now = IceUtil::Time::now(IceUtil::Time::Monotonic);
+        toU8 = now - toU8;
+
+        IceUtil::Time fromU8 = now;
+        for(long i = 0; i < iterations; ++i)
+        {
+            test(IceUtil::stringToWstring(ns) == ws);
+        }
+        fromU8 = IceUtil::Time::now(IceUtil::Time::Monotonic) - fromU8;
+
+        cout << "toUTF8 = " << toU8 * 1000 << " ms; fromUTF8 = "
+             << fromU8 * 1000 << " ms ok" << endl;
     }
 
+#endif
+
+
     {
-	cout << "testing error handling... ";
-
-	// From http://stackoverflow.com/questions/1301402/example-invalid-utf8-string
-
-	string badUTF8[] = {
-	    "\xc3\x28",
-	    "\xa0\xa1",
-	    "\xe2\x28\xa1",
-	    "\xe2\x82\x28",
-	    "\xf0\x28\x8c\xbc",
-	    "\xf0\x90\x28\xbc",
-	    "\xf0\x28\x8c\x28",
-	    "\xf8\xa1\xa1\xa1\xa1",
-	    "\xfc\xa1\xa1\xa1\xa1\xa1",
-	    ""
-	};
-
-	for(size_t i = 0; badUTF8[i] != ""; ++i)
-	{
-	    try
-	    {
-		wstring ws = IceUtil::stringToWstring(badUTF8[i]);
-		wcerr << L"Unexpected: " << ws << endl;
-		test(false);
-	    }
-	    catch(const IceUtil::IllegalConversionException&)
-	    {}
-	}
-
-	// TODO: need test for bad UTF-32 strings
+        cout << "testing error handling... ";
+
+        // From http://stackoverflow.com/questions/1301402/example-invalid-utf8-string
+
+        string badUTF8[] = {
+            "\xc3\x28",
+            "\xa0\xa1",
+            "\xe2\x28\xa1",
+            "\xe2\x82\x28",
+            "\xf0\x28\x8c\xbc",
+            "\xf0\x90\x28\xbc",
+            "\xf0\x28\x8c\x28",
+            "\xf8\xa1\xa1\xa1\xa1",
+            "\xfc\xa1\xa1\xa1\xa1\xa1",
+            ""
+        };
+
+        for(size_t i = 0; badUTF8[i] != ""; ++i)
+        {
+            try
+            {
+                wstring ws = IceUtil::stringToWstring(badUTF8[i]);
+                wcerr << L"Unexpected: " << ws << endl;
+                test(false);
+            }
+            catch(const IceUtil::IllegalConversionException&)
+            {}
+        }
+
+        // TODO: need test for bad UTF-32 strings
 #ifdef _WIN32
 
-	// Note: for an unknown reason, the conversion works without
-	// the extra letter (x below) when using codecvt_utf8_utf16.
-
-	wstring badWstring[] = {
-	    wstring(1, wchar_t(0xD800)) + L"x",
-	    wstring(2, wchar_t(0xDB7F)),
-	    L""
-	};
-
-	for(size_t i = 0; badWstring[i] != L""; ++i)
-	{
-	    try
-	    {
-		string s = IceUtil::wstringToString(badWstring[i]);
-		test(false);
-	    }
-	    catch(const IceUtil::IllegalConversionException&)
-	    {}
-	}
+        // Note: for an unknown reason, the conversion works without
+        // the extra letter (x below) when using codecvt_utf8_utf16.
+
+        wstring badWstring[] = {
+            wstring(1, wchar_t(0xD800)) + L"x",
+            wstring(2, wchar_t(0xDB7F)),
+            L""
+        };
+
+        for(size_t i = 0; badWstring[i] != L""; ++i)
+        {
+            try
+            {
+                string s = IceUtil::wstringToString(badWstring[i]);
+                test(false);
+            }
+            catch(const IceUtil::IllegalConversionException&)
+            {}
+        }
 #endif
 
-	cout << "ok" << endl;
+        cout << "ok" << endl;
 
     }
     return EXIT_SUCCESS;
author	Bernard Normier <bernard@zeroc.com>	2016-06-04 16:18:18 -0400
committer	Bernard Normier <bernard@zeroc.com>	2016-06-04 16:18:18 -0400
commit	a59bb01921429e8d6963d63c22b91c995d1c4631 (patch)
tree	e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp
parent	More UTF tests (diff)
download	ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2 ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip