UnicodeWstringConverter performance improvement and cleanup

author: Bernard Normier <bernard@zeroc.com> 2016-06-04 16:18:18 -0400
committer: Bernard Normier <bernard@zeroc.com> 2016-06-04 16:18:18 -0400
commit: a59bb01921429e8d6963d63c22b91c995d1c4631 (patch)
tree: e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp/src
parent: More UTF tests (diff)
download: ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip
2 files changed, 129 insertions, 181 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp
index 8b60c48d53f..cb15037ecfa 100644
--- a/cpp/src/IceUtil/StringConverter.cpp
+++ b/cpp/src/IceUtil/StringConverter.cpp
@@ -43,11 +43,7 @@ struct SelectCodeCvt;
 template<>
 struct SelectCodeCvt<2>
 {
-#ifdef ICE_LITTLE_ENDIAN
-    typedef std::codecvt_utf8_utf16<wchar_t, 0x10ffff, little_endian> Type;
-#else
     typedef std::codecvt_utf8_utf16<wchar_t> Type;
-#endif
 };
 
 template<>
@@ -155,28 +151,35 @@ public:
 
     virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
     {
-        if(sourceStart == sourceEnd)
+        const size_t sourceSize = sourceEnd - sourceStart;
+
+        if(sourceSize == 0)
         {
             target = L"";
         }
         else
         {
-            //
-            // TODO: consider reimplementing without the wstring_convert helper
-            // to improve performance
-            // Note that wstring_convert is "stateful" and cannot be a shared data member
-            //
-            wstring_convert<CodeCvt> convert;
-
-            try
-            {
-                target = convert.from_bytes(reinterpret_cast<const char*>(sourceStart),
-                                            reinterpret_cast<const char*>(sourceEnd));
-            }
-            catch(const std::range_error& ex)
+            target.resize(sourceSize);
+            wchar_t* targetStart = const_cast<wchar_t*>(target.data());
+            wchar_t* targetEnd = targetStart + sourceSize;
+            wchar_t* targetNext = targetStart;
+
+            const char* sourceNext = reinterpret_cast<const char*>(sourceStart);
+
+            mbstate_t state = mbstate_t();
+
+            codecvt_base::result result = _codecvt.in(state,
+                                                      reinterpret_cast<const char*>(sourceStart),
+                                                      reinterpret_cast<const char*>(sourceEnd),
+                                                      sourceNext,
+                                                      targetStart, targetEnd, targetNext);
+
+            if(result != codecvt_base::ok)
             {
-                throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+                throw IllegalConversionException(__FILE__, __LINE__, "codecvt.in failure");
             }
+
+            target.resize(targetNext - targetStart);
         }
     }
 
@@ -215,14 +218,12 @@ public:
 
             targetStart = buffer.getMoreBytes(chunkSize, targetStart);
             targetEnd = targetStart + chunkSize;
-
         }
         while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false);
 
         return targetStart;
     }
 
-
     virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
     {
         if(sourceStart == sourceEnd)
@@ -290,67 +291,38 @@ getUnicodeWstringConverter()
     return unicodeWstringConverter;
 }
 
-
 class UTF8BufferI : public UTF8Buffer
 {
 public:
 
-    UTF8BufferI() :
-        _buffer(0),
-        _offset(0)
-    {
-    }
-
-    ~UTF8BufferI()
-    {
-        free(_buffer);
-    }
-
+    //
+    // Returns a pointer to the first unused byte in the (possibly resized) buffer
+    //
     Byte* getMoreBytes(size_t howMany, Byte* firstUnused)
     {
-        if(_buffer == 0)
-        {
-            _buffer = static_cast<Byte*>(malloc(howMany));
-            if(!_buffer)
-            {
-                throw std::bad_alloc();
-            }
-        }
-        else
+        size_t bytesUsed = 0;
+        if(firstUnused != 0)
         {
-            assert(firstUnused != 0);
-            _offset = firstUnused - _buffer;
-            Byte* newBuffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany));
-            if(!newBuffer)
-            {
-                reset();
-                throw std::bad_alloc();
-            }
-            else
-            {
-                _buffer = newBuffer;
-            }
+            bytesUsed = firstUnused - reinterpret_cast<const Byte*>(_buffer.data());
         }
 
-        return _buffer + _offset;
-    }
-
-    Byte* getBuffer()
-    {
-        return _buffer;
+        if(_buffer.size() < howMany + bytesUsed)
+        {
+            _buffer.resize(bytesUsed + howMany);
+        } 
+        
+        return const_cast<Byte*>(reinterpret_cast<const Byte*>(_buffer.data())) + bytesUsed;
     }
 
-    void reset()
+    void swap(string& other, const Byte* tail)
     {
-        free(_buffer);
-        _buffer = 0;
-        _offset = 0;
+        assert(tail >= reinterpret_cast<const Byte*>(_buffer.data()));
+        _buffer.resize(tail - reinterpret_cast<const Byte*>(_buffer.data()));
+        other.swap(_buffer);
     }
 
 private:
-
-    Byte* _buffer;
-    size_t _offset;
+    string _buffer;
 };
 
 #ifdef _WIN32
@@ -516,8 +488,8 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
         //
         UTF8BufferI buffer;
         Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer);
-        target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
-
+        buffer.swap(target, last);
+       
         //
         // If narrow string converter is present convert to the native narrow string encoding, otherwise
         // native narrow string encoding is UTF8 and we are done.
@@ -534,8 +506,7 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
 }
 
 wstring
-IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
-                         const WstringConverterPtr& wConverter)
+IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter)
 {
     wstring target;
     if(!v.empty())
@@ -549,7 +520,7 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
         {
             UTF8BufferI buffer;
             Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer);
-            tmp = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+            buffer.swap(tmp, last);
         }
         else
         {
@@ -577,7 +548,9 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv
     }
     UTF8BufferI buffer;
     Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer);
-    return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+    string result;
+    buffer.swap(result, last);
+    return result;
 }
 
 string
@@ -620,11 +593,7 @@ IceUtilInternal::toUTF16(const vector<Byte>& source)
 #ifdef ICE_HAS_CODECVT_UTF8
     assert(sizeof(Char16T) == sizeof(unsigned short));
 
-#ifdef ICE_LITTLE_ENDIAN
-    typedef wstring_convert<codecvt_utf8_utf16<Char16T, 0x10ffff, little_endian>, Char16T> Convert;
-#else
     typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert;
-#endif
 
     Convert convert;
 
diff --git a/cpp/src/IceUtil/Unicode.cpp b/cpp/src/IceUtil/Unicode.cpp
index 22ced7e61b2..4db36d29e9d 100644
--- a/cpp/src/IceUtil/Unicode.cpp
+++ b/cpp/src/IceUtil/Unicode.cpp
@@ -26,90 +26,80 @@ using namespace IceUtilInternal;
 
 namespace
 {
-        //
-        // Helper class, base never defined
-        // Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8.
-        //
-        template<size_t wcharSize>
-        struct WstringHelper
-        {
-                static ConversionResult toUTF8(
-                        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
-                        Byte*& targetStart, Byte* targetEnd);
+//
+// Helper class; the primary template is only declared, never defined
+// Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8.
+//
+template<size_t wcharSize> struct WstringHelper;
 
-                static ConversionResult fromUTF8(
-                        const Byte*& sourceStart, const Byte* sourceEnd,
-                        wchar_t*& targetStart, wchar_t* targetEnd);
-        };
 
-        template<>
-        struct WstringHelper<2>
-        {
-                static ConversionResult toUTF8(
-                        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
-                        Byte*& targetStart, Byte* targetEnd)
-                {
-                        return ConvertUTF16toUTF8(
-                                reinterpret_cast<const UTF16**>(&sourceStart),
-                                reinterpret_cast<const UTF16*>(sourceEnd),
-                                &targetStart, targetEnd, lenientConversion);
-                }
-
-                static ConversionResult fromUTF8(
-                        const Byte*& sourceStart, const Byte* sourceEnd,
-                        wchar_t*& targetStart, wchar_t* targetEnd)
-                {
-                        return ConvertUTF8toUTF16(
-                                &sourceStart, sourceEnd,
-                                reinterpret_cast<UTF16**>(&targetStart),
-                                reinterpret_cast<UTF16*>(targetEnd), lenientConversion);
-                }
-        };
-
-        template<>
-        struct WstringHelper<4>
-        {
-                static ConversionResult toUTF8(
-                        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
-                        Byte*& targetStart, Byte* targetEnd)
-                {
-                        return ConvertUTF32toUTF8(
-                                reinterpret_cast<const UTF32**>(&sourceStart),
-                                reinterpret_cast<const UTF32*>(sourceEnd),
-                                &targetStart, targetEnd, lenientConversion);
-                }
-
-                static ConversionResult fromUTF8(
-                        const Byte*& sourceStart, const Byte* sourceEnd,
-                        wchar_t*& targetStart, wchar_t* targetEnd)
-                {
-                        return ConvertUTF8toUTF32(
-                                &sourceStart, sourceEnd,
-                                reinterpret_cast<UTF32**>(&targetStart),
-                                reinterpret_cast<UTF32*>(targetEnd), lenientConversion);
-                }
-        };
-
-        void
-                checkResult(ConversionResult result)
+template<>
+struct WstringHelper<2>
+{
+    static ConversionResult toUTF8(
+        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+        Byte*& targetStart, Byte* targetEnd)
+    {
+        return ConvertUTF16toUTF8(
+            reinterpret_cast<const UTF16**>(&sourceStart),
+            reinterpret_cast<const UTF16*>(sourceEnd),
+            &targetStart, targetEnd, lenientConversion);
+    }
+
+    static ConversionResult fromUTF8(
+        const Byte*& sourceStart, const Byte* sourceEnd,
+        wchar_t*& targetStart, wchar_t* targetEnd)
+    {
+        return ConvertUTF8toUTF16(
+            &sourceStart, sourceEnd,
+            reinterpret_cast<UTF16**>(&targetStart),
+            reinterpret_cast<UTF16*>(targetEnd), lenientConversion);
+    }
+};
+
+template<>
+struct WstringHelper<4>
+{
+    static ConversionResult toUTF8(
+        const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+        Byte*& targetStart, Byte* targetEnd)
+    {
+        return ConvertUTF32toUTF8(
+            reinterpret_cast<const UTF32**>(&sourceStart),
+            reinterpret_cast<const UTF32*>(sourceEnd),
+            &targetStart, targetEnd, lenientConversion);
+    }
+
+    static ConversionResult fromUTF8(
+        const Byte*& sourceStart, const Byte* sourceEnd,
+        wchar_t*& targetStart, wchar_t* targetEnd)
+    {
+        return ConvertUTF8toUTF32(
+            &sourceStart, sourceEnd,
+            reinterpret_cast<UTF32**>(&targetStart),
+            reinterpret_cast<UTF32*>(targetEnd), lenientConversion);
+    }
+};
+
+void checkResult(ConversionResult result)
+{
+    switch (result)
+    {
+        case conversionOK:
+            break;
+        case sourceExhausted:
+            throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted");
+        case sourceIllegal:
+            throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
+        case targetExhausted:
+            throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "target exhausted");
+        default:
         {
-                switch (result)
-                {
-                case conversionOK:
-                        break;
-                case sourceExhausted:
-                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted");
-                case sourceIllegal:
-                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
-                case targetExhausted:
-                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
-                default:
-                {
-                        assert(0);
-                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
-                }
-                }
+            assert(0);
+            throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
         }
+    }
+}
 }
 
 //
@@ -117,9 +107,8 @@ namespace
 //
 
 bool
-IceUtilInternal::convertUTFWstringToUTF8(
-    const wchar_t*& sourceStart, const wchar_t* sourceEnd,
-    Byte*& targetStart, Byte* targetEnd)
+IceUtilInternal::convertUTFWstringToUTF8(const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+                                         Byte*& targetStart, Byte* targetEnd)
 {
     ConversionResult result = WstringHelper<sizeof(wchar_t)>::toUTF8(
         sourceStart, sourceEnd, targetStart, targetEnd);
@@ -135,30 +124,20 @@ IceUtilInternal::convertUTFWstringToUTF8(
     }
 }
 
-
 void
-IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd,
-                                         std::wstring& target)
+IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd, std::wstring& target)
 {
-    //
-    // Could be reimplemented without this temporary wchar_t buffer
-    //
-    size_t size = static_cast<size_t>(sourceEnd - sourceStart);
-    wchar_t* outBuf = new wchar_t[size];
-    wchar_t* targetStart = outBuf;
-    wchar_t* targetEnd = targetStart + size;
-
-    ConversionResult result =
-        WstringHelper<sizeof(wchar_t)>::fromUTF8(
-            sourceStart, sourceEnd, targetStart, targetEnd);
-
-    if(result == conversionOK)
-    {
-        std::wstring s(outBuf, static_cast<size_t>(targetStart - outBuf));
-        s.swap(target);
-    }
-    delete[] outBuf;
+    size_t sourceSize = static_cast<size_t>(sourceEnd - sourceStart);
+
+    target.resize(sourceSize);
+    wchar_t* targetStart = const_cast<wchar_t*>(target.data());
+    wchar_t* targetEnd = targetStart + sourceSize;
+
+    ConversionResult result = WstringHelper<sizeof(wchar_t)>::fromUTF8(sourceStart, sourceEnd,
+                                                                       targetStart, targetEnd);
+
     checkResult(result);
+    target.resize(targetStart - target.data());
 }
 
 void
author	Bernard Normier <bernard@zeroc.com>	2016-06-04 16:18:18 -0400
committer	Bernard Normier <bernard@zeroc.com>	2016-06-04 16:18:18 -0400
commit	a59bb01921429e8d6963d63c22b91c995d1c4631 (patch)
tree	e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp/src
parent	More UTF tests (diff)
download	ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2 ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip