UnicodeWstringConverter performance improvement and cleanup

author: Bernard Normier <bernard@zeroc.com> 2016-06-04 16:18:18 -0400
committer: Bernard Normier <bernard@zeroc.com> 2016-06-04 16:18:18 -0400
commit: a59bb01921429e8d6963d63c22b91c995d1c4631 (patch)
tree: e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp/src/IceUtil/StringConverter.cpp
parent: More UTF tests (diff)
download: ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip
1 files changed, 46 insertions, 77 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp
index 8b60c48d53f..cb15037ecfa 100644
--- a/cpp/src/IceUtil/StringConverter.cpp
+++ b/cpp/src/IceUtil/StringConverter.cpp
@@ -43,11 +43,7 @@ struct SelectCodeCvt;
 template<>
 struct SelectCodeCvt<2>
 {
-#ifdef ICE_LITTLE_ENDIAN
-    typedef std::codecvt_utf8_utf16<wchar_t, 0x10ffff, little_endian> Type;
-#else
     typedef std::codecvt_utf8_utf16<wchar_t> Type;
-#endif
 };
 
 template<>
@@ -155,28 +151,35 @@ public:
 
     virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
     {
-        if(sourceStart == sourceEnd)
+        const size_t sourceSize = sourceEnd - sourceStart;
+
+        if(sourceSize == 0)
         {
             target = L"";
         }
         else
         {
-            //
-            // TODO: consider reimplementing without the wstring_convert helper
-            // to improve performance
-            // Note that wstring_convert is "stateful" and cannot be a shared data member
-            //
-            wstring_convert<CodeCvt> convert;
-
-            try
-            {
-                target = convert.from_bytes(reinterpret_cast<const char*>(sourceStart),
-                                            reinterpret_cast<const char*>(sourceEnd));
-            }
-            catch(const std::range_error& ex)
+            target.resize(sourceSize);
+            wchar_t* targetStart = const_cast<wchar_t*>(target.data());
+            wchar_t* targetEnd = targetStart + sourceSize;
+            wchar_t* targetNext = targetStart;
+
+            const char* sourceNext = reinterpret_cast<const char*>(sourceStart);
+
+            mbstate_t state = mbstate_t();
+
+            codecvt_base::result result = _codecvt.in(state,
+                                                      reinterpret_cast<const char*>(sourceStart),
+                                                      reinterpret_cast<const char*>(sourceEnd),
+                                                      sourceNext,
+                                                      targetStart, targetEnd, targetNext);
+
+            if(result != codecvt_base::ok)
             {
-                throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+                throw IllegalConversionException(__FILE__, __LINE__, "codecvt.in failure");
             }
+
+            target.resize(targetNext - targetStart);
         }
     }
 
@@ -215,14 +218,12 @@ public:
 
             targetStart = buffer.getMoreBytes(chunkSize, targetStart);
             targetEnd = targetStart + chunkSize;
-
         }
         while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false);
 
         return targetStart;
     }
 
-
     virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
     {
         if(sourceStart == sourceEnd)
@@ -290,67 +291,38 @@ getUnicodeWstringConverter()
     return unicodeWstringConverter;
 }
 
-
 class UTF8BufferI : public UTF8Buffer
 {
 public:
 
-    UTF8BufferI() :
-        _buffer(0),
-        _offset(0)
-    {
-    }
-
-    ~UTF8BufferI()
-    {
-        free(_buffer);
-    }
-
+    //
+    // Returns the first unused byte in the resized buffer
+    // 
     Byte* getMoreBytes(size_t howMany, Byte* firstUnused)
     {
-        if(_buffer == 0)
-        {
-            _buffer = static_cast<Byte*>(malloc(howMany));
-            if(!_buffer)
-            {
-                throw std::bad_alloc();
-            }
-        }
-        else
+        size_t bytesUsed = 0;
+        if(firstUnused != 0)
         {
-            assert(firstUnused != 0);
-            _offset = firstUnused - _buffer;
-            Byte* newBuffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany));
-            if(!newBuffer)
-            {
-                reset();
-                throw std::bad_alloc();
-            }
-            else
-            {
-                _buffer = newBuffer;
-            }
+            bytesUsed = firstUnused - reinterpret_cast<const Byte*>(_buffer.data());
         }
 
-        return _buffer + _offset;
-    }
-
-    Byte* getBuffer()
-    {
-        return _buffer;
+        if(_buffer.size() < howMany + bytesUsed)
+        {
+            _buffer.resize(bytesUsed + howMany);
+        } 
+        
+        return const_cast<Byte*>(reinterpret_cast<const Byte*>(_buffer.data())) + bytesUsed;
     }
 
-    void reset()
+    void swap(string& other, const Byte* tail)
     {
-        free(_buffer);
-        _buffer = 0;
-        _offset = 0;
+        assert(tail >= reinterpret_cast<const Byte*>(_buffer.data()));
+        _buffer.resize(tail - reinterpret_cast<const Byte*>(_buffer.data()));
+        other.swap(_buffer);
     }
 
 private:
-
-    Byte* _buffer;
-    size_t _offset;
+    string _buffer;
 };
 
 #ifdef _WIN32
@@ -516,8 +488,8 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
         //
         UTF8BufferI buffer;
         Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer);
-        target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
-
+        buffer.swap(target, last);
+       
         //
         // If narrow string converter is present convert to the native narrow string encoding, otherwise
         // native narrow string encoding is UTF8 and we are done.
@@ -534,8 +506,7 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
 }
 
 wstring
-IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
-                         const WstringConverterPtr& wConverter)
+IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter)
 {
     wstring target;
     if(!v.empty())
@@ -549,7 +520,7 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
         {
             UTF8BufferI buffer;
             Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer);
-            tmp = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+            buffer.swap(tmp, last);
         }
         else
         {
@@ -577,7 +548,9 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv
     }
     UTF8BufferI buffer;
     Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer);
-    return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+    string result;
+    buffer.swap(result, last);
+    return result;
 }
 
 string
@@ -620,11 +593,7 @@ IceUtilInternal::toUTF16(const vector<Byte>& source)
 #ifdef ICE_HAS_CODECVT_UTF8
     assert(sizeof(Char16T) == sizeof(unsigned short));
 
-#ifdef ICE_LITTLE_ENDIAN
-    typedef wstring_convert<codecvt_utf8_utf16<Char16T, 0x10ffff, little_endian>, Char16T> Convert;
-#else
     typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert;
-#endif
 
     Convert convert;
author	Bernard Normier <bernard@zeroc.com>	2016-06-04 16:18:18 -0400
committer	Bernard Normier <bernard@zeroc.com>	2016-06-04 16:18:18 -0400
commit	a59bb01921429e8d6963d63c22b91c995d1c4631 (patch)
tree	e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp/src/IceUtil/StringConverter.cpp
parent	More UTF tests (diff)
download	ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2 ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip