diff options
author | Bernard Normier <bernard@zeroc.com> | 2016-06-04 16:18:18 -0400 |
---|---|---|
committer | Bernard Normier <bernard@zeroc.com> | 2016-06-04 16:18:18 -0400 |
commit | a59bb01921429e8d6963d63c22b91c995d1c4631 (patch) | |
tree | e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp/src | |
parent | More UTF tests (diff) | |
download | ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2 ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip |
UnicodeWstringConverter performance improvement and cleanup
Diffstat (limited to 'cpp/src')
-rw-r--r-- | cpp/src/IceUtil/StringConverter.cpp | 123 | ||||
-rw-r--r-- | cpp/src/IceUtil/Unicode.cpp | 187 |
2 files changed, 129 insertions, 181 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp index 8b60c48d53f..cb15037ecfa 100644 --- a/cpp/src/IceUtil/StringConverter.cpp +++ b/cpp/src/IceUtil/StringConverter.cpp @@ -43,11 +43,7 @@ struct SelectCodeCvt; template<> struct SelectCodeCvt<2> { -#ifdef ICE_LITTLE_ENDIAN - typedef std::codecvt_utf8_utf16<wchar_t, 0x10ffff, little_endian> Type; -#else typedef std::codecvt_utf8_utf16<wchar_t> Type; -#endif }; template<> @@ -155,28 +151,35 @@ public: virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const { - if(sourceStart == sourceEnd) + const size_t sourceSize = sourceEnd - sourceStart; + + if(sourceSize == 0) { target = L""; } else { - // - // TODO: consider reimplementing without the wstring_convert helper - // to improve performance - // Note that wstring_convert is "stateful" and cannot be a shared data member - // - wstring_convert<CodeCvt> convert; - - try - { - target = convert.from_bytes(reinterpret_cast<const char*>(sourceStart), - reinterpret_cast<const char*>(sourceEnd)); - } - catch(const std::range_error& ex) + target.resize(sourceSize); + wchar_t* targetStart = const_cast<wchar_t*>(target.data()); + wchar_t* targetEnd = targetStart + sourceSize; + wchar_t* targetNext = targetStart; + + const char* sourceNext = reinterpret_cast<const char*>(sourceStart); + + mbstate_t state = mbstate_t(); + + codecvt_base::result result = _codecvt.in(state, + reinterpret_cast<const char*>(sourceStart), + reinterpret_cast<const char*>(sourceEnd), + sourceNext, + targetStart, targetEnd, targetNext); + + if(result != codecvt_base::ok) { - throw IllegalConversionException(__FILE__, __LINE__, ex.what()); + throw IllegalConversionException(__FILE__, __LINE__, "codecvt.in failure"); } + + target.resize(targetNext - targetStart); } } @@ -215,14 +218,12 @@ public: targetStart = buffer.getMoreBytes(chunkSize, targetStart); targetEnd = targetStart + chunkSize; - } while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false); return targetStart; } - virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const { if(sourceStart == sourceEnd) @@ -290,67 +291,38 @@ getUnicodeWstringConverter() return unicodeWstringConverter; } - class UTF8BufferI : public UTF8Buffer { public: - UTF8BufferI() : - _buffer(0), - _offset(0) - { - } - - ~UTF8BufferI() - { - free(_buffer); - } - + // + // Returns the first unused byte in the resized buffer + // Byte* getMoreBytes(size_t howMany, Byte* firstUnused) { - if(_buffer == 0) - { - _buffer = static_cast<Byte*>(malloc(howMany)); - if(!_buffer) - { - throw std::bad_alloc(); - } - } - else + size_t bytesUsed = 0; + if(firstUnused != 0) { - assert(firstUnused != 0); - _offset = firstUnused - _buffer; - Byte* newBuffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany)); - if(!newBuffer) - { - reset(); - throw std::bad_alloc(); - } - else - { - _buffer = newBuffer; - } + bytesUsed = firstUnused - reinterpret_cast<const Byte*>(_buffer.data()); } - return _buffer + _offset; - } - - Byte* getBuffer() - { - return _buffer; + if(_buffer.size() < howMany + bytesUsed) + { + _buffer.resize(bytesUsed + howMany); + } + + return const_cast<Byte*>(reinterpret_cast<const Byte*>(_buffer.data())) + bytesUsed; } - void reset() + void swap(string& other, const Byte* tail) { - free(_buffer); - _buffer = 0; - _offset = 0; + assert(tail >= reinterpret_cast<const Byte*>(_buffer.data())); + _buffer.resize(tail - reinterpret_cast<const Byte*>(_buffer.data())); + other.swap(_buffer); } private: - - Byte* _buffer; - size_t _offset; + string _buffer; }; #ifdef _WIN32 @@ -516,8 +488,8 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, // UTF8BufferI buffer; Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer); - target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); - + buffer.swap(target, last); + // // If narrow string converter is present convert to the native narrow string encoding, otherwise // native narrow string encoding is UTF8 and we are done. @@ -534,8 +506,7 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, } wstring -IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, - const WstringConverterPtr& wConverter) +IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter) { wstring target; if(!v.empty()) @@ -549,7 +520,7 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, { UTF8BufferI buffer; Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer); - tmp = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); + buffer.swap(tmp, last); } else { @@ -577,7 +548,9 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv } UTF8BufferI buffer; Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer); - return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); + string result; + buffer.swap(result, last); + return result; } string @@ -620,11 +593,7 @@ IceUtilInternal::toUTF16(const vector<Byte>& source) #ifdef ICE_HAS_CODECVT_UTF8 assert(sizeof(Char16T) == sizeof(unsigned short)); -#ifdef ICE_LITTLE_ENDIAN - typedef wstring_convert<codecvt_utf8_utf16<Char16T, 0x10ffff, little_endian>, Char16T> Convert; -#else typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert; -#endif Convert convert; diff --git a/cpp/src/IceUtil/Unicode.cpp b/cpp/src/IceUtil/Unicode.cpp index 22ced7e61b2..4db36d29e9d 100644 --- a/cpp/src/IceUtil/Unicode.cpp +++ b/cpp/src/IceUtil/Unicode.cpp @@ -26,90 +26,80 @@ using namespace IceUtilInternal; namespace { - // - // Helper class, base never defined - // Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8. - // - template<size_t wcharSize> - struct WstringHelper - { - static ConversionResult toUTF8( - const wchar_t*& sourceStart, const wchar_t* sourceEnd, - Byte*& targetStart, Byte* targetEnd); +// +// Helper class, base never defined +// Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8. +// +template<size_t wcharSize> struct WstringHelper; - static ConversionResult fromUTF8( - const Byte*& sourceStart, const Byte* sourceEnd, - wchar_t*& targetStart, wchar_t* targetEnd); - }; - template<> - struct WstringHelper<2> - { - static ConversionResult toUTF8( - const wchar_t*& sourceStart, const wchar_t* sourceEnd, - Byte*& targetStart, Byte* targetEnd) - { - return ConvertUTF16toUTF8( - reinterpret_cast<const UTF16**>(&sourceStart), - reinterpret_cast<const UTF16*>(sourceEnd), - &targetStart, targetEnd, lenientConversion); - } - - static ConversionResult fromUTF8( - const Byte*& sourceStart, const Byte* sourceEnd, - wchar_t*& targetStart, wchar_t* targetEnd) - { - return ConvertUTF8toUTF16( - &sourceStart, sourceEnd, - reinterpret_cast<UTF16**>(&targetStart), - reinterpret_cast<UTF16*>(targetEnd), lenientConversion); - } - }; - - template<> - struct WstringHelper<4> - { - static ConversionResult toUTF8( - const wchar_t*& sourceStart, const wchar_t* sourceEnd, - Byte*& targetStart, Byte* targetEnd) - { - return ConvertUTF32toUTF8( - reinterpret_cast<const UTF32**>(&sourceStart), - reinterpret_cast<const UTF32*>(sourceEnd), - &targetStart, targetEnd, lenientConversion); - } - - static ConversionResult fromUTF8( - const Byte*& sourceStart, const Byte* sourceEnd, - wchar_t*& targetStart, wchar_t* targetEnd) - { - return ConvertUTF8toUTF32( - &sourceStart, sourceEnd, - reinterpret_cast<UTF32**>(&targetStart), - reinterpret_cast<UTF32*>(targetEnd), lenientConversion); - } - }; - - void - checkResult(ConversionResult result) +template<> +struct WstringHelper<2> +{ + static ConversionResult toUTF8( + const wchar_t*& sourceStart, const wchar_t* sourceEnd, + Byte*& targetStart, Byte* targetEnd) + { + return ConvertUTF16toUTF8( + reinterpret_cast<const UTF16**>(&sourceStart), + reinterpret_cast<const UTF16*>(sourceEnd), + &targetStart, targetEnd, lenientConversion); + } + + static ConversionResult fromUTF8( + const Byte*& sourceStart, const Byte* sourceEnd, + wchar_t*& targetStart, wchar_t* targetEnd) + { + return ConvertUTF8toUTF16( + &sourceStart, sourceEnd, + reinterpret_cast<UTF16**>(&targetStart), + reinterpret_cast<UTF16*>(targetEnd), lenientConversion); + } +}; + +template<> +struct WstringHelper<4> +{ + static ConversionResult toUTF8( + const wchar_t*& sourceStart, const wchar_t* sourceEnd, + Byte*& targetStart, Byte* targetEnd) + { + return ConvertUTF32toUTF8( + reinterpret_cast<const UTF32**>(&sourceStart), + reinterpret_cast<const UTF32*>(sourceEnd), + &targetStart, targetEnd, lenientConversion); + } + + static ConversionResult fromUTF8( + const Byte*& sourceStart, const Byte* sourceEnd, + wchar_t*& targetStart, wchar_t* targetEnd) + { + return ConvertUTF8toUTF32( + &sourceStart, sourceEnd, + reinterpret_cast<UTF32**>(&targetStart), + reinterpret_cast<UTF32*>(targetEnd), lenientConversion); + } +}; + +void checkResult(ConversionResult result) +{ + switch (result) + { + case conversionOK: + break; + case sourceExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted"); + case sourceIllegal: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal"); + case targetExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal"); + default: { - switch (result) - { - case conversionOK: - break; - case sourceExhausted: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted"); - case sourceIllegal: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal"); - case targetExhausted: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal"); - default: - { - assert(0); - throw IceUtil::IllegalConversionException(__FILE__, __LINE__); - } - } + assert(0); + throw IceUtil::IllegalConversionException(__FILE__, __LINE__); } + } +} } // @@ -117,9 +107,8 @@ namespace // bool -IceUtilInternal::convertUTFWstringToUTF8( - const wchar_t*& sourceStart, const wchar_t* sourceEnd, - Byte*& targetStart, Byte* targetEnd) +IceUtilInternal::convertUTFWstringToUTF8(const wchar_t*& sourceStart, const wchar_t* sourceEnd, + Byte*& targetStart, Byte* targetEnd) { ConversionResult result = WstringHelper<sizeof(wchar_t)>::toUTF8( sourceStart, sourceEnd, targetStart, targetEnd); @@ -135,30 +124,20 @@ IceUtilInternal::convertUTFWstringToUTF8( } } - void -IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd, - std::wstring& target) +IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd, std::wstring& target) { - // - // Could be reimplemented without this temporary wchar_t buffer - // - size_t size = static_cast<size_t>(sourceEnd - sourceStart); - wchar_t* outBuf = new wchar_t[size]; - wchar_t* targetStart = outBuf; - wchar_t* targetEnd = targetStart + size; - - ConversionResult result = - WstringHelper<sizeof(wchar_t)>::fromUTF8( - sourceStart, sourceEnd, targetStart, targetEnd); - - if(result == conversionOK) - { - std::wstring s(outBuf, static_cast<size_t>(targetStart - outBuf)); - s.swap(target); - } - delete[] outBuf; + size_t sourceSize = static_cast<size_t>(sourceEnd - sourceStart); + + target.resize(sourceSize); + wchar_t* targetStart = const_cast<wchar_t*>(target.data()); + wchar_t* targetEnd = targetStart + sourceSize; + + ConversionResult result = WstringHelper<sizeof(wchar_t)>::fromUTF8(sourceStart, sourceEnd, + targetStart, targetEnd); + checkResult(result); + target.resize(targetStart - target.data()); } void |