diff options
Diffstat (limited to 'cpp/src/IceUtil/StringConverter.cpp')
-rw-r--r-- | cpp/src/IceUtil/StringConverter.cpp | 586 |
1 files changed, 426 insertions, 160 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp index 46c590c42f3..8b60c48d53f 100644 --- a/cpp/src/IceUtil/StringConverter.cpp +++ b/cpp/src/IceUtil/StringConverter.cpp @@ -12,7 +12,13 @@ #include <IceUtil/Mutex.h> #include <IceUtil/ScopedArray.h> #include <IceUtil/StringUtil.h> + +#ifdef ICE_HAS_CODECVT_UTF8 +#include <codecvt> +#include <locale> +#else #include <IceUtil/Unicode.h> +#endif using namespace IceUtil; using namespace IceUtilInternal; @@ -25,6 +31,234 @@ IceUtil::Mutex* processStringConverterMutex = 0; IceUtil::StringConverterPtr processStringConverter; IceUtil::WstringConverterPtr processWstringConverter; +#ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC +IceUtil::WstringConverterPtr unicodeWstringConverter; +#endif + +#ifdef ICE_HAS_CODECVT_UTF8 + +template<size_t wcharSize> +struct SelectCodeCvt; + +template<> +struct SelectCodeCvt<2> +{ +#ifdef ICE_LITTLE_ENDIAN + typedef std::codecvt_utf8_utf16<wchar_t, 0x10ffff, little_endian> Type; +#else + typedef std::codecvt_utf8_utf16<wchar_t> Type; +#endif +}; + +template<> +struct SelectCodeCvt<4> +{ + typedef std::codecvt_utf8<wchar_t> Type; +}; + +class UnicodeWstringConverter : public WstringConverter +{ +public: + +#if defined(_MSC_VER) && (_MSC_VER <= 1800) + // + // VS 2013 needs a default ctor + // + UnicodeWstringConverter() + { + } +#endif + + virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const + { + // + // Max bytes for a character encoding in UTF-8 is 4, + // however MSVC returns 6 + // +#ifdef _MSC_VER + assert(_codecvt.max_length() == 4 || _codecvt.max_length() == 6); +#else + assert(_codecvt.max_length() == 4); +#endif + if(sourceStart == sourceEnd) + { + return buffer.getMoreBytes(1, 0); + } + + char* targetStart = 0; + char* targetEnd = 0; + char* targetNext = 0; + + mbstate_t state = mbstate_t(); // must be initialized! + const wchar_t* sourceNext = sourceStart; + + bool more = false; + + // + // The number of bytes we request from buffer for each remaining source character + // + size_t factor = 2; + + do + { + assert(factor <= 4); + const size_t chunkSize = std::max<size_t>((sourceEnd - sourceStart) * factor, 4); + ++factor; // at the next round, we'll allocate more bytes per remaining source character + + targetStart = reinterpret_cast<char*>(buffer.getMoreBytes(chunkSize, reinterpret_cast<Byte*>(targetNext))); + targetEnd = targetStart + chunkSize; + targetNext = targetStart; + + codecvt_base::result result = + _codecvt.out(state, sourceStart, sourceEnd, sourceNext, targetStart, targetEnd, targetNext); + + switch(result) + { + case codecvt_base::ok: + // + // MSVC returns ok when target is exhausted + // + more = sourceNext < sourceEnd; + break; + + case codecvt_base::partial: + // + // clang/libc++ and g++5 return partial when target is exhausted + // + more = true; + assert(sourceNext < sourceEnd); + break; + + case codecvt_base::noconv: + // + // Unexpected + // + assert(0); + throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out noconv"); + + default: + throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out error"); + } + + if(targetStart == targetNext) + { + // We didn't convert a single character + throw IllegalConversionException(__FILE__, __LINE__, + "no character converted by codecvt.out"); + } + + sourceStart = sourceNext; + } while (more); + + return reinterpret_cast<Byte*>(targetNext); + } + + virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const + { + if(sourceStart == sourceEnd) + { + target = L""; + } + else + { + // + // TODO: consider reimplementing without the wstring_convert helper + // to improve performance + // Note that wstring_convert is "stateful" and cannot be a shared data member + // + wstring_convert<CodeCvt> convert; + + try + { + target = convert.from_bytes(reinterpret_cast<const char*>(sourceStart), + reinterpret_cast<const char*>(sourceEnd)); + } + catch(const std::range_error& ex) + { + throw IllegalConversionException(__FILE__, __LINE__, ex.what()); + } + } + } + +private: + + typedef SelectCodeCvt<sizeof(wchar_t)>::Type CodeCvt; + const CodeCvt _codecvt; +}; + +#else + +class UnicodeWstringConverter : public WstringConverter +{ +public: + + virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const + { + if(sourceStart == sourceEnd) + { + return buffer.getMoreBytes(1, 0); + } + + Byte* targetStart = 0; + Byte* targetEnd = 0; + + // + // The number of bytes we request from buffer for each remaining source character + // + size_t factor = 2; + + do + { + assert(factor <= 4); + const size_t chunkSize = std::max<size_t>((sourceEnd - sourceStart) * factor, 4); + ++factor; // at the next round, we'll allocate more bytes per remaining source character + + targetStart = buffer.getMoreBytes(chunkSize, targetStart); + targetEnd = targetStart + chunkSize; + + } + while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false); + + return targetStart; + } + + + virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const + { + if(sourceStart == sourceEnd) + { + target = L""; + } + else + { + convertUTF8ToUTFWstring(sourceStart, sourceEnd, target); + } + } +}; + +#endif + +#ifdef _WIN32 + +// +// Converts to/from UTF-8 using MultiByteToWideChar and WideCharToMultiByte +// +class WindowsStringConverter : public StringConverter +{ +public: + + explicit WindowsStringConverter(unsigned int); + + virtual Byte* toUTF8(const char*, const char*, UTF8Buffer&) const; + + virtual void fromUTF8(const Byte*, const Byte*, string& target) const; + +private: + unsigned int _cp; +}; +#endif + + class Init { public: @@ -32,6 +266,9 @@ public: Init() { processStringConverterMutex = new IceUtil::Mutex; +#ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC + unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter); +#endif } ~Init() @@ -43,10 +280,16 @@ public: Init init; -} -namespace +const WstringConverterPtr& +getUnicodeWstringConverter() { +#ifdef ICE_HAS_THREAD_SAFE_LOCAL_STATIC + static const WstringConverterPtr unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter); +#endif + return unicodeWstringConverter; +} + class UTF8BufferI : public UTF8Buffer { @@ -68,18 +311,27 @@ public: if(_buffer == 0) { _buffer = static_cast<Byte*>(malloc(howMany)); + if(!_buffer) + { + throw std::bad_alloc(); + } } else { assert(firstUnused != 0); _offset = firstUnused - _buffer; - _buffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany)); - } - - if(!_buffer) - { - throw std::bad_alloc(); + Byte* newBuffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany)); + if(!newBuffer) + { + reset(); + throw std::bad_alloc(); + } + else + { + _buffer = newBuffer; + } } + return _buffer + _offset; } @@ -94,94 +346,13 @@ public: _buffer = 0; _offset = 0; } - + private: - IceUtil::Byte* _buffer; + Byte* _buffer; size_t _offset; }; -} - - - -UnicodeWstringConverter::UnicodeWstringConverter(ConversionFlags flags) : - _conversionFlags(flags) -{ -} - -Byte* -UnicodeWstringConverter::toUTF8(const wchar_t* sourceStart, - const wchar_t* sourceEnd, - UTF8Buffer& buffer) const -{ - // - // The "chunk size" is the maximum of the number of characters in the - // source and 6 (== max bytes necessary to encode one Unicode character). - // - size_t chunkSize = std::max<size_t>(static_cast<size_t>(sourceEnd - sourceStart), 6); - - Byte* targetStart = buffer.getMoreBytes(chunkSize, 0); - Byte* targetEnd = targetStart + chunkSize; - - ConversionResult result; - - while((result = - convertUTFWstringToUTF8(sourceStart, sourceEnd, - targetStart, targetEnd, _conversionFlags)) - == targetExhausted) - { - targetStart = buffer.getMoreBytes(chunkSize, targetStart); - targetEnd = targetStart + chunkSize; - } - - switch(result) - { - case conversionOK: - break; - case sourceExhausted: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source exhausted"); - case sourceIllegal: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source illegal"); - default: - { - assert(0); - throw IceUtil::IllegalConversionException(__FILE__, __LINE__); - } - } - return targetStart; -} - - -void -UnicodeWstringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, - wstring& target) const -{ - if(sourceStart == sourceEnd) - { - target = L""; - return; - } - - ConversionResult result = - convertUTF8ToUTFWstring(sourceStart, sourceEnd, target, _conversionFlags); - - switch(result) - { - case conversionOK: - break; - case sourceExhausted: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source exhausted"); - case sourceIllegal: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source illegal"); - default: - { - assert(0); - throw IceUtil::IllegalConversionException(__FILE__, __LINE__); - } - } -} - #ifdef _WIN32 WindowsStringConverter::WindowsStringConverter(unsigned int cp) : _cp(cp) @@ -205,16 +376,16 @@ WindowsStringConverter::toUTF8(const char* sourceStart, int size = 0; int writtenWchar = 0; ScopedArray<wchar_t> wbuffer; - + // // The following code pages doesn't support MB_ERR_INVALID_CHARS flag // see http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx // DWORD flags = (_cp == 50220 || _cp == 50221 || _cp == 50222 || - _cp == 50225 || _cp == 50227 || _cp == 50229 || + _cp == 50225 || _cp == 50227 || _cp == 50229 || _cp == 65000 || _cp == 42 || (_cp >= 57002 && _cp <= 57011)) ? 0 : MB_ERR_INVALID_CHARS; - + do { size = size == 0 ? sourceSize + 2 : 2 * size; @@ -232,7 +403,7 @@ WindowsStringConverter::toUTF8(const char* sourceStart, // // Then convert this UTF-16 wbuffer into UTF-8 // - return _unicodeWstringConverter.toUTF8(wbuffer.get(), wbuffer.get() + writtenWchar, buffer); + return getUnicodeWstringConverter()->toUTF8(wbuffer.get(), wbuffer.get() + writtenWchar, buffer); } void @@ -256,7 +427,7 @@ WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, // First convert to wstring (UTF-16) // wstring wtarget; - _unicodeWstringConverter.fromUTF8(sourceStart, sourceEnd, wtarget); + getUnicodeWstringConverter()->fromUTF8(sourceStart, sourceEnd, wtarget); // // WC_ERR_INVALID_CHARS conversion flag is only supported with 65001 (UTF-8) and @@ -284,7 +455,23 @@ WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, target.assign(buffer.get(), writtenChar); } +#endif + +} + + +WstringConverterPtr +IceUtil::createUnicodeWstringConverter() +{ + return getUnicodeWstringConverter(); +} +#ifdef _WIN32 +StringConverterPtr +IceUtil::createWindowsStringConverter(unsigned int cp) +{ + return ICE_MAKE_SHARED(WindowsStringConverter, cp); +} #endif @@ -298,8 +485,8 @@ IceUtil::getProcessStringConverter() void IceUtil::setProcessStringConverter(const StringConverterPtr& converter) { - IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); - processStringConverter = converter; + IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); + processStringConverter = converter; } WstringConverterPtr @@ -312,61 +499,33 @@ IceUtil::getProcessWstringConverter() void IceUtil::setProcessWstringConverter(const WstringConverterPtr& converter) { - IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); - processWstringConverter = converter; + IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); + processWstringConverter = converter; } string -IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter, - IceUtil::ConversionFlags flags) +IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter) { string target; if(!v.empty()) { + const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter(); + // - // First convert to UTF8 narrow string. + // First convert to UTF-8 narrow string. // - if(wConverter) - { - UTF8BufferI buffer; - Byte* last = wConverter->toUTF8(v.data(), v.data() + v.size(), buffer); - target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); - } - else - { - size_t size = v.size() * 4 * sizeof(char); - - Byte* outBuf = new Byte[size]; - Byte* targetStart = outBuf; - Byte* targetEnd = outBuf + size; - - const wchar_t* sourceStart = v.data(); - - ConversionResult cr = - convertUTFWstringToUTF8( - sourceStart, sourceStart + v.size(), - targetStart, targetEnd, flags); - - if(cr != conversionOK) - { - delete[] outBuf; - assert(cr == sourceExhausted || cr == sourceIllegal); - throw IllegalConversionException(__FILE__, __LINE__, - cr == sourceExhausted ? "partial character" : "bad encoding"); - } - - target = string(reinterpret_cast<char*>(outBuf), static_cast<size_t>(targetStart - outBuf)); - delete[] outBuf; - } + UTF8BufferI buffer; + Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer); + target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); // - // If narrow string converter is present convert to the native narrow string encoding, otherwise + // If narrow string converter is present convert to the native narrow string encoding, otherwise // native narrow string encoding is UTF8 and we are done. // if(converter) { string tmp; - converter->fromUTF8(reinterpret_cast<const Byte*>(target.data()), + converter->fromUTF8(reinterpret_cast<const Byte*>(target.data()), reinterpret_cast<const Byte*>(target.data() + target.size()), tmp); tmp.swap(target); } @@ -375,8 +534,8 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, } wstring -IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, - const WstringConverterPtr& wConverter, IceUtil::ConversionFlags flags) +IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, + const WstringConverterPtr& wConverter) { wstring target; if(!v.empty()) @@ -397,29 +556,14 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, tmp = v; } + const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter(); + // - // If there is a wide string converter use fromUTF8 to convert to the wide string native encoding. + // Convert from UTF-8 to the wide string encoding // - if(wConverter) - { - wConverter->fromUTF8(reinterpret_cast<const Byte*>(tmp.data()), - reinterpret_cast<const Byte*>(tmp.data() + tmp.size()), target); - } - else - { - const Byte* sourceStart = reinterpret_cast<const Byte*>(tmp.data()); - - ConversionResult cr = - convertUTF8ToUTFWstring(sourceStart, sourceStart + tmp.size(), target, flags); + wConverterWithDefault->fromUTF8(reinterpret_cast<const Byte*>(tmp.data()), + reinterpret_cast<const Byte*>(tmp.data() + tmp.size()), target); - if(cr != conversionOK) - { - assert(cr == sourceExhausted || cr == sourceIllegal); - - throw IllegalConversionException(__FILE__, __LINE__, - cr == sourceExhausted ? "partial character" : "bad encoding"); - } - } } return target; } @@ -430,7 +574,7 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv if(!converter || str.empty()) { return str; - } + } UTF8BufferI buffer; Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer); return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); @@ -448,3 +592,125 @@ IceUtil::UTF8ToNative(const string& str, const IceUtil::StringConverterPtr& conv reinterpret_cast<const Byte*>(str.data() + str.size()), tmp); return tmp; } + +#ifdef ICE_HAS_CODECVT_UTF8 + +#if defined(_MSC_VER) && (_MSC_VER == 1900) +// +// Workaround for compiler bug - see http://stackoverflow.com/questions/32055357 +// +typedef unsigned short Char16T; +typedef unsigned int Char32T; + +#else +typedef char16_t Char16T; +typedef char32_t Char32T; +#endif + +#endif + + +vector<unsigned short> +IceUtilInternal::toUTF16(const vector<Byte>& source) +{ + vector<unsigned short> result; + if(!source.empty()) + { + +#ifdef ICE_HAS_CODECVT_UTF8 + assert(sizeof(Char16T) == sizeof(unsigned short)); + +#ifdef ICE_LITTLE_ENDIAN + typedef wstring_convert<codecvt_utf8_utf16<Char16T, 0x10ffff, little_endian>, Char16T> Convert; +#else + typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert; +#endif + + Convert convert; + + try + { + Convert::wide_string ws = convert.from_bytes(reinterpret_cast<const char*>(&source.front()), + reinterpret_cast<const char*>(&source.front() + source.size())); + + result = vector<unsigned short>(reinterpret_cast<const unsigned short*>(ws.data()), + reinterpret_cast<const unsigned short*>(ws.data()) + ws.length()); + } + catch(const std::range_error& ex) + { + throw IllegalConversionException(__FILE__, __LINE__, ex.what()); + } + +#else + convertUTF8ToUTF16(source, result); +#endif + } + return result; +} + +vector<unsigned int> +IceUtilInternal::toUTF32(const vector<Byte>& source) +{ + vector<unsigned int> result; + if(!source.empty()) + { + +#ifdef ICE_HAS_CODECVT_UTF8 + assert(sizeof(Char32T) == sizeof(unsigned int)); + + typedef wstring_convert<codecvt_utf8<Char32T>, Char32T> Convert; + Convert convert; + + try + { + Convert::wide_string ws = convert.from_bytes(reinterpret_cast<const char*>(&source.front()), + reinterpret_cast<const char*>(&source.front() + source.size())); + + result = vector<unsigned int>(reinterpret_cast<const unsigned int*>(ws.data()), + reinterpret_cast<const unsigned int*>(ws.data()) + ws.length()); + } + catch(const std::range_error& ex) + { + throw IllegalConversionException(__FILE__, __LINE__, ex.what()); + } + +#else + convertUTF8ToUTF32(source, result); +#endif + } + return result; +} + + +vector<Byte> +IceUtilInternal::fromUTF32(const vector<unsigned int>& source) +{ + vector<Byte> result; + if(!source.empty()) + { + +#ifdef ICE_HAS_CODECVT_UTF8 + assert(sizeof(Char32T) == sizeof(unsigned int)); + + typedef wstring_convert<codecvt_utf8<Char32T>, Char32T> Convert; + Convert convert; + + try + { + Convert::byte_string bs = convert.to_bytes(reinterpret_cast<const Char32T*>(&source.front()), + reinterpret_cast<const Char32T*>(&source.front() + source.size())); + + result = vector<Byte>(reinterpret_cast<const Byte*>(bs.data()), + reinterpret_cast<const Byte*>(bs.data()) + bs.length()); + } + catch(const std::range_error& ex) + { + throw IllegalConversionException(__FILE__, __LINE__, ex.what()); + } + +#else + convertUTF32ToUTF8(source, result); +#endif + } + return result; +} |