diff options
Diffstat (limited to 'cpp/src/IceUtil/StringConverter.cpp')
-rw-r--r-- | cpp/src/IceUtil/StringConverter.cpp | 637 |
1 files changed, 374 insertions, 263 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp index 46c590c42f3..2613efd6fe9 100644 --- a/cpp/src/IceUtil/StringConverter.cpp +++ b/cpp/src/IceUtil/StringConverter.cpp @@ -10,9 +10,14 @@ #include <IceUtil/StringConverter.h> #include <IceUtil/MutexPtrLock.h> #include <IceUtil/Mutex.h> -#include <IceUtil/ScopedArray.h> #include <IceUtil/StringUtil.h> + +#ifdef ICE_HAS_CODECVT_UTF8 +#include <codecvt> +#include <locale> +#else #include <IceUtil/Unicode.h> +#endif using namespace IceUtil; using namespace IceUtilInternal; @@ -25,268 +30,291 @@ IceUtil::Mutex* processStringConverterMutex = 0; IceUtil::StringConverterPtr processStringConverter; IceUtil::WstringConverterPtr processWstringConverter; -class Init -{ -public: +#ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC +IceUtil::WstringConverterPtr unicodeWstringConverter; +#endif - Init() - { - processStringConverterMutex = new IceUtil::Mutex; - } +#ifdef ICE_HAS_CODECVT_UTF8 - ~Init() - { - delete processStringConverterMutex; - processStringConverterMutex = 0; - } -}; - -Init init; +template<size_t wcharSize> +struct SelectCodeCvt; -} +template<> +struct SelectCodeCvt<2> +{ + typedef std::codecvt_utf8_utf16<wchar_t> Type; +}; -namespace +template<> +struct SelectCodeCvt<4> { + typedef std::codecvt_utf8<wchar_t> Type; +}; -class UTF8BufferI : public UTF8Buffer +class UnicodeWstringConverter : public WstringConverter { public: - UTF8BufferI() : - _buffer(0), - _offset(0) +#if defined(_MSC_VER) && (_MSC_VER <= 1800) + // + // VS 2013 needs a default ctor + // + UnicodeWstringConverter() { } +#endif - ~UTF8BufferI() + virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const { - free(_buffer); + // + // Max bytes for a character encoding in UTF-8 is 4, + // however MSVC returns 6 + // +#ifdef _MSC_VER + assert(_codecvt.max_length() == 4 || _codecvt.max_length() == 6); +#else + assert(_codecvt.max_length() == 4); +#endif + if(sourceStart == sourceEnd) + { + return buffer.getMoreBytes(1, 0); + } + + char* targetStart = 0; + char* targetEnd = 0; + char* targetNext = 0; + + mbstate_t state = mbstate_t(); // must be initialized! + const wchar_t* sourceNext = sourceStart; + + bool more = false; + + // + // The number of bytes we request from buffer for each remaining source character + // + size_t factor = 2; + + do + { + assert(factor <= 4); + const size_t chunkSize = std::max<size_t>((sourceEnd - sourceStart) * factor, 4); + ++factor; // at the next round, we'll allocate more bytes per remaining source character + + targetStart = reinterpret_cast<char*>(buffer.getMoreBytes(chunkSize, reinterpret_cast<Byte*>(targetNext))); + targetEnd = targetStart + chunkSize; + targetNext = targetStart; + + codecvt_base::result result = + _codecvt.out(state, sourceStart, sourceEnd, sourceNext, targetStart, targetEnd, targetNext); + + switch(result) + { + case codecvt_base::ok: + // + // MSVC returns ok when target is exhausted + // + more = sourceNext < sourceEnd; + break; + + case codecvt_base::partial: + // + // clang/libc++ and g++5 return partial when target is exhausted + // + more = true; + assert(sourceNext < sourceEnd); + break; + + case codecvt_base::noconv: + // + // Unexpected + // + assert(0); + throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out noconv"); + + default: + throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out error"); + } + + if(targetStart == targetNext) + { + // We didn't convert a single character + throw IllegalConversionException(__FILE__, __LINE__, + "no character converted by codecvt.out"); + } + + sourceStart = sourceNext; + } while (more); + + return reinterpret_cast<Byte*>(targetNext); } - Byte* getMoreBytes(size_t howMany, Byte* firstUnused) + virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const { - if(_buffer == 0) + const size_t sourceSize = sourceEnd - sourceStart; + + if(sourceSize == 0) { - _buffer = static_cast<Byte*>(malloc(howMany)); + target = L""; } else { - assert(firstUnused != 0); - _offset = firstUnused - _buffer; - _buffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany)); - } - - if(!_buffer) - { - throw std::bad_alloc(); - } - return _buffer + _offset; - } + target.resize(sourceSize); + wchar_t* targetStart = const_cast<wchar_t*>(target.data()); + wchar_t* targetEnd = targetStart + sourceSize; + wchar_t* targetNext = targetStart; - Byte* getBuffer() - { - return _buffer; - } + const char* sourceNext = reinterpret_cast<const char*>(sourceStart); - void reset() - { - free(_buffer); - _buffer = 0; - _offset = 0; + mbstate_t state = mbstate_t(); + + codecvt_base::result result = _codecvt.in(state, + reinterpret_cast<const char*>(sourceStart), + reinterpret_cast<const char*>(sourceEnd), + sourceNext, + targetStart, targetEnd, targetNext); + + if(result != codecvt_base::ok) + { + throw IllegalConversionException(__FILE__, __LINE__, "codecvt.in failure"); + } + + target.resize(targetNext - targetStart); + } } - + private: - IceUtil::Byte* _buffer; - size_t _offset; + typedef SelectCodeCvt<sizeof(wchar_t)>::Type CodeCvt; + const CodeCvt _codecvt; }; -} +#else +class UnicodeWstringConverter : public WstringConverter +{ +public: + virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const + { + if(sourceStart == sourceEnd) + { + return buffer.getMoreBytes(1, 0); + } -UnicodeWstringConverter::UnicodeWstringConverter(ConversionFlags flags) : - _conversionFlags(flags) -{ -} + Byte* targetStart = 0; + Byte* targetEnd = 0; -Byte* -UnicodeWstringConverter::toUTF8(const wchar_t* sourceStart, - const wchar_t* sourceEnd, - UTF8Buffer& buffer) const -{ - // - // The "chunk size" is the maximum of the number of characters in the - // source and 6 (== max bytes necessary to encode one Unicode character). - // - size_t chunkSize = std::max<size_t>(static_cast<size_t>(sourceEnd - sourceStart), 6); + // + // The number of bytes we request from buffer for each remaining source character + // + size_t factor = 2; - Byte* targetStart = buffer.getMoreBytes(chunkSize, 0); - Byte* targetEnd = targetStart + chunkSize; + do + { + assert(factor <= 4); + const size_t chunkSize = std::max<size_t>((sourceEnd - sourceStart) * factor, 4); + ++factor; // at the next round, we'll allocate more bytes per remaining source character - ConversionResult result; + targetStart = buffer.getMoreBytes(chunkSize, targetStart); + targetEnd = targetStart + chunkSize; + } + while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false); - while((result = - convertUTFWstringToUTF8(sourceStart, sourceEnd, - targetStart, targetEnd, _conversionFlags)) - == targetExhausted) - { - targetStart = buffer.getMoreBytes(chunkSize, targetStart); - targetEnd = targetStart + chunkSize; + return targetStart; } - - switch(result) + + virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const { - case conversionOK: - break; - case sourceExhausted: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source exhausted"); - case sourceIllegal: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source illegal"); - default: + if(sourceStart == sourceEnd) + { + target = L""; + } + else { - assert(0); - throw IceUtil::IllegalConversionException(__FILE__, __LINE__); + convertUTF8ToUTFWstring(sourceStart, sourceEnd, target); } } - return targetStart; -} +}; +#endif -void -UnicodeWstringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, - wstring& target) const +class Init { - if(sourceStart == sourceEnd) +public: + + Init() { - target = L""; - return; + processStringConverterMutex = new IceUtil::Mutex; +#ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC + unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter); +#endif } - ConversionResult result = - convertUTF8ToUTFWstring(sourceStart, sourceEnd, target, _conversionFlags); - - switch(result) + ~Init() { - case conversionOK: - break; - case sourceExhausted: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source exhausted"); - case sourceIllegal: - throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source illegal"); - default: - { - assert(0); - throw IceUtil::IllegalConversionException(__FILE__, __LINE__); - } + delete processStringConverterMutex; + processStringConverterMutex = 0; } -} +}; -#ifdef _WIN32 -WindowsStringConverter::WindowsStringConverter(unsigned int cp) : - _cp(cp) +Init init; + + +const WstringConverterPtr& +getUnicodeWstringConverter() { +#ifdef ICE_HAS_THREAD_SAFE_LOCAL_STATIC + static const WstringConverterPtr unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter); +#endif + return unicodeWstringConverter; } -Byte* -WindowsStringConverter::toUTF8(const char* sourceStart, - const char* sourceEnd, - UTF8Buffer& buffer) const +class UTF8BufferI : public UTF8Buffer { - // - // First convert to UTF-16 - // - int sourceSize = static_cast<int>(sourceEnd - sourceStart); - if(sourceSize == 0) - { - return buffer.getMoreBytes(1, 0); - } +public: - int size = 0; - int writtenWchar = 0; - ScopedArray<wchar_t> wbuffer; - // - // The following code pages doesn't support MB_ERR_INVALID_CHARS flag - // see http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx + // Returns the first unused byte in the resized buffer // - DWORD flags = - (_cp == 50220 || _cp == 50221 || _cp == 50222 || - _cp == 50225 || _cp == 50227 || _cp == 50229 || - _cp == 65000 || _cp == 42 || (_cp >= 57002 && _cp <= 57011)) ? 0 : MB_ERR_INVALID_CHARS; - - do - { - size = size == 0 ? sourceSize + 2 : 2 * size; - wbuffer.reset(new wchar_t[size]); - - writtenWchar = MultiByteToWideChar(_cp, flags, sourceStart, - sourceSize, wbuffer.get(), size); - } while(writtenWchar == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER); - - if(writtenWchar == 0) + Byte* getMoreBytes(size_t howMany, Byte* firstUnused) { - throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString()); - } + size_t bytesUsed = 0; + if(firstUnused != 0) + { + bytesUsed = firstUnused - reinterpret_cast<const Byte*>(_buffer.data()); + } - // - // Then convert this UTF-16 wbuffer into UTF-8 - // - return _unicodeWstringConverter.toUTF8(wbuffer.get(), wbuffer.get() + writtenWchar, buffer); -} + if(_buffer.size() < howMany + bytesUsed) + { + _buffer.resize(bytesUsed + howMany); + } -void -WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, - string& target) const -{ - if(sourceStart == sourceEnd) - { - target = ""; - return; + return const_cast<Byte*>(reinterpret_cast<const Byte*>(_buffer.data())) + bytesUsed; } - if(_cp == CP_UTF8) + void swap(string& other, const Byte* tail) { - string tmp(reinterpret_cast<const char*>(sourceStart), sourceEnd - sourceStart); - tmp.swap(target); - return; + assert(tail >= reinterpret_cast<const Byte*>(_buffer.data())); + _buffer.resize(tail - reinterpret_cast<const Byte*>(_buffer.data())); + other.swap(_buffer); } - // - // First convert to wstring (UTF-16) - // - wstring wtarget; - _unicodeWstringConverter.fromUTF8(sourceStart, sourceEnd, wtarget); - - // - // WC_ERR_INVALID_CHARS conversion flag is only supported with 65001 (UTF-8) and - // 54936 (GB18030 Simplified Chinese) - // - DWORD flags = (_cp == 65001 || _cp == 54936) ? WC_ERR_INVALID_CHARS : 0; - // - // And then to a multi-byte narrow string - // - int size = 0; - int writtenChar = 0; - ScopedArray<char> buffer; - do - { - size = size == 0 ? static_cast<int>(sourceEnd - sourceStart) + 2 : 2 * size; - buffer.reset(new char[size]); - writtenChar = WideCharToMultiByte(_cp, flags, wtarget.data(), static_cast<int>(wtarget.size()), - buffer.get(), size, 0, 0); - } while(writtenChar == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER); - - if(writtenChar == 0) - { - throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString()); - } +private: + string _buffer; +}; - target.assign(buffer.get(), writtenChar); } -#endif +IceUtil::UTF8Buffer::~UTF8Buffer() +{ + // Out of line to avoid weak vtable +} +WstringConverterPtr +IceUtil::createUnicodeWstringConverter() +{ + return getUnicodeWstringConverter(); +} StringConverterPtr IceUtil::getProcessStringConverter() @@ -298,75 +326,54 @@ IceUtil::getProcessStringConverter() void IceUtil::setProcessStringConverter(const StringConverterPtr& converter) { - IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); - processStringConverter = converter; + IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); + processStringConverter = converter; } WstringConverterPtr IceUtil::getProcessWstringConverter() { IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); - return processWstringConverter; + if(processWstringConverter) + { + return processWstringConverter; + } + else + { + return getUnicodeWstringConverter(); + } } void IceUtil::setProcessWstringConverter(const WstringConverterPtr& converter) { - IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); - processWstringConverter = converter; + IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex); + processWstringConverter = converter; } string -IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter, - IceUtil::ConversionFlags flags) +IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter) { string target; if(!v.empty()) { + const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter(); + // - // First convert to UTF8 narrow string. + // First convert to UTF-8 narrow string. // - if(wConverter) - { - UTF8BufferI buffer; - Byte* last = wConverter->toUTF8(v.data(), v.data() + v.size(), buffer); - target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); - } - else - { - size_t size = v.size() * 4 * sizeof(char); - - Byte* outBuf = new Byte[size]; - Byte* targetStart = outBuf; - Byte* targetEnd = outBuf + size; - - const wchar_t* sourceStart = v.data(); - - ConversionResult cr = - convertUTFWstringToUTF8( - sourceStart, sourceStart + v.size(), - targetStart, targetEnd, flags); - - if(cr != conversionOK) - { - delete[] outBuf; - assert(cr == sourceExhausted || cr == sourceIllegal); - throw IllegalConversionException(__FILE__, __LINE__, - cr == sourceExhausted ? "partial character" : "bad encoding"); - } - - target = string(reinterpret_cast<char*>(outBuf), static_cast<size_t>(targetStart - outBuf)); - delete[] outBuf; - } + UTF8BufferI buffer; + Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer); + buffer.swap(target, last); // - // If narrow string converter is present convert to the native narrow string encoding, otherwise + // If narrow string converter is present convert to the native narrow string encoding, otherwise // native narrow string encoding is UTF8 and we are done. // if(converter) { string tmp; - converter->fromUTF8(reinterpret_cast<const Byte*>(target.data()), + converter->fromUTF8(reinterpret_cast<const Byte*>(target.data()), reinterpret_cast<const Byte*>(target.data() + target.size()), tmp); tmp.swap(target); } @@ -375,8 +382,7 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, } wstring -IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, - const WstringConverterPtr& wConverter, IceUtil::ConversionFlags flags) +IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter) { wstring target; if(!v.empty()) @@ -390,36 +396,21 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, { UTF8BufferI buffer; Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer); - tmp = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); + buffer.swap(tmp, last); } else { tmp = v; } + const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter(); + // - // If there is a wide string converter use fromUTF8 to convert to the wide string native encoding. + // Convert from UTF-8 to the wide string encoding // - if(wConverter) - { - wConverter->fromUTF8(reinterpret_cast<const Byte*>(tmp.data()), - reinterpret_cast<const Byte*>(tmp.data() + tmp.size()), target); - } - else - { - const Byte* sourceStart = reinterpret_cast<const Byte*>(tmp.data()); - - ConversionResult cr = - convertUTF8ToUTFWstring(sourceStart, sourceStart + tmp.size(), target, flags); - - if(cr != conversionOK) - { - assert(cr == sourceExhausted || cr == sourceIllegal); + wConverterWithDefault->fromUTF8(reinterpret_cast<const Byte*>(tmp.data()), + reinterpret_cast<const Byte*>(tmp.data() + tmp.size()), target); - throw IllegalConversionException(__FILE__, __LINE__, - cr == sourceExhausted ? "partial character" : "bad encoding"); - } - } } return target; } @@ -430,10 +421,12 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv if(!converter || str.empty()) { return str; - } + } UTF8BufferI buffer; Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer); - return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer()); + string result; + buffer.swap(result, last); + return result; } string @@ -448,3 +441,121 @@ IceUtil::UTF8ToNative(const string& str, const IceUtil::StringConverterPtr& conv reinterpret_cast<const Byte*>(str.data() + str.size()), tmp); return tmp; } + +#ifdef ICE_HAS_CODECVT_UTF8 + +#if defined(_MSC_VER) && (_MSC_VER == 1900) +// +// Workaround for compiler bug - see http://stackoverflow.com/questions/32055357 +// +typedef unsigned short Char16T; +typedef unsigned int Char32T; + +#else +typedef char16_t Char16T; +typedef char32_t Char32T; +#endif + +#endif + + +vector<unsigned short> +IceUtilInternal::toUTF16(const vector<Byte>& source) +{ + vector<unsigned short> result; + if(!source.empty()) + { + +#ifdef ICE_HAS_CODECVT_UTF8 + assert(sizeof(Char16T) == sizeof(unsigned short)); + + typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert; + + Convert convert; + + try + { + Convert::wide_string ws = convert.from_bytes(reinterpret_cast<const char*>(&source.front()), + reinterpret_cast<const char*>(&source.front() + source.size())); + + result = vector<unsigned short>(reinterpret_cast<const unsigned short*>(ws.data()), + reinterpret_cast<const unsigned short*>(ws.data()) + ws.length()); + } + catch(const std::range_error& ex) + { + throw IllegalConversionException(__FILE__, __LINE__, ex.what()); + } + +#else + convertUTF8ToUTF16(source, result); +#endif + } + return result; +} + +vector<unsigned int> +IceUtilInternal::toUTF32(const vector<Byte>& source) +{ + vector<unsigned int> result; + if(!source.empty()) + { + +#ifdef ICE_HAS_CODECVT_UTF8 + assert(sizeof(Char32T) == sizeof(unsigned int)); + + typedef wstring_convert<codecvt_utf8<Char32T>, Char32T> Convert; + Convert convert; + + try + { + Convert::wide_string ws = convert.from_bytes(reinterpret_cast<const char*>(&source.front()), + reinterpret_cast<const char*>(&source.front() + source.size())); + + result = vector<unsigned int>(reinterpret_cast<const unsigned int*>(ws.data()), + reinterpret_cast<const unsigned int*>(ws.data()) + ws.length()); + } + catch(const std::range_error& ex) + { + throw IllegalConversionException(__FILE__, __LINE__, ex.what()); + } + +#else + convertUTF8ToUTF32(source, result); +#endif + } + return result; +} + + +vector<Byte> +IceUtilInternal::fromUTF32(const vector<unsigned int>& source) +{ + vector<Byte> result; + if(!source.empty()) + { + +#ifdef ICE_HAS_CODECVT_UTF8 + assert(sizeof(Char32T) == sizeof(unsigned int)); + + typedef wstring_convert<codecvt_utf8<Char32T>, Char32T> Convert; + Convert convert; + + try + { + Convert::byte_string bs = convert.to_bytes(reinterpret_cast<const Char32T*>(&source.front()), + reinterpret_cast<const Char32T*>(&source.front() + source.size())); + + result = vector<Byte>(reinterpret_cast<const Byte*>(bs.data()), + reinterpret_cast<const Byte*>(bs.data()) + bs.length()); + } + catch(const std::range_error& ex) + { + throw IllegalConversionException(__FILE__, __LINE__, ex.what()); + } + +#else + convertUTF32ToUTF8(source, result); +#endif + } + return result; +} |