// ********************************************************************** // // Copyright (c) 2003-2016 ZeroC, Inc. All rights reserved. // // This copy of Ice is licensed to you under the terms described in the // ICE_LICENSE file included in this distribution. // // ********************************************************************** #include #include #include #include #include #ifdef ICE_HAS_CODECVT_UTF8 #include #include #else #include #endif using namespace IceUtil; using namespace IceUtilInternal; using namespace std; namespace { IceUtil::Mutex* processStringConverterMutex = 0; IceUtil::StringConverterPtr processStringConverter; IceUtil::WstringConverterPtr processWstringConverter; #ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC IceUtil::WstringConverterPtr unicodeWstringConverter; #endif #ifdef ICE_HAS_CODECVT_UTF8 template struct SelectCodeCvt; template<> struct SelectCodeCvt<2> { #ifdef ICE_LITTLE_ENDIAN typedef std::codecvt_utf8_utf16 Type; #else typedef std::codecvt_utf8_utf16 Type; #endif }; template<> struct SelectCodeCvt<4> { typedef std::codecvt_utf8 Type; }; class UnicodeWstringConverter : public WstringConverter { public: #if defined(_MSC_VER) && (_MSC_VER <= 1800) // // VS 2013 needs a default ctor // UnicodeWstringConverter() { } #endif virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const { // // Max bytes for a character encoding in UTF-8 is 4, // however MSVC returns 6 // #ifdef _MSC_VER assert(_codecvt.max_length() == 4 || _codecvt.max_length() == 6); #else assert(_codecvt.max_length() == 4); #endif if(sourceStart == sourceEnd) { return buffer.getMoreBytes(1, 0); } char* targetStart = 0; char* targetEnd = 0; char* targetNext = 0; mbstate_t state = mbstate_t(); // must be initialized! const wchar_t* sourceNext = sourceStart; bool more = false; // // The number of bytes we request from buffer for each remaining source character // size_t factor = 2; do { assert(factor <= 4); const size_t chunkSize = std::max((sourceEnd - sourceStart) * factor, 4); ++factor; // at the next round, we'll allocate more bytes per remaining source character targetStart = reinterpret_cast(buffer.getMoreBytes(chunkSize, reinterpret_cast(targetNext))); targetEnd = targetStart + chunkSize; targetNext = targetStart; codecvt_base::result result = _codecvt.out(state, sourceStart, sourceEnd, sourceNext, targetStart, targetEnd, targetNext); switch(result) { case codecvt_base::ok: // // MSVC returns ok when target is exhausted // more = sourceNext < sourceEnd; break; case codecvt_base::partial: // // clang/libc++ and g++5 return partial when target is exhausted // more = true; assert(sourceNext < sourceEnd); break; case codecvt_base::noconv: // // Unexpected // assert(0); throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out noconv"); default: throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out error"); } if(targetStart == targetNext) { // We didn't convert a single character throw IllegalConversionException(__FILE__, __LINE__, "no character converted by codecvt.out"); } sourceStart = sourceNext; } while (more); return reinterpret_cast(targetNext); } virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const { if(sourceStart == sourceEnd) { target = L""; } else { // // TODO: consider reimplementing without the wstring_convert helper // to improve performance // Note that wstring_convert is "stateful" and cannot be a shared data member // wstring_convert convert; try { target = convert.from_bytes(reinterpret_cast(sourceStart), reinterpret_cast(sourceEnd)); } catch(const std::range_error& ex) { throw IllegalConversionException(__FILE__, __LINE__, ex.what()); } } } private: typedef SelectCodeCvt::Type CodeCvt; const CodeCvt _codecvt; }; #else class UnicodeWstringConverter : public WstringConverter { public: virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const { if(sourceStart == sourceEnd) { return buffer.getMoreBytes(1, 0); } Byte* targetStart = 0; Byte* targetEnd = 0; // // The number of bytes we request from buffer for each remaining source character // size_t factor = 2; do { assert(factor <= 4); const size_t chunkSize = std::max((sourceEnd - sourceStart) * factor, 4); ++factor; // at the next round, we'll allocate more bytes per remaining source character targetStart = buffer.getMoreBytes(chunkSize, targetStart); targetEnd = targetStart + chunkSize; } while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false); return targetStart; } virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const { if(sourceStart == sourceEnd) { target = L""; } else { convertUTF8ToUTFWstring(sourceStart, sourceEnd, target); } } }; #endif #ifdef _WIN32 // // Converts to/from UTF-8 using MultiByteToWideChar and WideCharToMultiByte // class WindowsStringConverter : public StringConverter { public: explicit WindowsStringConverter(unsigned int); virtual Byte* toUTF8(const char*, const char*, UTF8Buffer&) const; virtual void fromUTF8(const Byte*, const Byte*, string& target) const; private: unsigned int _cp; }; #endif class Init { public: Init() { processStringConverterMutex = new IceUtil::Mutex; #ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter); #endif } ~Init() { delete processStringConverterMutex; processStringConverterMutex = 0; } }; Init init; const WstringConverterPtr& getUnicodeWstringConverter() { #ifdef ICE_HAS_THREAD_SAFE_LOCAL_STATIC static const WstringConverterPtr unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter); #endif return unicodeWstringConverter; } class UTF8BufferI : public UTF8Buffer { public: UTF8BufferI() : _buffer(0), _offset(0) { } ~UTF8BufferI() { free(_buffer); } Byte* getMoreBytes(size_t howMany, Byte* firstUnused) { if(_buffer == 0) { _buffer = static_cast(malloc(howMany)); if(!_buffer) { throw std::bad_alloc(); } } else { assert(firstUnused != 0); _offset = firstUnused - _buffer; Byte* newBuffer = static_cast(realloc(_buffer, _offset + howMany)); if(!newBuffer) { reset(); throw std::bad_alloc(); } else { _buffer = newBuffer; } } return _buffer + _offset; } Byte* getBuffer() { return _buffer; } void reset() { free(_buffer); _buffer = 0; _offset = 0; } private: Byte* _buffer; size_t _offset; }; #ifdef _WIN32 WindowsStringConverter::WindowsStringConverter(unsigned int cp) : _cp(cp) { } Byte* WindowsStringConverter::toUTF8(const char* sourceStart, const char* sourceEnd, UTF8Buffer& buffer) const { // // First convert to UTF-16 // int sourceSize = static_cast(sourceEnd - sourceStart); if(sourceSize == 0) { return buffer.getMoreBytes(1, 0); } int size = 0; int writtenWchar = 0; ScopedArray wbuffer; // // The following code pages doesn't support MB_ERR_INVALID_CHARS flag // see http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx // DWORD flags = (_cp == 50220 || _cp == 50221 || _cp == 50222 || _cp == 50225 || _cp == 50227 || _cp == 50229 || _cp == 65000 || _cp == 42 || (_cp >= 57002 && _cp <= 57011)) ? 0 : MB_ERR_INVALID_CHARS; do { size = size == 0 ? sourceSize + 2 : 2 * size; wbuffer.reset(new wchar_t[size]); writtenWchar = MultiByteToWideChar(_cp, flags, sourceStart, sourceSize, wbuffer.get(), size); } while(writtenWchar == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER); if(writtenWchar == 0) { throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString()); } // // Then convert this UTF-16 wbuffer into UTF-8 // return getUnicodeWstringConverter()->toUTF8(wbuffer.get(), wbuffer.get() + writtenWchar, buffer); } void WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, string& target) const { if(sourceStart == sourceEnd) { target = ""; return; } if(_cp == CP_UTF8) { string tmp(reinterpret_cast(sourceStart), sourceEnd - sourceStart); tmp.swap(target); return; } // // First convert to wstring (UTF-16) // wstring wtarget; getUnicodeWstringConverter()->fromUTF8(sourceStart, sourceEnd, wtarget); // // WC_ERR_INVALID_CHARS conversion flag is only supported with 65001 (UTF-8) and // 54936 (GB18030 Simplified Chinese) // DWORD flags = (_cp == 65001 || _cp == 54936) ? WC_ERR_INVALID_CHARS : 0; // // And then to a multi-byte narrow string // int size = 0; int writtenChar = 0; ScopedArray buffer; do { size = size == 0 ? static_cast(sourceEnd - sourceStart) + 2 : 2 * size; buffer.reset(new char[size]); writtenChar = WideCharToMultiByte(_cp, flags, wtarget.data(), static_cast(wtarget.size()), buffer.get(), size, 0, 0); } while(writtenChar == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER); if(writtenChar == 0) { throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString()); } target.assign(buffer.get(), writtenChar); } #endif } WstringConverterPtr IceUtil::createUnicodeWstringConverter() { return getUnicodeWstringConverter(); } #ifdef _WIN32 StringConverterPtr IceUtil::createWindowsStringConverter(unsigned int cp) { return ICE_MAKE_SHARED(WindowsStringConverter, cp); } #endif StringConverterPtr IceUtil::getProcessStringConverter() { IceUtilInternal::MutexPtrLock lock(processStringConverterMutex); return processStringConverter; } void IceUtil::setProcessStringConverter(const StringConverterPtr& converter) { IceUtilInternal::MutexPtrLock lock(processStringConverterMutex); processStringConverter = converter; } WstringConverterPtr IceUtil::getProcessWstringConverter() { IceUtilInternal::MutexPtrLock lock(processStringConverterMutex); return processWstringConverter; } void IceUtil::setProcessWstringConverter(const WstringConverterPtr& converter) { IceUtilInternal::MutexPtrLock lock(processStringConverterMutex); processWstringConverter = converter; } string IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter) { string target; if(!v.empty()) { const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter(); // // First convert to UTF-8 narrow string. // UTF8BufferI buffer; Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer); target = string(reinterpret_cast(buffer.getBuffer()), last - buffer.getBuffer()); // // If narrow string converter is present convert to the native narrow string encoding, otherwise // native narrow string encoding is UTF8 and we are done. // if(converter) { string tmp; converter->fromUTF8(reinterpret_cast(target.data()), reinterpret_cast(target.data() + target.size()), tmp); tmp.swap(target); } } return target; } wstring IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter) { wstring target; if(!v.empty()) { // // If there is a narrow string converter use it to convert to UTF8, otherwise the narrow // string is already UTF8 encoded. // string tmp; if(converter) { UTF8BufferI buffer; Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer); tmp = string(reinterpret_cast(buffer.getBuffer()), last - buffer.getBuffer()); } else { tmp = v; } const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter(); // // Convert from UTF-8 to the wide string encoding // wConverterWithDefault->fromUTF8(reinterpret_cast(tmp.data()), reinterpret_cast(tmp.data() + tmp.size()), target); } return target; } string IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& converter) { if(!converter || str.empty()) { return str; } UTF8BufferI buffer; Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer); return string(reinterpret_cast(buffer.getBuffer()), last - buffer.getBuffer()); } string IceUtil::UTF8ToNative(const string& str, const IceUtil::StringConverterPtr& converter) { if(!converter || str.empty()) { return str; } string tmp; converter->fromUTF8(reinterpret_cast(str.data()), reinterpret_cast(str.data() + str.size()), tmp); return tmp; } #ifdef ICE_HAS_CODECVT_UTF8 #if defined(_MSC_VER) && (_MSC_VER == 1900) // // Workaround for compiler bug - see http://stackoverflow.com/questions/32055357 // typedef unsigned short Char16T; typedef unsigned int Char32T; #else typedef char16_t Char16T; typedef char32_t Char32T; #endif #endif vector IceUtilInternal::toUTF16(const vector& source) { vector result; if(!source.empty()) { #ifdef ICE_HAS_CODECVT_UTF8 assert(sizeof(Char16T) == sizeof(unsigned short)); #ifdef ICE_LITTLE_ENDIAN typedef wstring_convert, Char16T> Convert; #else typedef wstring_convert, Char16T> Convert; #endif Convert convert; try { Convert::wide_string ws = convert.from_bytes(reinterpret_cast(&source.front()), reinterpret_cast(&source.front() + source.size())); result = vector(reinterpret_cast(ws.data()), reinterpret_cast(ws.data()) + ws.length()); } catch(const std::range_error& ex) { throw IllegalConversionException(__FILE__, __LINE__, ex.what()); } #else convertUTF8ToUTF16(source, result); #endif } return result; } vector IceUtilInternal::toUTF32(const vector& source) { vector result; if(!source.empty()) { #ifdef ICE_HAS_CODECVT_UTF8 assert(sizeof(Char32T) == sizeof(unsigned int)); typedef wstring_convert, Char32T> Convert; Convert convert; try { Convert::wide_string ws = convert.from_bytes(reinterpret_cast(&source.front()), reinterpret_cast(&source.front() + source.size())); result = vector(reinterpret_cast(ws.data()), reinterpret_cast(ws.data()) + ws.length()); } catch(const std::range_error& ex) { throw IllegalConversionException(__FILE__, __LINE__, ex.what()); } #else convertUTF8ToUTF32(source, result); #endif } return result; } vector IceUtilInternal::fromUTF32(const vector& source) { vector result; if(!source.empty()) { #ifdef ICE_HAS_CODECVT_UTF8 assert(sizeof(Char32T) == sizeof(unsigned int)); typedef wstring_convert, Char32T> Convert; Convert convert; try { Convert::byte_string bs = convert.to_bytes(reinterpret_cast(&source.front()), reinterpret_cast(&source.front() + source.size())); result = vector(reinterpret_cast(bs.data()), reinterpret_cast(bs.data()) + bs.length()); } catch(const std::range_error& ex) { throw IllegalConversionException(__FILE__, __LINE__, ex.what()); } #else convertUTF32ToUTF8(source, result); #endif } return result; }