// ********************************************************************** // // Copyright (c) 2003-2017 ZeroC, Inc. All rights reserved. // // This copy of Ice is licensed to you under the terms described in the // ICE_LICENSE file included in this distribution. // // ********************************************************************** #include #include #include #include #ifdef ICE_HAS_CODECVT_UTF8 #include #include #else #include #endif using namespace IceUtil; using namespace IceUtilInternal; using namespace std; namespace { IceUtil::Mutex* processStringConverterMutex = 0; IceUtil::StringConverterPtr processStringConverter; IceUtil::WstringConverterPtr processWstringConverter; #ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC IceUtil::WstringConverterPtr unicodeWstringConverter; #endif #ifdef ICE_HAS_CODECVT_UTF8 template struct SelectCodeCvt; template<> struct SelectCodeCvt<2> { typedef std::codecvt_utf8_utf16 Type; }; template<> struct SelectCodeCvt<4> { typedef std::codecvt_utf8 Type; }; class UnicodeWstringConverter : public WstringConverter { public: #if defined(_MSC_VER) && (_MSC_VER <= 1800) // // VS 2013 needs a default ctor // UnicodeWstringConverter() { } #endif virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const { // // Max bytes for a character encoding in UTF-8 is 4, // however MSVC returns 6 // #ifdef _MSC_VER assert(_codecvt.max_length() == 4 || _codecvt.max_length() == 6); #else assert(_codecvt.max_length() == 4); #endif if(sourceStart == sourceEnd) { return buffer.getMoreBytes(1, 0); } char* targetStart = 0; char* targetEnd = 0; char* targetNext = 0; mbstate_t state = mbstate_t(); // must be initialized! const wchar_t* sourceNext = sourceStart; bool more = false; // // The number of bytes we request from buffer for each remaining source character // size_t factor = 2; do { assert(factor <= 4); const size_t chunkSize = std::max((sourceEnd - sourceStart) * factor, 4); ++factor; // at the next round, we'll allocate more bytes per remaining source character targetStart = reinterpret_cast(buffer.getMoreBytes(chunkSize, reinterpret_cast(targetNext))); targetEnd = targetStart + chunkSize; targetNext = targetStart; codecvt_base::result result = _codecvt.out(state, sourceStart, sourceEnd, sourceNext, targetStart, targetEnd, targetNext); switch(result) { case codecvt_base::ok: // // MSVC returns ok when target is exhausted // more = sourceNext < sourceEnd; break; case codecvt_base::partial: // // clang/libc++ and g++5 return partial when target is exhausted // more = true; assert(sourceNext < sourceEnd); break; case codecvt_base::noconv: // // Unexpected // assert(0); throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out noconv"); default: throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out error"); } if(targetStart == targetNext) { // We didn't convert a single character throw IllegalConversionException(__FILE__, __LINE__, "no character converted by codecvt.out"); } sourceStart = sourceNext; } while (more); return reinterpret_cast(targetNext); } virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const { const size_t sourceSize = sourceEnd - sourceStart; if(sourceSize == 0) { target = L""; } else { target.resize(sourceSize); wchar_t* targetStart = const_cast(target.data()); wchar_t* targetEnd = targetStart + sourceSize; wchar_t* targetNext = targetStart; const char* sourceNext = reinterpret_cast(sourceStart); mbstate_t state = mbstate_t(); codecvt_base::result result = _codecvt.in(state, reinterpret_cast(sourceStart), reinterpret_cast(sourceEnd), sourceNext, targetStart, targetEnd, targetNext); if(result != codecvt_base::ok) { throw IllegalConversionException(__FILE__, __LINE__, "codecvt.in failure"); } target.resize(targetNext - targetStart); } } private: typedef SelectCodeCvt::Type CodeCvt; const CodeCvt _codecvt; }; #else class UnicodeWstringConverter : public WstringConverter { public: virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const { if(sourceStart == sourceEnd) { return buffer.getMoreBytes(1, 0); } Byte* targetStart = 0; Byte* targetEnd = 0; // // The number of bytes we request from buffer for each remaining source character // size_t factor = 2; do { assert(factor <= 4); const size_t chunkSize = std::max((sourceEnd - sourceStart) * factor, 4); ++factor; // at the next round, we'll allocate more bytes per remaining source character targetStart = buffer.getMoreBytes(chunkSize, targetStart); targetEnd = targetStart + chunkSize; } while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false); return targetStart; } virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const { if(sourceStart == sourceEnd) { target = L""; } else { convertUTF8ToUTFWstring(sourceStart, sourceEnd, target); } } }; #endif class Init { public: Init() { processStringConverterMutex = new IceUtil::Mutex; #ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter); #endif } ~Init() { delete processStringConverterMutex; processStringConverterMutex = 0; } }; Init init; const WstringConverterPtr& getUnicodeWstringConverter() { #ifdef ICE_HAS_THREAD_SAFE_LOCAL_STATIC static const WstringConverterPtr unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter); #endif return unicodeWstringConverter; } class UTF8BufferI : public UTF8Buffer { public: // // Returns the first unused byte in the resized buffer // Byte* getMoreBytes(size_t howMany, Byte* firstUnused) { size_t bytesUsed = 0; if(firstUnused != 0) { bytesUsed = firstUnused - reinterpret_cast(_buffer.data()); } if(_buffer.size() < howMany + bytesUsed) { _buffer.resize(bytesUsed + howMany); } return const_cast(reinterpret_cast(_buffer.data())) + bytesUsed; } void swap(string& other, const Byte* tail) { assert(tail >= reinterpret_cast(_buffer.data())); _buffer.resize(tail - reinterpret_cast(_buffer.data())); other.swap(_buffer); } private: string _buffer; }; } IceUtil::UTF8Buffer::~UTF8Buffer() { // Out of line to avoid weak vtable } WstringConverterPtr IceUtil::createUnicodeWstringConverter() { return getUnicodeWstringConverter(); } StringConverterPtr IceUtil::getProcessStringConverter() { IceUtilInternal::MutexPtrLock lock(processStringConverterMutex); return processStringConverter; } void IceUtil::setProcessStringConverter(const StringConverterPtr& converter) { IceUtilInternal::MutexPtrLock lock(processStringConverterMutex); processStringConverter = converter; } WstringConverterPtr IceUtil::getProcessWstringConverter() { IceUtilInternal::MutexPtrLock lock(processStringConverterMutex); if(processWstringConverter) { return processWstringConverter; } else { return getUnicodeWstringConverter(); } } void IceUtil::setProcessWstringConverter(const WstringConverterPtr& converter) { IceUtilInternal::MutexPtrLock lock(processStringConverterMutex); processWstringConverter = converter; } string IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter) { string target; if(!v.empty()) { const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter(); // // First convert to UTF-8 narrow string. // UTF8BufferI buffer; Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer); buffer.swap(target, last); // // If narrow string converter is present convert to the native narrow string encoding, otherwise // native narrow string encoding is UTF8 and we are done. // if(converter) { string tmp; converter->fromUTF8(reinterpret_cast(target.data()), reinterpret_cast(target.data() + target.size()), tmp); tmp.swap(target); } } return target; } wstring IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter) { wstring target; if(!v.empty()) { // // If there is a narrow string converter use it to convert to UTF8, otherwise the narrow // string is already UTF8 encoded. // string tmp; if(converter) { UTF8BufferI buffer; Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer); buffer.swap(tmp, last); } else { tmp = v; } const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter(); // // Convert from UTF-8 to the wide string encoding // wConverterWithDefault->fromUTF8(reinterpret_cast(tmp.data()), reinterpret_cast(tmp.data() + tmp.size()), target); } return target; } string IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& converter) { if(!converter || str.empty()) { return str; } UTF8BufferI buffer; Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer); string result; buffer.swap(result, last); return result; } string IceUtil::UTF8ToNative(const string& str, const IceUtil::StringConverterPtr& converter) { if(!converter || str.empty()) { return str; } string tmp; converter->fromUTF8(reinterpret_cast(str.data()), reinterpret_cast(str.data() + str.size()), tmp); return tmp; } #ifdef ICE_HAS_CODECVT_UTF8 #if defined(_MSC_VER) && (_MSC_VER >= 1900) // // Workaround for compiler bug - see http://stackoverflow.com/questions/32055357 // typedef unsigned short Char16T; typedef unsigned int Char32T; #else typedef char16_t Char16T; typedef char32_t Char32T; #endif #endif vector IceUtilInternal::toUTF16(const vector& source) { vector result; if(!source.empty()) { #ifdef ICE_HAS_CODECVT_UTF8 assert(sizeof(Char16T) == sizeof(unsigned short)); typedef wstring_convert, Char16T> Convert; Convert convert; try { Convert::wide_string ws = convert.from_bytes(reinterpret_cast(&source.front()), reinterpret_cast(&source.front() + source.size())); result = vector(reinterpret_cast(ws.data()), reinterpret_cast(ws.data()) + ws.length()); } catch(const std::range_error& ex) { throw IllegalConversionException(__FILE__, __LINE__, ex.what()); } #else convertUTF8ToUTF16(source, result); #endif } return result; } vector IceUtilInternal::toUTF32(const vector& source) { vector result; if(!source.empty()) { #ifdef ICE_HAS_CODECVT_UTF8 assert(sizeof(Char32T) == sizeof(unsigned int)); typedef wstring_convert, Char32T> Convert; Convert convert; try { Convert::wide_string ws = convert.from_bytes(reinterpret_cast(&source.front()), reinterpret_cast(&source.front() + source.size())); result = vector(reinterpret_cast(ws.data()), reinterpret_cast(ws.data()) + ws.length()); } catch(const std::range_error& ex) { throw IllegalConversionException(__FILE__, __LINE__, ex.what()); } #else convertUTF8ToUTF32(source, result); #endif } return result; } vector IceUtilInternal::fromUTF32(const vector& source) { vector result; if(!source.empty()) { #ifdef ICE_HAS_CODECVT_UTF8 assert(sizeof(Char32T) == sizeof(unsigned int)); typedef wstring_convert, Char32T> Convert; Convert convert; try { Convert::byte_string bs = convert.to_bytes(reinterpret_cast(&source.front()), reinterpret_cast(&source.front() + source.size())); result = vector(reinterpret_cast(bs.data()), reinterpret_cast(bs.data()) + bs.length()); } catch(const std::range_error& ex) { throw IllegalConversionException(__FILE__, __LINE__, ex.what()); } #else convertUTF32ToUTF8(source, result); #endif } return result; } #ifdef _WIN32 namespace { // // Converts to/from UTF-8 using MultiByteToWideChar and WideCharToMultiByte // class WindowsStringConverter : public StringConverter { public: explicit WindowsStringConverter(unsigned int); virtual Byte* toUTF8(const char*, const char*, UTF8Buffer&) const; virtual void fromUTF8(const Byte*, const Byte*, string& target) const; private: unsigned int _cp; }; WindowsStringConverter::WindowsStringConverter(unsigned int cp) : _cp(cp) { } Byte* WindowsStringConverter::toUTF8(const char* sourceStart, const char* sourceEnd, UTF8Buffer& buffer) const { // // First convert to UTF-16 // int sourceSize = static_cast(sourceEnd - sourceStart); if(sourceSize == 0) { return buffer.getMoreBytes(1, 0); } int writtenWchar = 0; wstring wbuffer; // // The following code pages doesn't support MB_ERR_INVALID_CHARS flag // see http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx // DWORD flags = (_cp == 50220 || _cp == 50221 || _cp == 50222 || _cp == 50225 || _cp == 50227 || _cp == 50229 || _cp == 65000 || _cp == 42 || (_cp >= 57002 && _cp <= 57011)) ? 0 : MB_ERR_INVALID_CHARS; do { wbuffer.resize(wbuffer.size() == 0 ? sourceSize + 2 : 2 * wbuffer.size()); writtenWchar = MultiByteToWideChar(_cp, flags, sourceStart, sourceSize, const_cast(wbuffer.data()), static_cast(wbuffer.size())); } while(writtenWchar == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER); if(writtenWchar == 0) { throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString()); } wbuffer.resize(static_cast(writtenWchar)); // // Then convert this UTF-16 wbuffer into UTF-8 // return getUnicodeWstringConverter()->toUTF8(wbuffer.data(), wbuffer.data() + wbuffer.size(), buffer); } void WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, string& target) const { if(sourceStart == sourceEnd) { target = ""; return; } if(_cp == CP_UTF8) { string tmp(reinterpret_cast(sourceStart), sourceEnd - sourceStart); tmp.swap(target); return; } // // First convert to wstring (UTF-16) // wstring wtarget; getUnicodeWstringConverter()->fromUTF8(sourceStart, sourceEnd, wtarget); // // WC_ERR_INVALID_CHARS conversion flag is only supported with 65001 (UTF-8) and // 54936 (GB18030 Simplified Chinese) // DWORD flags = (_cp == 65001 || _cp == 54936) ? WC_ERR_INVALID_CHARS : 0; // // And then to a multi-byte narrow string // int writtenChar = -1; do { target.resize(writtenChar == -1 ? std::max(sourceEnd - sourceStart + 2, target.size()) : 2 * target.size()); writtenChar = WideCharToMultiByte(_cp, flags, wtarget.data(), static_cast(wtarget.size()), const_cast(target.data()), static_cast(target.size()), 0, 0); } while(writtenChar == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER); if(writtenChar == 0) { throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString()); } target.resize(static_cast(writtenChar)); } } StringConverterPtr IceUtil::createWindowsStringConverter(unsigned int cp) { return ICE_MAKE_SHARED(WindowsStringConverter, cp); } #endif