summaryrefslogtreecommitdiff
path: root/cpp/src/IceUtil/StringConverter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'cpp/src/IceUtil/StringConverter.cpp')
-rw-r--r--cpp/src/IceUtil/StringConverter.cpp586
1 files changed, 426 insertions, 160 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp
index 46c590c42f3..8b60c48d53f 100644
--- a/cpp/src/IceUtil/StringConverter.cpp
+++ b/cpp/src/IceUtil/StringConverter.cpp
@@ -12,7 +12,13 @@
#include <IceUtil/Mutex.h>
#include <IceUtil/ScopedArray.h>
#include <IceUtil/StringUtil.h>
+
+#ifdef ICE_HAS_CODECVT_UTF8
+#include <codecvt>
+#include <locale>
+#else
#include <IceUtil/Unicode.h>
+#endif
using namespace IceUtil;
using namespace IceUtilInternal;
@@ -25,6 +31,234 @@ IceUtil::Mutex* processStringConverterMutex = 0;
IceUtil::StringConverterPtr processStringConverter;
IceUtil::WstringConverterPtr processWstringConverter;
+#ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC
+IceUtil::WstringConverterPtr unicodeWstringConverter;
+#endif
+
+#ifdef ICE_HAS_CODECVT_UTF8
+
+//
+// Compile-time selection of the std::codecvt facet used to convert between
+// wchar_t and UTF-8, keyed on the size of wchar_t (it is instantiated with
+// sizeof(wchar_t) by UnicodeWstringConverter). The primary template is only
+// declared; just the 2-byte and 4-byte specializations below are defined.
+//
+template<size_t wcharSize>
+struct SelectCodeCvt;
+
+//
+// 2-byte wchar_t: UTF-8 <-> UTF-16 facet. The little_endian mode is requested
+// only when ICE_LITTLE_ENDIAN is defined.
+//
+template<>
+struct SelectCodeCvt<2>
+{
+#ifdef ICE_LITTLE_ENDIAN
+ typedef std::codecvt_utf8_utf16<wchar_t, 0x10ffff, little_endian> Type;
+#else
+ typedef std::codecvt_utf8_utf16<wchar_t> Type;
+#endif
+};
+
+//
+// 4-byte wchar_t: std::codecvt_utf8 facet with its default template arguments.
+//
+template<>
+struct SelectCodeCvt<4>
+{
+ typedef std::codecvt_utf8<wchar_t> Type;
+};
+
+//
+// WstringConverter implementation used when ICE_HAS_CODECVT_UTF8 is defined.
+// It converts with the std::codecvt facet chosen by SelectCodeCvt<sizeof(wchar_t)>.
+//
+class UnicodeWstringConverter : public WstringConverter
+{
+public:
+
+#if defined(_MSC_VER) && (_MSC_VER <= 1800)
+ //
+ // VS 2013 needs a default ctor
+ //
+ UnicodeWstringConverter()
+ {
+ }
+#endif
+
+ //
+ // Converts the wide characters in [sourceStart, sourceEnd) to UTF-8, writing
+ // into storage obtained from buffer.getMoreBytes(), and returns the position
+ // just past the last byte produced by the facet.
+ //
+ // For an empty source range the result of buffer.getMoreBytes(1, 0) is returned.
+ //
+ // Throws IllegalConversionException when codecvt.out reports noconv or an
+ // error, or when a pass produces no output at all.
+ //
+ virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const
+ {
+ //
+ // Max bytes for a character encoding in UTF-8 is 4,
+ // however MSVC returns 6
+ //
+#ifdef _MSC_VER
+ assert(_codecvt.max_length() == 4 || _codecvt.max_length() == 6);
+#else
+ assert(_codecvt.max_length() == 4);
+#endif
+ if(sourceStart == sourceEnd)
+ {
+ return buffer.getMoreBytes(1, 0);
+ }
+
+ char* targetStart = 0;
+ char* targetEnd = 0;
+ char* targetNext = 0;
+
+ mbstate_t state = mbstate_t(); // must be initialized!
+ const wchar_t* sourceNext = sourceStart;
+
+ bool more = false;
+
+ //
+ // The number of bytes we request from buffer for each remaining source character
+ //
+ size_t factor = 2;
+
+ //
+ // Each pass asks the buffer for a new chunk, converts as much as fits, and
+ // resumes from the first unconsumed source character on the next pass.
+ //
+ do
+ {
+ assert(factor <= 4);
+ const size_t chunkSize = std::max<size_t>((sourceEnd - sourceStart) * factor, 4);
+ ++factor; // at the next round, we'll allocate more bytes per remaining source character
+
+ // targetNext is passed as the "first unused" position; it is 0 on the first pass
+ targetStart = reinterpret_cast<char*>(buffer.getMoreBytes(chunkSize, reinterpret_cast<Byte*>(targetNext)));
+ targetEnd = targetStart + chunkSize;
+ targetNext = targetStart;
+
+ codecvt_base::result result =
+ _codecvt.out(state, sourceStart, sourceEnd, sourceNext, targetStart, targetEnd, targetNext);
+
+ switch(result)
+ {
+ case codecvt_base::ok:
+ //
+ // MSVC returns ok when target is exhausted
+ //
+ more = sourceNext < sourceEnd;
+ break;
+
+ case codecvt_base::partial:
+ //
+ // clang/libc++ and g++5 return partial when target is exhausted
+ //
+ more = true;
+ assert(sourceNext < sourceEnd);
+ break;
+
+ case codecvt_base::noconv:
+ //
+ // Unexpected
+ //
+ assert(0);
+ throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out noconv");
+
+ default:
+ throw IllegalConversionException(__FILE__, __LINE__, "codecvt.out error");
+ }
+
+ if(targetStart == targetNext)
+ {
+ // We didn't convert a single character
+ throw IllegalConversionException(__FILE__, __LINE__,
+ "no character converted by codecvt.out");
+ }
+
+ sourceStart = sourceNext; // continue after the last source character consumed
+ } while (more);
+
+ return reinterpret_cast<Byte*>(targetNext);
+ }
+
+ //
+ // Converts the UTF-8 bytes in [sourceStart, sourceEnd) and stores the result
+ // in target; an empty range sets target to the empty string.
+ //
+ // A std::range_error raised by wstring_convert is rethrown as
+ // IllegalConversionException carrying the same message.
+ //
+ virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
+ {
+ if(sourceStart == sourceEnd)
+ {
+ target = L"";
+ }
+ else
+ {
+ //
+ // TODO: consider reimplementing without the wstring_convert helper
+ // to improve performance
+ // Note that wstring_convert is "stateful" and cannot be a shared data member
+ //
+ wstring_convert<CodeCvt> convert;
+
+ try
+ {
+ target = convert.from_bytes(reinterpret_cast<const char*>(sourceStart),
+ reinterpret_cast<const char*>(sourceEnd));
+ }
+ catch(const std::range_error& ex)
+ {
+ throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+ }
+ }
+ }
+
+private:
+
+ // Facet type matching the platform's wchar_t size
+ typedef SelectCodeCvt<sizeof(wchar_t)>::Type CodeCvt;
+ // Facet instance used by toUTF8; fromUTF8 builds its own wstring_convert instead
+ const CodeCvt _codecvt;
+};
+
+#else
+
+//
+// WstringConverter implementation used when ICE_HAS_CODECVT_UTF8 is not defined.
+// It delegates to convertUTFWstringToUTF8 and convertUTF8ToUTFWstring, which are
+// not defined in this file (presumably declared in IceUtil/Unicode.h, the header
+// included in the same #else configuration - confirm).
+//
+class UnicodeWstringConverter : public WstringConverter
+{
+public:
+
+ //
+ // Converts the wide characters in [sourceStart, sourceEnd) to UTF-8 using
+ // storage obtained from buffer.getMoreBytes(), and returns targetStart as
+ // left by the last convertUTFWstringToUTF8 call.
+ //
+ // For an empty source range the result of buffer.getMoreBytes(1, 0) is returned.
+ //
+ // NOTE(review): the loop assumes convertUTFWstringToUTF8 advances sourceStart
+ // and targetStart and returns false when the target chunk is too small -
+ // confirm against its declaration.
+ //
+ virtual Byte* toUTF8(const wchar_t* sourceStart, const wchar_t* sourceEnd, UTF8Buffer& buffer) const
+ {
+ if(sourceStart == sourceEnd)
+ {
+ return buffer.getMoreBytes(1, 0);
+ }
+
+ Byte* targetStart = 0;
+ Byte* targetEnd = 0;
+
+ //
+ // The number of bytes we request from buffer for each remaining source character
+ //
+ size_t factor = 2;
+
+ //
+ // Request a new chunk and retry for as long as the conversion helper returns false
+ //
+ do
+ {
+ assert(factor <= 4);
+ const size_t chunkSize = std::max<size_t>((sourceEnd - sourceStart) * factor, 4);
+ ++factor; // at the next round, we'll allocate more bytes per remaining source character
+
+ // targetStart doubles as the "first unused" position; it is 0 on the first pass
+ targetStart = buffer.getMoreBytes(chunkSize, targetStart);
+ targetEnd = targetStart + chunkSize;
+
+ }
+ while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false);
+
+ return targetStart;
+ }
+
+
+ //
+ // Converts the UTF-8 bytes in [sourceStart, sourceEnd) into target through
+ // convertUTF8ToUTFWstring; an empty range sets target to the empty string.
+ //
+ virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
+ {
+ if(sourceStart == sourceEnd)
+ {
+ target = L"";
+ }
+ else
+ {
+ convertUTF8ToUTFWstring(sourceStart, sourceEnd, target);
+ }
+ }
+};
+
+#endif
+
+#ifdef _WIN32
+
+//
+// Converts to/from UTF-8 using MultiByteToWideChar and WideCharToMultiByte
+//
+// The member functions are only declared here; they are defined further down
+// in this file inside the _WIN32 section.
+//
+class WindowsStringConverter : public StringConverter
+{
+public:
+
+ // The argument is the code page; the constructor stores it in _cp
+ explicit WindowsStringConverter(unsigned int);
+
+ // Narrow string in code page _cp -> UTF-8 written through the UTF8Buffer
+ virtual Byte* toUTF8(const char*, const char*, UTF8Buffer&) const;
+
+ // UTF-8 bytes -> narrow string in code page _cp, stored in target
+ virtual void fromUTF8(const Byte*, const Byte*, string& target) const;
+
+private:
+ unsigned int _cp; // code page given at construction
+};
+#endif
+
+
class Init
{
public:
@@ -32,6 +266,9 @@ public:
Init()
{
processStringConverterMutex = new IceUtil::Mutex;
+#ifndef ICE_HAS_THREAD_SAFE_LOCAL_STATIC
+ unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter);
+#endif
}
~Init()
@@ -43,10 +280,16 @@ public:
Init init;
-}
-namespace
+//
+// Returns a reference to the shared UnicodeWstringConverter.
+//
+// With ICE_HAS_THREAD_SAFE_LOCAL_STATIC the instance is a function-local static
+// created on the first call. Otherwise the name resolves to the file-level
+// unicodeWstringConverter variable, which the Init constructor assigns.
+//
+const WstringConverterPtr&
+getUnicodeWstringConverter()
{
+#ifdef ICE_HAS_THREAD_SAFE_LOCAL_STATIC
+ static const WstringConverterPtr unicodeWstringConverter = ICE_MAKE_SHARED(UnicodeWstringConverter);
+#endif
+ return unicodeWstringConverter;
+}
+
class UTF8BufferI : public UTF8Buffer
{
@@ -68,18 +311,27 @@ public:
if(_buffer == 0)
{
_buffer = static_cast<Byte*>(malloc(howMany));
+ if(!_buffer)
+ {
+ throw std::bad_alloc();
+ }
}
else
{
assert(firstUnused != 0);
_offset = firstUnused - _buffer;
- _buffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany));
- }
-
- if(!_buffer)
- {
- throw std::bad_alloc();
+ Byte* newBuffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany));
+ if(!newBuffer)
+ {
+ reset();
+ throw std::bad_alloc();
+ }
+ else
+ {
+ _buffer = newBuffer;
+ }
}
+
return _buffer + _offset;
}
@@ -94,94 +346,13 @@ public:
_buffer = 0;
_offset = 0;
}
-
+
private:
- IceUtil::Byte* _buffer;
+ Byte* _buffer;
size_t _offset;
};
-}
-
-
-
-UnicodeWstringConverter::UnicodeWstringConverter(ConversionFlags flags) :
- _conversionFlags(flags)
-{
-}
-
-Byte*
-UnicodeWstringConverter::toUTF8(const wchar_t* sourceStart,
- const wchar_t* sourceEnd,
- UTF8Buffer& buffer) const
-{
- //
- // The "chunk size" is the maximum of the number of characters in the
- // source and 6 (== max bytes necessary to encode one Unicode character).
- //
- size_t chunkSize = std::max<size_t>(static_cast<size_t>(sourceEnd - sourceStart), 6);
-
- Byte* targetStart = buffer.getMoreBytes(chunkSize, 0);
- Byte* targetEnd = targetStart + chunkSize;
-
- ConversionResult result;
-
- while((result =
- convertUTFWstringToUTF8(sourceStart, sourceEnd,
- targetStart, targetEnd, _conversionFlags))
- == targetExhausted)
- {
- targetStart = buffer.getMoreBytes(chunkSize, targetStart);
- targetEnd = targetStart + chunkSize;
- }
-
- switch(result)
- {
- case conversionOK:
- break;
- case sourceExhausted:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source exhausted");
- case sourceIllegal:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source illegal");
- default:
- {
- assert(0);
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
- }
- }
- return targetStart;
-}
-
-
-void
-UnicodeWstringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd,
- wstring& target) const
-{
- if(sourceStart == sourceEnd)
- {
- target = L"";
- return;
- }
-
- ConversionResult result =
- convertUTF8ToUTFWstring(sourceStart, sourceEnd, target, _conversionFlags);
-
- switch(result)
- {
- case conversionOK:
- break;
- case sourceExhausted:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source exhausted");
- case sourceIllegal:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source illegal");
- default:
- {
- assert(0);
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
- }
- }
-}
-
#ifdef _WIN32
WindowsStringConverter::WindowsStringConverter(unsigned int cp) :
_cp(cp)
@@ -205,16 +376,16 @@ WindowsStringConverter::toUTF8(const char* sourceStart,
int size = 0;
int writtenWchar = 0;
ScopedArray<wchar_t> wbuffer;
-
+
//
 // The following code pages don't support the MB_ERR_INVALID_CHARS flag
// see http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx
//
DWORD flags =
(_cp == 50220 || _cp == 50221 || _cp == 50222 ||
- _cp == 50225 || _cp == 50227 || _cp == 50229 ||
+ _cp == 50225 || _cp == 50227 || _cp == 50229 ||
_cp == 65000 || _cp == 42 || (_cp >= 57002 && _cp <= 57011)) ? 0 : MB_ERR_INVALID_CHARS;
-
+
do
{
size = size == 0 ? sourceSize + 2 : 2 * size;
@@ -232,7 +403,7 @@ WindowsStringConverter::toUTF8(const char* sourceStart,
//
// Then convert this UTF-16 wbuffer into UTF-8
//
- return _unicodeWstringConverter.toUTF8(wbuffer.get(), wbuffer.get() + writtenWchar, buffer);
+ return getUnicodeWstringConverter()->toUTF8(wbuffer.get(), wbuffer.get() + writtenWchar, buffer);
}
void
@@ -256,7 +427,7 @@ WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd,
// First convert to wstring (UTF-16)
//
wstring wtarget;
- _unicodeWstringConverter.fromUTF8(sourceStart, sourceEnd, wtarget);
+ getUnicodeWstringConverter()->fromUTF8(sourceStart, sourceEnd, wtarget);
//
// WC_ERR_INVALID_CHARS conversion flag is only supported with 65001 (UTF-8) and
@@ -284,7 +455,23 @@ WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd,
target.assign(buffer.get(), writtenChar);
}
+#endif
+
+}
+
+
+//
+// Returns the shared converter obtained from getUnicodeWstringConverter();
+// despite the name, no new converter object is created per call.
+//
+WstringConverterPtr
+IceUtil::createUnicodeWstringConverter()
+{
+ return getUnicodeWstringConverter();
+}
+#ifdef _WIN32
+//
+// Creates a new WindowsStringConverter for the code page cp.
+//
+StringConverterPtr
+IceUtil::createWindowsStringConverter(unsigned int cp)
+{
+ return ICE_MAKE_SHARED(WindowsStringConverter, cp);
+}
#endif
@@ -298,8 +485,8 @@ IceUtil::getProcessStringConverter()
void
IceUtil::setProcessStringConverter(const StringConverterPtr& converter)
{
- IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex);
- processStringConverter = converter;
+ IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex);
+ processStringConverter = converter;
}
WstringConverterPtr
@@ -312,61 +499,33 @@ IceUtil::getProcessWstringConverter()
void
IceUtil::setProcessWstringConverter(const WstringConverterPtr& converter)
{
- IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex);
- processWstringConverter = converter;
+ IceUtilInternal::MutexPtrLock<IceUtil::Mutex> lock(processStringConverterMutex);
+ processWstringConverter = converter;
}
string
-IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter,
- IceUtil::ConversionFlags flags)
+IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter)
{
string target;
if(!v.empty())
{
+ const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter();
+
//
- // First convert to UTF8 narrow string.
+ // First convert to UTF-8 narrow string.
//
- if(wConverter)
- {
- UTF8BufferI buffer;
- Byte* last = wConverter->toUTF8(v.data(), v.data() + v.size(), buffer);
- target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
- }
- else
- {
- size_t size = v.size() * 4 * sizeof(char);
-
- Byte* outBuf = new Byte[size];
- Byte* targetStart = outBuf;
- Byte* targetEnd = outBuf + size;
-
- const wchar_t* sourceStart = v.data();
-
- ConversionResult cr =
- convertUTFWstringToUTF8(
- sourceStart, sourceStart + v.size(),
- targetStart, targetEnd, flags);
-
- if(cr != conversionOK)
- {
- delete[] outBuf;
- assert(cr == sourceExhausted || cr == sourceIllegal);
- throw IllegalConversionException(__FILE__, __LINE__,
- cr == sourceExhausted ? "partial character" : "bad encoding");
- }
-
- target = string(reinterpret_cast<char*>(outBuf), static_cast<size_t>(targetStart - outBuf));
- delete[] outBuf;
- }
+ UTF8BufferI buffer;
+ Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer);
+ target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
//
- // If narrow string converter is present convert to the native narrow string encoding, otherwise
+ // If narrow string converter is present convert to the native narrow string encoding, otherwise
// native narrow string encoding is UTF8 and we are done.
//
if(converter)
{
string tmp;
- converter->fromUTF8(reinterpret_cast<const Byte*>(target.data()),
+ converter->fromUTF8(reinterpret_cast<const Byte*>(target.data()),
reinterpret_cast<const Byte*>(target.data() + target.size()), tmp);
tmp.swap(target);
}
@@ -375,8 +534,8 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
}
wstring
-IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
- const WstringConverterPtr& wConverter, IceUtil::ConversionFlags flags)
+IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
+ const WstringConverterPtr& wConverter)
{
wstring target;
if(!v.empty())
@@ -397,29 +556,14 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
tmp = v;
}
+ const WstringConverterPtr& wConverterWithDefault = wConverter ? wConverter : getUnicodeWstringConverter();
+
//
- // If there is a wide string converter use fromUTF8 to convert to the wide string native encoding.
+ // Convert from UTF-8 to the wide string encoding
//
- if(wConverter)
- {
- wConverter->fromUTF8(reinterpret_cast<const Byte*>(tmp.data()),
- reinterpret_cast<const Byte*>(tmp.data() + tmp.size()), target);
- }
- else
- {
- const Byte* sourceStart = reinterpret_cast<const Byte*>(tmp.data());
-
- ConversionResult cr =
- convertUTF8ToUTFWstring(sourceStart, sourceStart + tmp.size(), target, flags);
+ wConverterWithDefault->fromUTF8(reinterpret_cast<const Byte*>(tmp.data()),
+ reinterpret_cast<const Byte*>(tmp.data() + tmp.size()), target);
- if(cr != conversionOK)
- {
- assert(cr == sourceExhausted || cr == sourceIllegal);
-
- throw IllegalConversionException(__FILE__, __LINE__,
- cr == sourceExhausted ? "partial character" : "bad encoding");
- }
- }
}
return target;
}
@@ -430,7 +574,7 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv
if(!converter || str.empty())
{
return str;
- }
+ }
UTF8BufferI buffer;
Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer);
return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
@@ -448,3 +592,125 @@ IceUtil::UTF8ToNative(const string& str, const IceUtil::StringConverterPtr& conv
reinterpret_cast<const Byte*>(str.data() + str.size()), tmp);
return tmp;
}
+
+#ifdef ICE_HAS_CODECVT_UTF8
+
+//
+// Character types used below to instantiate the codecvt facets in toUTF16,
+// toUTF32 and fromUTF32.
+//
+#if defined(_MSC_VER) && (_MSC_VER == 1900)
+//
+// Workaround for compiler bug - see http://stackoverflow.com/questions/32055357
+//
+typedef unsigned short Char16T;
+typedef unsigned int Char32T;
+
+#else
+typedef char16_t Char16T;
+typedef char32_t Char32T;
+#endif
+
+#endif
+
+
+//
+// Converts the UTF-8 bytes in source to a vector of 16-bit code units.
+// An empty source yields an empty vector.
+//
+// With ICE_HAS_CODECVT_UTF8 the conversion uses wstring_convert, and a
+// std::range_error is rethrown as IllegalConversionException with the same
+// message. Otherwise the work is delegated to convertUTF8ToUTF16.
+//
+vector<unsigned short>
+IceUtilInternal::toUTF16(const vector<Byte>& source)
+{
+ vector<unsigned short> result;
+ if(!source.empty())
+ {
+
+#ifdef ICE_HAS_CODECVT_UTF8
+ // The reinterpret_casts below rely on both types having the same size
+ assert(sizeof(Char16T) == sizeof(unsigned short));
+
+#ifdef ICE_LITTLE_ENDIAN
+ typedef wstring_convert<codecvt_utf8_utf16<Char16T, 0x10ffff, little_endian>, Char16T> Convert;
+#else
+ typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert;
+#endif
+
+ Convert convert;
+
+ try
+ {
+ Convert::wide_string ws = convert.from_bytes(reinterpret_cast<const char*>(&source.front()),
+ reinterpret_cast<const char*>(&source.front() + source.size()));
+
+ // Copy the converted code units into the result vector
+ result = vector<unsigned short>(reinterpret_cast<const unsigned short*>(ws.data()),
+ reinterpret_cast<const unsigned short*>(ws.data()) + ws.length());
+ }
+ catch(const std::range_error& ex)
+ {
+ throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+ }
+
+#else
+ convertUTF8ToUTF16(source, result);
+#endif
+ }
+ return result;
+}
+
+//
+// Converts the UTF-8 bytes in source to a vector of 32-bit values.
+// An empty source yields an empty vector.
+//
+// With ICE_HAS_CODECVT_UTF8 the conversion uses wstring_convert, and a
+// std::range_error is rethrown as IllegalConversionException with the same
+// message. Otherwise the work is delegated to convertUTF8ToUTF32.
+//
+vector<unsigned int>
+IceUtilInternal::toUTF32(const vector<Byte>& source)
+{
+ vector<unsigned int> result;
+ if(!source.empty())
+ {
+
+#ifdef ICE_HAS_CODECVT_UTF8
+ // The reinterpret_casts below rely on both types having the same size
+ assert(sizeof(Char32T) == sizeof(unsigned int));
+
+ typedef wstring_convert<codecvt_utf8<Char32T>, Char32T> Convert;
+ Convert convert;
+
+ try
+ {
+ Convert::wide_string ws = convert.from_bytes(reinterpret_cast<const char*>(&source.front()),
+ reinterpret_cast<const char*>(&source.front() + source.size()));
+
+ // Copy the converted values into the result vector
+ result = vector<unsigned int>(reinterpret_cast<const unsigned int*>(ws.data()),
+ reinterpret_cast<const unsigned int*>(ws.data()) + ws.length());
+ }
+ catch(const std::range_error& ex)
+ {
+ throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+ }
+
+#else
+ convertUTF8ToUTF32(source, result);
+#endif
+ }
+ return result;
+}
+
+
+//
+// Converts the 32-bit values in source to a vector of UTF-8 bytes.
+// An empty source yields an empty vector.
+//
+// With ICE_HAS_CODECVT_UTF8 the conversion uses wstring_convert, and a
+// std::range_error is rethrown as IllegalConversionException with the same
+// message. Otherwise the work is delegated to convertUTF32ToUTF8.
+//
+vector<Byte>
+IceUtilInternal::fromUTF32(const vector<unsigned int>& source)
+{
+ vector<Byte> result;
+ if(!source.empty())
+ {
+
+#ifdef ICE_HAS_CODECVT_UTF8
+ // The reinterpret_casts below rely on both types having the same size
+ assert(sizeof(Char32T) == sizeof(unsigned int));
+
+ typedef wstring_convert<codecvt_utf8<Char32T>, Char32T> Convert;
+ Convert convert;
+
+ try
+ {
+ Convert::byte_string bs = convert.to_bytes(reinterpret_cast<const Char32T*>(&source.front()),
+ reinterpret_cast<const Char32T*>(&source.front() + source.size()));
+
+ // Copy the produced bytes into the result vector
+ result = vector<Byte>(reinterpret_cast<const Byte*>(bs.data()),
+ reinterpret_cast<const Byte*>(bs.data()) + bs.length());
+ }
+ catch(const std::range_error& ex)
+ {
+ throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+ }
+
+#else
+ convertUTF32ToUTF8(source, result);
+#endif
+ }
+ return result;
+}