summaryrefslogtreecommitdiff
path: root/cpp/src/IceUtil/StringConverter.cpp
diff options
context:
space:
mode:
authorJose <jose@zeroc.com>2014-05-02 19:56:38 +0200
committerJose <jose@zeroc.com>2014-05-02 19:56:38 +0200
commit1161c5817059464ab511632c0ce5d14593ced1a3 (patch)
tree51bbcdf2a4ea43c430312157350bb4271bc3f40d /cpp/src/IceUtil/StringConverter.cpp
parentUpdate .gitignore files (diff)
downloadice-1161c5817059464ab511632c0ce5d14593ced1a3.tar.bz2
ice-1161c5817059464ab511632c0ce5d14593ced1a3.tar.xz
ice-1161c5817059464ab511632c0ce5d14593ced1a3.zip
ICE-4851 - Use wstrings for input and output data that contain non-ASCII characters?
Diffstat (limited to 'cpp/src/IceUtil/StringConverter.cpp')
-rw-r--r--cpp/src/IceUtil/StringConverter.cpp478
1 files changed, 478 insertions, 0 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp
new file mode 100644
index 00000000000..2a57b719695
--- /dev/null
+++ b/cpp/src/IceUtil/StringConverter.cpp
@@ -0,0 +1,478 @@
+
+#include <IceUtil/StringConverter.h>
+#include <IceUtil/MutexPtrLock.h>
+#include <IceUtil/Mutex.h>
+#include <IceUtil/ScopedArray.h>
+#include <IceUtil/StringUtil.h>
+
+using namespace IceUtil;
+using namespace IceUtilInternal;
+using namespace std;
+
+//
+// File-local state: the process-wide converters, the mutex passed to the lock
+// in the get/setProcess*Converter functions, and the exception name string.
+//
+namespace
+{
+
+//
+// Mutex handed to MutexPtrLock by the four process converter accessors below.
+// It is a pointer so that it is null before Init's constructor runs and after
+// Init's destructor has run.
+//
+IceUtil::Mutex* processStringConverterMutex = 0;
+//
+// Converters returned by getProcessStringConverter / getProcessWstringConverter
+// and replaced by the matching setters.
+//
+IceUtil::StringConverterPtr processStringConverter;
+IceUtil::WstringConverterPtr processWstringConverter;
+
+//
+// Allocates the mutex when constructed and deletes it when destroyed. The
+// single instance 'init' below has static storage duration, so this happens
+// during static initialization and destruction of this translation unit.
+//
+class Init
+{
+public:
+
+ Init()
+ {
+ processStringConverterMutex = new IceUtil::Mutex;
+ }
+
+ ~Init()
+ {
+ delete processStringConverterMutex;
+ // Reset to null so a later reader sees "no mutex" rather than a dangling pointer.
+ processStringConverterMutex = 0;
+ }
+};
+
+// Must stay declared after processStringConverterMutex, which its constructor assigns.
+Init init;
+
+// Value returned by IllegalConversionException::ice_name().
+const char* __IceUtil__IllegalConversionException_name = "IceUtil::IllegalConversionException";
+
+}
+
+//
+// Constructs the exception with a source location only; _reason is not
+// explicitly initialized here.
+//
+IllegalConversionException::IllegalConversionException(const char* file, int line) :
+ ::IceUtil::Exception(file, line)
+{
+}
+
+//
+// Constructs the exception with a source location and a reason, which
+// ice_print() appends to the base class output and reason() returns.
+//
+IllegalConversionException::IllegalConversionException(const char* file, int line, const string& reason) :
+ ::IceUtil::Exception(file, line),
+ _reason(reason)
+{
+}
+
+IllegalConversionException::~IllegalConversionException() throw()
+{
+}
+
+//
+// Returns the fixed name "IceUtil::IllegalConversionException".
+//
+string
+IllegalConversionException::ice_name() const
+{
+ return __IceUtil__IllegalConversionException_name;
+}
+
+//
+// Writes the base class output followed by ": " and the reason. The ": "
+// separator is written even when the reason is empty.
+//
+void
+IllegalConversionException::ice_print(ostream& out) const
+{
+ Exception::ice_print(out);
+ out << ": " << _reason;
+}
+
+//
+// Returns a copy of this exception allocated with new.
+// NOTE(review): presumably the caller is responsible for deleting it - confirm
+// against the IceUtil::Exception contract.
+//
+IceUtil::IllegalConversionException*
+IllegalConversionException::ice_clone() const
+{
+ return new IllegalConversionException(*this);
+}
+
+//
+// Throws a copy of this exception as an IllegalConversionException.
+//
+void
+IllegalConversionException::ice_throw() const
+{
+ throw *this;
+}
+
+//
+// Returns the reason given at construction.
+//
+string
+IllegalConversionException::reason() const
+{
+ return _reason;
+}
+
+
+namespace
+{
+
+//
+// UTF8Buffer implementation backed by a single malloc/realloc'ed block.
+// The block is released by reset() and by the destructor.
+//
+class UTF8BufferI : public UTF8Buffer
+{
+public:
+
+    UTF8BufferI() :
+        _buffer(0),
+        _offset(0)
+    {
+    }
+
+    ~UTF8BufferI()
+    {
+        free(_buffer);
+    }
+
+    //
+    // Makes room for howMany more bytes and returns a pointer to the first
+    // of them.
+    //
+    // On the first call (no buffer yet) firstUnused is ignored. On later calls
+    // firstUnused must point into the current buffer, just past the last byte
+    // the caller has written; the bytes before it are preserved. Pointers
+    // obtained from earlier calls may be invalidated by the reallocation.
+    //
+    // Throws std::bad_alloc if the allocation fails. In that case the buffer
+    // and the recorded offset are left exactly as they were before the call.
+    //
+    Byte* getMoreBytes(size_t howMany, Byte* firstUnused)
+    {
+        if(_buffer == 0)
+        {
+            _buffer = static_cast<Byte*>(malloc(howMany));
+            if(_buffer == 0)
+            {
+                throw std::bad_alloc();
+            }
+        }
+        else
+        {
+            assert(firstUnused != 0);
+            const size_t used = static_cast<size_t>(firstUnused - _buffer);
+
+            //
+            // realloc leaves the original block untouched when it fails, so the
+            // result goes into a temporary first. Assigning it straight to
+            // _buffer would lose the only pointer to the block and leak it.
+            //
+            Byte* grown = static_cast<Byte*>(realloc(_buffer, used + howMany));
+            if(grown == 0)
+            {
+                throw std::bad_alloc();
+            }
+            _buffer = grown;
+            _offset = used;
+        }
+        return _buffer + _offset;
+    }
+
+    //
+    // Returns the start of the buffer, or null if nothing has been allocated.
+    //
+    Byte* getBuffer()
+    {
+        return _buffer;
+    }
+
+    //
+    // Releases the buffer and returns to the freshly-constructed state.
+    //
+    void reset()
+    {
+        free(_buffer);
+        _buffer = 0;
+        _offset = 0;
+    }
+
+private:
+
+    //
+    // Declared but not implemented: the object owns _buffer, so a member-wise
+    // copy would free the same block twice.
+    //
+    UTF8BufferI(const UTF8BufferI&);
+    UTF8BufferI& operator=(const UTF8BufferI&);
+
+    IceUtil::Byte* _buffer;
+    size_t _offset;
+};
+
+}
+
+//
+// Returns the narrow string converter currently installed for the process.
+// The lock object is built on processStringConverterMutex and lives until the
+// function returns.
+//
+StringConverterPtr
+IceUtil::getProcessStringConverter()
+{
+    typedef IceUtilInternal::MutexPtrLock<IceUtil::Mutex> ConverterLock;
+
+    ConverterLock sync(processStringConverterMutex);
+    return processStringConverter;
+}
+
+//
+// Installs 'converter' as the process narrow string converter, replacing the
+// previous one. The assignment is made while a lock object built on
+// processStringConverterMutex is alive.
+//
+void
+IceUtil::setProcessStringConverter(const StringConverterPtr& converter)
+{
+    typedef IceUtilInternal::MutexPtrLock<IceUtil::Mutex> ConverterLock;
+
+    ConverterLock sync(processStringConverterMutex);
+    processStringConverter = converter;
+}
+
+//
+// Returns the wide string converter currently installed for the process.
+// Uses the same mutex as the narrow string converter accessors.
+//
+WstringConverterPtr
+IceUtil::getProcessWstringConverter()
+{
+    typedef IceUtilInternal::MutexPtrLock<IceUtil::Mutex> ConverterLock;
+
+    ConverterLock sync(processStringConverterMutex);
+    return processWstringConverter;
+}
+
+//
+// Installs 'converter' as the process wide string converter, replacing the
+// previous one. Uses the same mutex as the narrow string converter accessors.
+//
+void
+IceUtil::setProcessWstringConverter(const WstringConverterPtr& converter)
+{
+    typedef IceUtilInternal::MutexPtrLock<IceUtil::Mutex> ConverterLock;
+
+    ConverterLock sync(processStringConverterMutex);
+    processWstringConverter = converter;
+}
+
+//
+// Converts 'str' from the native narrow encoding to UTF-8 using 'converter'.
+// With a null converter, or an empty string, the input is returned unchanged.
+//
+string
+IceUtil::nativeToUTF8(const IceUtil::StringConverterPtr& converter, const string& str)
+{
+    const bool passThrough = !converter || str.empty();
+    if(passThrough)
+    {
+        return str;
+    }
+
+    UTF8BufferI utf8;
+    const char* const first = str.data();
+    Byte* const end = converter->toUTF8(first, first + str.size(), utf8);
+
+    //
+    // The start of the buffer is only read after the conversion, because the
+    // buffer may have been reallocated while it was being filled.
+    //
+    const Byte* const begin = utf8.getBuffer();
+    return string(reinterpret_cast<const char*>(begin), end - begin);
+}
+
+//
+// Converts the UTF-8 string 'str' to the native narrow encoding using
+// 'converter'. With a null converter, or an empty string, the input is
+// returned unchanged.
+//
+string
+IceUtil::UTF8ToNative(const IceUtil::StringConverterPtr& converter, const string& str)
+{
+    const bool passThrough = !converter || str.empty();
+    if(passThrough)
+    {
+        return str;
+    }
+
+    const Byte* const first = reinterpret_cast<const Byte*>(str.data());
+    const Byte* const end = reinterpret_cast<const Byte*>(str.data() + str.size());
+
+    string native;
+    converter->fromUTF8(first, end, native);
+    return native;
+}
+
+//
+// Converts the wide string 'v' to a narrow string in the native encoding.
+//
+// The conversion goes through UTF-8: 'wConverter' (or, when it is null, the
+// built-in convertUTFWstringToUTF8 with lenient conversion) produces UTF-8, and
+// 'converter' then turns the UTF-8 into the native narrow encoding. When
+// 'converter' is null the UTF-8 string is returned as is.
+//
+// An empty input yields an empty result without calling either converter.
+//
+// Throws UTFConversionException if the built-in wide-to-UTF-8 conversion fails;
+// whatever the supplied converters throw is propagated.
+//
+string
+IceUtil::wnativeToNative(const StringConverterPtr& converter, const WstringConverterPtr& wConverter, const wstring& v)
+{
+    string target;
+    if(v.empty())
+    {
+        return target;
+    }
+
+    //
+    // First convert to UTF8 narrow string.
+    //
+    if(wConverter)
+    {
+        UTF8BufferI buffer;
+        Byte* last = wConverter->toUTF8(v.data(), v.data() + v.size(), buffer);
+        target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+    }
+    else
+    {
+        //
+        // Four bytes per wide character are reserved for the UTF-8 output.
+        //
+        const size_t size = v.size() * 4;
+
+        //
+        // The scratch buffer is held by a ScopedArray so that it is released on
+        // every exit path, including an exception thrown while the result
+        // string is being built.
+        //
+        ScopedArray<Byte> outBuf;
+        outBuf.reset(new Byte[size]);
+
+        Byte* targetStart = outBuf.get();
+        Byte* targetEnd = targetStart + size;
+
+        const wchar_t* sourceStart = v.data();
+
+        ConversionResult cr =
+            convertUTFWstringToUTF8(
+                sourceStart, sourceStart + v.size(),
+                targetStart, targetEnd, lenientConversion);
+
+        if(cr != conversionOK)
+        {
+            assert(cr == sourceExhausted || cr == sourceIllegal);
+            throw UTFConversionException(__FILE__, __LINE__,
+                                         cr == sourceExhausted ? partialCharacter : badEncoding);
+        }
+
+        //
+        // After the conversion targetStart is used as the end of the written bytes.
+        //
+        target = string(reinterpret_cast<char*>(outBuf.get()), static_cast<size_t>(targetStart - outBuf.get()));
+    }
+
+    //
+    // If narrow string converter is present convert to the native narrow string encoding, otherwise
+    // native narrow string encoding is UTF8 and we are done.
+    //
+    if(converter)
+    {
+        string tmp;
+        converter->fromUTF8(reinterpret_cast<const Byte*>(target.data()),
+                            reinterpret_cast<const Byte*>(target.data() + target.size()), tmp);
+        tmp.swap(target);
+    }
+    return target;
+}
+
+//
+// Converts the narrow string 'v' (native encoding) to a wide string.
+//
+// The conversion goes through UTF-8: 'converter' turns the native narrow string
+// into UTF-8 (when it is null the input is taken to be UTF-8 already), and
+// 'wConverter' (or, when it is null, the built-in convertUTF8ToUTFWstring with
+// lenient conversion) produces the wide string.
+//
+// An empty input yields an empty result without calling either converter.
+//
+// Throws UTFConversionException if the built-in UTF-8-to-wide conversion fails;
+// whatever the supplied converters throw is propagated.
+//
+wstring
+IceUtil::nativeToWnative(const StringConverterPtr& converter, const WstringConverterPtr& wConverter, const string& v)
+{
+    wstring target;
+    if(v.empty())
+    {
+        return target;
+    }
+
+    //
+    // If there is a narrow string converter use it to convert to UTF8, otherwise the narrow
+    // string is already UTF8 encoded. In the latter case the input is read in place
+    // through 'utf8' instead of being copied.
+    //
+    string converted;
+    const string* utf8 = &v;
+    if(converter)
+    {
+        UTF8BufferI buffer;
+        Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer);
+        converted = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+        utf8 = &converted;
+    }
+
+    //
+    // If there is a wide string converter use fromUTF8 to convert to the wide string native encoding.
+    //
+    if(wConverter)
+    {
+        wConverter->fromUTF8(reinterpret_cast<const Byte*>(utf8->data()),
+                             reinterpret_cast<const Byte*>(utf8->data() + utf8->size()), target);
+    }
+    else
+    {
+        const Byte* sourceStart = reinterpret_cast<const Byte*>(utf8->data());
+
+        ConversionResult cr =
+            convertUTF8ToUTFWstring(sourceStart, sourceStart + utf8->size(), target, lenientConversion);
+
+        if(cr != conversionOK)
+        {
+            assert(cr == sourceExhausted || cr == sourceIllegal);
+
+            throw UTFConversionException(__FILE__, __LINE__,
+                                         cr == sourceExhausted ? partialCharacter : badEncoding);
+        }
+    }
+    return target;
+}
+
+//
+// Stores the conversion flags that toUTF8() and fromUTF8() pass to the
+// underlying UTF conversion functions.
+//
+UnicodeWstringConverter::UnicodeWstringConverter(ConversionFlags flags) :
+ _conversionFlags(flags)
+{
+}
+
+//
+// Encodes the wide characters in [sourceStart, sourceEnd) as UTF-8 into
+// 'buffer' and returns the pointer that follows the conversion output.
+//
+// Throws IllegalConversionException when the conversion reports an exhausted
+// or illegal source.
+//
+Byte*
+UnicodeWstringConverter::toUTF8(const wchar_t* sourceStart,
+                                const wchar_t* sourceEnd,
+                                UTF8Buffer& buffer) const
+{
+    //
+    // Space is requested from the buffer one chunk at a time. A chunk is as
+    // large as the number of source characters, but never smaller than
+    // 6 (== max bytes necessary to encode one Unicode character).
+    //
+    const size_t sourceLength = static_cast<size_t>(sourceEnd - sourceStart);
+    const size_t chunkSize = std::max<size_t>(sourceLength, 6);
+
+    Byte* out = buffer.getMoreBytes(chunkSize, 0);
+    Byte* outLimit = out + chunkSize;
+
+    //
+    // Keep converting, adding one more chunk each time the target runs out.
+    // NOTE(review): this relies on convertUTFWstringToUTF8 advancing the source
+    // and target pointers it is given - confirm against its declaration.
+    //
+    ConversionResult result =
+        convertUTFWstringToUTF8(sourceStart, sourceEnd, out, outLimit, _conversionFlags);
+    while(result == targetExhausted)
+    {
+        out = buffer.getMoreBytes(chunkSize, out);
+        outLimit = out + chunkSize;
+        result = convertUTFWstringToUTF8(sourceStart, sourceEnd, out, outLimit, _conversionFlags);
+    }
+
+    if(result == sourceExhausted)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source exhausted");
+    }
+    if(result == sourceIllegal)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source illegal");
+    }
+    if(result != conversionOK)
+    {
+        //
+        // No other result is expected here.
+        //
+        assert(0);
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
+    }
+    return out;
+}
+
+
+//
+// Decodes the UTF-8 bytes in [sourceStart, sourceEnd) into 'target'.
+// An empty range sets 'target' to the empty string.
+//
+// Throws IllegalConversionException when the conversion reports an exhausted
+// or illegal source.
+//
+void
+UnicodeWstringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd,
+                                  wstring& target) const
+{
+    if(sourceStart == sourceEnd)
+    {
+        target = L"";
+        return;
+    }
+
+    const ConversionResult outcome =
+        convertUTF8ToUTFWstring(sourceStart, sourceEnd, target, _conversionFlags);
+
+    if(outcome == conversionOK)
+    {
+        return;
+    }
+    if(outcome == sourceExhausted)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source exhausted");
+    }
+    if(outcome == sourceIllegal)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source illegal");
+    }
+
+    //
+    // No other result is expected here.
+    //
+    assert(0);
+    throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
+}
+
+#ifdef _WIN32
+//
+// Stores the code page identifier that toUTF8() and fromUTF8() pass to
+// MultiByteToWideChar and WideCharToMultiByte.
+//
+WindowsStringConverter::WindowsStringConverter(unsigned int cp) :
+ _cp(cp)
+{
+}
+
+//
+// Converts the narrow characters in [sourceStart, sourceEnd), encoded in code
+// page _cp, to UTF-8 in 'buffer'. The text is first converted to wide
+// characters with MultiByteToWideChar and then handed to
+// _unicodeWstringConverter.
+//
+// Throws IllegalConversionException, carrying the last system error text, when
+// MultiByteToWideChar fails for a reason other than a too-small buffer.
+//
+Byte*
+WindowsStringConverter::toUTF8(const char* sourceStart,
+                               const char* sourceEnd,
+                               UTF8Buffer& buffer) const
+{
+    const int sourceSize = static_cast<int>(sourceEnd - sourceStart);
+    if(sourceSize == 0)
+    {
+        return buffer.getMoreBytes(1, 0);
+    }
+
+    //
+    // The following code pages do not support the MB_ERR_INVALID_CHARS flag,
+    // see http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx
+    //
+    const bool invalidCharsFlagUnsupported =
+        _cp == 42 ||
+        (_cp >= 50220 && _cp <= 50222) ||
+        _cp == 50225 || _cp == 50227 || _cp == 50229 ||
+        (_cp >= 57002 && _cp <= 57011) ||
+        _cp == 65000;
+    DWORD flags = MB_ERR_INVALID_CHARS;
+    if(invalidCharsFlagUnsupported)
+    {
+        flags = 0;
+    }
+
+    //
+    // First convert to UTF-16. Start with room for sourceSize + 2 wide
+    // characters and double the buffer for as long as the call reports that
+    // it is too small.
+    //
+    ScopedArray<wchar_t> wide;
+    int capacity = sourceSize + 2;
+    int wideCount = 0;
+    for(;;)
+    {
+        wide.reset(new wchar_t[capacity]);
+        wideCount = MultiByteToWideChar(_cp, flags, sourceStart, sourceSize, wide.get(), capacity);
+        if(wideCount != 0 || GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+        {
+            break;
+        }
+        capacity *= 2;
+    }
+
+    if(wideCount == 0)
+    {
+        throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString());
+    }
+
+    //
+    // Then convert the UTF-16 text into UTF-8
+    //
+    wchar_t* const wideBegin = wide.get();
+    return _unicodeWstringConverter.toUTF8(wideBegin, wideBegin + wideCount, buffer);
+}
+
+//
+// Converts the UTF-8 bytes in [sourceStart, sourceEnd) to a narrow string in
+// code page _cp and stores it in 'target'. The text is first decoded to a wide
+// string by _unicodeWstringConverter and then encoded with
+// WideCharToMultiByte. An empty range sets 'target' to the empty string.
+//
+// Throws IllegalConversionException, carrying the last system error text, when
+// WideCharToMultiByte fails for a reason other than a too-small buffer.
+//
+void
+WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd,
+                                 string& target) const
+{
+    if(sourceStart == sourceEnd)
+    {
+        target = "";
+        return;
+    }
+
+    //
+    // First convert to wstring (UTF-16)
+    //
+    wstring wide;
+    _unicodeWstringConverter.fromUTF8(sourceStart, sourceEnd, wide);
+
+    //
+    // WC_ERR_INVALID_CHARS conversion flag is only supported with 65001 (UTF-8) and
+    // 54936 (GB18030 Simplified Chinese)
+    //
+    DWORD flags = 0;
+    if(_cp == 65001 || _cp == 54936)
+    {
+        flags = WC_ERR_INVALID_CHARS;
+    }
+
+    //
+    // And then to a multi-byte narrow string. Start with room for the UTF-8
+    // length plus 2 and double the buffer for as long as the call reports that
+    // it is too small.
+    //
+    ScopedArray<char> narrow;
+    int capacity = static_cast<int>(sourceEnd - sourceStart) + 2;
+    int narrowCount = 0;
+    for(;;)
+    {
+        narrow.reset(new char[capacity]);
+        narrowCount = WideCharToMultiByte(_cp, flags, wide.data(), static_cast<int>(wide.size()),
+                                          narrow.get(), capacity, 0, 0);
+        if(narrowCount != 0 || GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+        {
+            break;
+        }
+        capacity *= 2;
+    }
+
+    if(narrowCount == 0)
+    {
+        throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString());
+    }
+
+    target.assign(narrow.get(), narrowCount);
+}
+
+#endif