ICE-4851 - Use wstrings for input and output data that contain non-ASCII characters?

author: Jose <jose@zeroc.com> 2014-05-02 19:56:38 +0200
committer: Jose <jose@zeroc.com> 2014-05-02 19:56:38 +0200
commit: 1161c5817059464ab511632c0ce5d14593ced1a3 (patch)
tree: 51bbcdf2a4ea43c430312157350bb4271bc3f40d /cpp/src/IceUtil/StringConverter.cpp
parent: Update .gitignore files (diff)
download: ice-1161c5817059464ab511632c0ce5d14593ced1a3.tar.bz2
ice-1161c5817059464ab511632c0ce5d14593ced1a3.tar.xz
ice-1161c5817059464ab511632c0ce5d14593ced1a3.zip
1 files changed, 478 insertions, 0 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp
new file mode 100644
index 00000000000..2a57b719695
--- /dev/null
+++ b/cpp/src/IceUtil/StringConverter.cpp
@@ -0,0 +1,478 @@
+
+#include <IceUtil/StringConverter.h>
+#include <IceUtil/MutexPtrLock.h>
+#include <IceUtil/Mutex.h>
+#include <IceUtil/ScopedArray.h>
+#include <IceUtil/StringUtil.h>
+
+using namespace IceUtil;
+using namespace IceUtilInternal;
+using namespace std;
+
+namespace
+{
+
+//
+// Process-wide converter state. Every accessor in this file takes
+// processStringConverterMutex before reading or writing either handle.
+//
+IceUtil::Mutex* processStringConverterMutex = 0;
+IceUtil::StringConverterPtr processStringConverter;
+IceUtil::WstringConverterPtr processWstringConverter;
+
+//
+// Allocates processStringConverterMutex on construction and releases it,
+// leaving the pointer at 0, on destruction.
+//
+class Init
+{
+public:
+
+    Init()
+    {
+        processStringConverterMutex = new IceUtil::Mutex;
+    }
+
+    ~Init()
+    {
+        IceUtil::Mutex* const doomed = processStringConverterMutex;
+        processStringConverterMutex = 0;
+        delete doomed;
+    }
+};
+
+//
+// The single Init instance; its lifetime bounds the lifetime of the mutex.
+//
+Init init;
+
+//
+// Name returned by IllegalConversionException::ice_name().
+//
+const char* __IceUtil__IllegalConversionException_name = "IceUtil::IllegalConversionException";
+
+}
+
+//
+// Constructs an exception that records only the source location (file and
+// line); no reason text is supplied.
+//
+IllegalConversionException::IllegalConversionException(const char* file, int line) :
+    Exception(file, line)
+{
+}
+
+//
+// Constructs an exception that records the source location together with a
+// textual reason describing the failed conversion.
+//
+IllegalConversionException::IllegalConversionException(const char* file, int line, const string& reason) :
+    Exception(file, line),
+    _reason(reason)
+{
+}
+
+//
+// Destructor; declared non-throwing and performs no explicit work.
+//
+IllegalConversionException::~IllegalConversionException() throw()
+{
+}
+
+//
+// Returns the exception's name, "IceUtil::IllegalConversionException".
+//
+string
+IllegalConversionException::ice_name() const
+{
+    const string name(__IceUtil__IllegalConversionException_name);
+    return name;
+}
+
+//
+// Writes the base-class description to out, followed by ": " and the
+// reason string.
+//
+void
+IllegalConversionException::ice_print(ostream& out) const
+{
+    Exception::ice_print(out);
+    out << ": ";
+    out << _reason;
+}
+
+//
+// Returns a heap-allocated copy of this exception, created with new.
+//
+IceUtil::IllegalConversionException*
+IllegalConversionException::ice_clone() const
+{
+    IllegalConversionException* const copy = new IllegalConversionException(*this);
+    return copy;
+}
+
+//
+// Throws a copy of this exception with its static type,
+// IllegalConversionException.
+//
+void
+IllegalConversionException::ice_throw() const
+{
+    const IllegalConversionException& self = *this;
+    throw self;
+}
+
+//
+// Returns a copy of the reason text supplied at construction.
+//
+string
+IllegalConversionException::reason() const
+{
+    return string(_reason);
+}
+
+
+namespace
+{
+
+//
+// UTF8Buffer implementation backed by one block managed with
+// malloc/realloc/free. The block is released by reset() and by the
+// destructor.
+//
+// The class owns a raw pointer, so copying is disabled (declared private
+// and left undefined) to rule out a double free.
+//
+class UTF8BufferI : public UTF8Buffer
+{
+public:
+
+    UTF8BufferI() :
+        _buffer(0),
+        _offset(0)
+    {
+    }
+
+    ~UTF8BufferI()
+    {
+        free(_buffer);
+    }
+
+    //
+    // Makes room for howMany more bytes and returns a pointer to the first
+    // of them.
+    //
+    // On the first call (no block allocated yet) firstUnused is ignored and
+    // the start of a fresh block is returned. On later calls firstUnused
+    // must point into the current block, just past the bytes already
+    // written; the block is grown to hold those bytes plus howMany more and
+    // the returned pointer addresses the same logical position in the
+    // (possibly moved) block.
+    //
+    // Throws std::bad_alloc if the allocation fails. In that case the
+    // previously allocated block is left intact and is still freed by
+    // reset() or the destructor.
+    //
+    Byte* getMoreBytes(size_t howMany, Byte* firstUnused)
+    {
+        if(_buffer == 0)
+        {
+            Byte* const fresh = static_cast<Byte*>(malloc(howMany));
+            if(!fresh)
+            {
+                throw std::bad_alloc();
+            }
+            _buffer = fresh;
+        }
+        else
+        {
+            assert(firstUnused != 0);
+            const size_t used = static_cast<size_t>(firstUnused - _buffer);
+
+            //
+            // Keep the result of realloc in a temporary: on failure realloc
+            // returns 0 but does not free the original block, so overwriting
+            // _buffer directly would leak it.
+            //
+            Byte* const grown = static_cast<Byte*>(realloc(_buffer, used + howMany));
+            if(!grown)
+            {
+                throw std::bad_alloc();
+            }
+            _buffer = grown;
+            _offset = used;
+        }
+        return _buffer + _offset;
+    }
+
+    //
+    // Returns the start of the block, or 0 if nothing has been allocated.
+    //
+    Byte* getBuffer()
+    {
+        return _buffer;
+    }
+
+    //
+    // Frees the block and returns the object to its initial, empty state.
+    //
+    void reset()
+    {
+        free(_buffer);
+        _buffer = 0;
+        _offset = 0;
+    }
+
+private:
+
+    //
+    // Not implemented: copying would make two objects free the same block.
+    //
+    UTF8BufferI(const UTF8BufferI&);
+    UTF8BufferI& operator=(const UTF8BufferI&);
+
+    IceUtil::Byte* _buffer;
+    size_t _offset;
+};
+
+}
+
+//
+// Returns the process-wide narrow string converter. The handle is read
+// while holding processStringConverterMutex.
+//
+StringConverterPtr
+IceUtil::getProcessStringConverter()
+{
+    IceUtilInternal::MutexPtrLock<IceUtil::Mutex> sync(processStringConverterMutex);
+    StringConverterPtr current = processStringConverter;
+    return current;
+}
+
+//
+// Replaces the process-wide narrow string converter with the given one.
+// The assignment is made while holding processStringConverterMutex.
+//
+void
+IceUtil::setProcessStringConverter(const StringConverterPtr& converter)
+{
+    IceUtilInternal::MutexPtrLock<IceUtil::Mutex> sync(processStringConverterMutex);
+    processStringConverter = converter;
+}
+
+//
+// Returns the process-wide wide string converter. The handle is read while
+// holding processStringConverterMutex.
+//
+WstringConverterPtr
+IceUtil::getProcessWstringConverter()
+{
+    IceUtilInternal::MutexPtrLock<IceUtil::Mutex> sync(processStringConverterMutex);
+    WstringConverterPtr current = processWstringConverter;
+    return current;
+}
+
+//
+// Replaces the process-wide wide string converter with the given one. The
+// assignment is made while holding processStringConverterMutex.
+//
+void
+IceUtil::setProcessWstringConverter(const WstringConverterPtr& converter)
+{
+    IceUtilInternal::MutexPtrLock<IceUtil::Mutex> sync(processStringConverterMutex);
+    processWstringConverter = converter;
+}
+
+//
+// Converts str from the native narrow encoding to UTF-8 using converter.
+// When converter is null or str is empty, str is returned unchanged.
+//
+string
+IceUtil::nativeToUTF8(const IceUtil::StringConverterPtr& converter, const string& str)
+{
+    if(!converter)
+    {
+        return str;
+    }
+    if(str.empty())
+    {
+        return str;
+    }
+
+    //
+    // The converter writes into utf8 and returns a pointer just past the
+    // last byte it produced.
+    //
+    UTF8BufferI utf8;
+    const char* const first = str.data();
+    Byte* const end = converter->toUTF8(first, first + str.size(), utf8);
+    Byte* const begin = utf8.getBuffer();
+    return string(reinterpret_cast<const char*>(begin), end - begin);
+}
+
+//
+// Converts the UTF-8 string str to the native narrow encoding using
+// converter. When converter is null or str is empty, str is returned
+// unchanged.
+//
+string
+IceUtil::UTF8ToNative(const IceUtil::StringConverterPtr& converter, const string& str)
+{
+    if(!converter)
+    {
+        return str;
+    }
+    if(str.empty())
+    {
+        return str;
+    }
+
+    const Byte* const first = reinterpret_cast<const Byte*>(str.data());
+    const Byte* const last = reinterpret_cast<const Byte*>(str.data() + str.size());
+
+    string native;
+    converter->fromUTF8(first, last, native);
+    return native;
+}
+
+//
+// Converts the wide string v to a narrow string in the native narrow
+// encoding.
+//
+// The input is first converted to UTF-8: with wConverter when one is
+// supplied, otherwise with convertUTFWstringToUTF8 in lenient mode. If
+// converter is non-null the UTF-8 bytes are then passed through
+// converter->fromUTF8; otherwise the UTF-8 string itself is returned. An
+// empty input yields an empty string without calling either converter.
+//
+// Throws UTFConversionException when the built-in conversion does not
+// report conversionOK. Exceptions thrown by the supplied converters
+// propagate unchanged.
+//
+string
+IceUtil::wnativeToNative(const StringConverterPtr& converter, const WstringConverterPtr& wConverter, const wstring& v)
+{
+    string target;
+    if(v.empty())
+    {
+        return target;
+    }
+
+    //
+    // First convert to UTF8 narrow string.
+    //
+    if(wConverter)
+    {
+        UTF8BufferI buffer;
+        Byte* last = wConverter->toUTF8(v.data(), v.data() + v.size(), buffer);
+        target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+    }
+    else
+    {
+        //
+        // Reserve four bytes per wide character. The buffer is held by a
+        // ScopedArray so that it is released on every exit path, including
+        // when building the result string throws.
+        //
+        const size_t size = v.size() * 4;
+        ScopedArray<Byte> outBuf;
+        outBuf.reset(new Byte[size]);
+
+        Byte* targetStart = outBuf.get();
+        Byte* targetEnd = outBuf.get() + size;
+
+        const wchar_t* sourceStart = v.data();
+        const wchar_t* sourceEnd = sourceStart + v.size();
+
+        ConversionResult cr =
+            convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd, lenientConversion);
+
+        if(cr != conversionOK)
+        {
+            assert(cr == sourceExhausted || cr == sourceIllegal);
+            throw UTFConversionException(__FILE__, __LINE__,
+                                         cr == sourceExhausted ? partialCharacter : badEncoding);
+        }
+
+        //
+        // The length computation relies on the conversion having advanced
+        // targetStart past the last byte it wrote.
+        //
+        target = string(reinterpret_cast<char*>(outBuf.get()), static_cast<size_t>(targetStart - outBuf.get()));
+    }
+
+    //
+    // If narrow string converter is present convert to the native narrow string encoding, otherwise
+    // native narrow string encoding is UTF8 and we are done.
+    //
+    if(converter)
+    {
+        string tmp;
+        converter->fromUTF8(reinterpret_cast<const Byte*>(target.data()),
+                            reinterpret_cast<const Byte*>(target.data() + target.size()), tmp);
+        tmp.swap(target);
+    }
+    return target;
+}
+
+//
+// Converts the narrow string v (native narrow encoding) to a wide string.
+//
+// The input is first brought to UTF-8: through converter->toUTF8 when a
+// narrow converter is supplied, otherwise it is taken to be UTF-8 already.
+// The UTF-8 bytes are then decoded with wConverter->fromUTF8 when a wide
+// converter is supplied, otherwise with convertUTF8ToUTFWstring in lenient
+// mode. An empty input yields an empty wide string.
+//
+// Throws UTFConversionException when the built-in decoding does not report
+// conversionOK.
+//
+wstring
+IceUtil::nativeToWnative(const StringConverterPtr& converter, const WstringConverterPtr& wConverter, const string& v)
+{
+    wstring wide;
+    if(v.empty())
+    {
+        return wide;
+    }
+
+    //
+    // Step 1: obtain the UTF-8 form of the input.
+    //
+    string utf8;
+    if(!converter)
+    {
+        utf8 = v;
+    }
+    else
+    {
+        UTF8BufferI scratch;
+        Byte* const end = converter->toUTF8(v.data(), v.data() + v.size(), scratch);
+        Byte* const begin = scratch.getBuffer();
+        utf8 = string(reinterpret_cast<const char*>(begin), end - begin);
+    }
+
+    //
+    // Step 2: decode the UTF-8 bytes into the wide string.
+    //
+    const Byte* sourceStart = reinterpret_cast<const Byte*>(utf8.data());
+    const Byte* const sourceEnd = sourceStart + utf8.size();
+
+    if(wConverter)
+    {
+        wConverter->fromUTF8(sourceStart, sourceEnd, wide);
+        return wide;
+    }
+
+    const ConversionResult cr = convertUTF8ToUTFWstring(sourceStart, sourceEnd, wide, lenientConversion);
+    if(cr != conversionOK)
+    {
+        assert(cr == sourceExhausted || cr == sourceIllegal);
+
+        throw UTFConversionException(__FILE__, __LINE__,
+                                     cr == sourceExhausted ? partialCharacter : badEncoding);
+    }
+    return wide;
+}
+
+//
+// Stores the conversion flags that toUTF8 and fromUTF8 later pass to the
+// underlying UTF conversion routines.
+//
+UnicodeWstringConverter::UnicodeWstringConverter(ConversionFlags flags) :
+    _conversionFlags(flags)
+{
+}
+
+//
+// Encodes the wide characters in [sourceStart, sourceEnd) as UTF-8 into
+// buffer and returns the pointer left in targetStart after the final
+// conversion call (callers in this file treat it as the end of the output).
+//
+// Throws IllegalConversionException when the conversion reports
+// sourceExhausted, sourceIllegal or any other unexpected result.
+//
+Byte*
+UnicodeWstringConverter::toUTF8(const wchar_t* sourceStart,
+                                const wchar_t* sourceEnd,
+                                UTF8Buffer& buffer) const
+{
+    //
+    // Space is requested in chunks: the larger of the source length in
+    // characters and 6 (== max bytes necessary to encode one Unicode
+    // character).
+    //
+    const size_t sourceLength = static_cast<size_t>(sourceEnd - sourceStart);
+    const size_t chunkSize = std::max<size_t>(sourceLength, 6);
+
+    Byte* targetStart = buffer.getMoreBytes(chunkSize, 0);
+    Byte* targetEnd = targetStart + chunkSize;
+
+    //
+    // Keep converting, asking the buffer for another chunk each time the
+    // target space runs out.
+    //
+    ConversionResult result;
+    for(;;)
+    {
+        result = convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd, _conversionFlags);
+        if(result != targetExhausted)
+        {
+            break;
+        }
+        targetStart = buffer.getMoreBytes(chunkSize, targetStart);
+        targetEnd = targetStart + chunkSize;
+    }
+
+    if(result == sourceExhausted)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source exhausted");
+    }
+    if(result == sourceIllegal)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "wide string source illegal");
+    }
+    if(result != conversionOK)
+    {
+        assert(0);
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
+    }
+    return targetStart;
+}
+
+
+//
+// Decodes the UTF-8 bytes in [sourceStart, sourceEnd) into target. An
+// empty range sets target to the empty string.
+//
+// Throws IllegalConversionException when the conversion reports
+// sourceExhausted, sourceIllegal or any other unexpected result.
+//
+void
+UnicodeWstringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd,
+                                  wstring& target) const
+{
+    if(sourceStart == sourceEnd)
+    {
+        target.clear();
+        return;
+    }
+
+    const ConversionResult result =
+        convertUTF8ToUTFWstring(sourceStart, sourceEnd, target, _conversionFlags);
+
+    if(result == conversionOK)
+    {
+        return;
+    }
+    if(result == sourceExhausted)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source exhausted");
+    }
+    if(result == sourceIllegal)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "UTF-8 string source illegal");
+    }
+
+    assert(0);
+    throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
+}
+
+#ifdef _WIN32
+//
+// Stores the code page identifier that toUTF8 and fromUTF8 pass to
+// MultiByteToWideChar and WideCharToMultiByte.
+//
+WindowsStringConverter::WindowsStringConverter(unsigned int cp) :
+    _cp(cp)
+{
+}
+
+//
+// Converts the narrow characters in [sourceStart, sourceEnd), encoded in
+// code page _cp, to UTF-8 in buffer. The text is first converted to wide
+// characters with MultiByteToWideChar and then encoded as UTF-8 by
+// _unicodeWstringConverter, whose return value is passed back.
+//
+// For an empty range a single byte is requested from buffer and its
+// address is returned.
+//
+// Throws IllegalConversionException, carrying the system error text, when
+// MultiByteToWideChar fails for a reason other than a too-small buffer.
+//
+Byte*
+WindowsStringConverter::toUTF8(const char* sourceStart,
+                               const char* sourceEnd,
+                               UTF8Buffer& buffer) const
+{
+    const int sourceSize = static_cast<int>(sourceEnd - sourceStart);
+    if(sourceSize == 0)
+    {
+        return buffer.getMoreBytes(1, 0);
+    }
+
+    //
+    // The following code pages doesn't support MB_ERR_INVALID_CHARS flag
+    // see http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx
+    //
+    const bool withoutInvalidCharsCheck =
+        _cp == 50220 || _cp == 50221 || _cp == 50222 ||
+        _cp == 50225 || _cp == 50227 || _cp == 50229 ||
+        _cp == 65000 || _cp == 42 || (_cp >= 57002 && _cp <= 57011);
+    const DWORD flags = withoutInvalidCharsCheck ? 0 : MB_ERR_INVALID_CHARS;
+
+    //
+    // Step 1: convert to UTF-16. Start with room for sourceSize + 2 wide
+    // characters and double the capacity for as long as the API reports
+    // ERROR_INSUFFICIENT_BUFFER.
+    //
+    ScopedArray<wchar_t> wbuffer;
+    int capacity = sourceSize + 2;
+    int writtenWchar = 0;
+    for(;;)
+    {
+        wbuffer.reset(new wchar_t[capacity]);
+        writtenWchar = MultiByteToWideChar(_cp, flags, sourceStart, sourceSize, wbuffer.get(), capacity);
+        if(writtenWchar != 0 || GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+        {
+            break;
+        }
+        capacity = 2 * capacity;
+    }
+
+    if(writtenWchar == 0)
+    {
+        throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString());
+    }
+
+    //
+    // Step 2: convert the UTF-16 wbuffer into UTF-8.
+    //
+    const wchar_t* const wideBegin = wbuffer.get();
+    return _unicodeWstringConverter.toUTF8(wideBegin, wideBegin + writtenWchar, buffer);
+}
+
+//
+// Converts the UTF-8 bytes in [sourceStart, sourceEnd) to a narrow string
+// in code page _cp and stores it in target. The bytes are first decoded to
+// a wide string by _unicodeWstringConverter and then converted with
+// WideCharToMultiByte. An empty range sets target to the empty string.
+//
+// Throws IllegalConversionException, carrying the system error text, when
+// WideCharToMultiByte fails for a reason other than a too-small buffer.
+//
+void
+WindowsStringConverter::fromUTF8(const Byte* sourceStart, const Byte* sourceEnd,
+                                 string& target) const
+{
+    if(sourceStart == sourceEnd)
+    {
+        target = "";
+        return;
+    }
+
+    //
+    // Step 1: decode to a wstring (UTF-16).
+    //
+    wstring wide;
+    _unicodeWstringConverter.fromUTF8(sourceStart, sourceEnd, wide);
+
+    //
+    // WC_ERR_INVALID_CHARS conversion flag is only supported with 65001 (UTF-8) and
+    // 54936 (GB18030 Simplified Chinese)
+    //
+    const bool supportsInvalidCharsCheck = (_cp == 65001 || _cp == 54936);
+    const DWORD flags = supportsInvalidCharsCheck ? WC_ERR_INVALID_CHARS : 0;
+
+    //
+    // Step 2: convert to a multi-byte narrow string. Start with room for the
+    // UTF-8 length plus 2 bytes and double the capacity for as long as the
+    // API reports ERROR_INSUFFICIENT_BUFFER.
+    //
+    ScopedArray<char> narrow;
+    int capacity = static_cast<int>(sourceEnd - sourceStart) + 2;
+    int writtenChar = 0;
+    for(;;)
+    {
+        narrow.reset(new char[capacity]);
+        writtenChar = WideCharToMultiByte(_cp, flags, wide.data(), static_cast<int>(wide.size()),
+                                          narrow.get(), capacity, 0, 0);
+        if(writtenChar != 0 || GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+        {
+            break;
+        }
+        capacity = 2 * capacity;
+    }
+
+    if(writtenChar == 0)
+    {
+        throw IllegalConversionException(__FILE__, __LINE__, IceUtilInternal::lastErrorToString());
+    }
+
+    target.assign(narrow.get(), writtenChar);
+}
+
+#endif
author	Jose <jose@zeroc.com>	2014-05-02 19:56:38 +0200
committer	Jose <jose@zeroc.com>	2014-05-02 19:56:38 +0200
commit	1161c5817059464ab511632c0ce5d14593ced1a3 (patch)
tree	51bbcdf2a4ea43c430312157350bb4271bc3f40d /cpp/src/IceUtil/StringConverter.cpp
parent	Update .gitignore files (diff)
download	ice-1161c5817059464ab511632c0ce5d14593ced1a3.tar.bz2 ice-1161c5817059464ab511632c0ce5d14593ced1a3.tar.xz ice-1161c5817059464ab511632c0ce5d14593ced1a3.zip