From 7ce53a467f83a526d33b39280943814f452a97a8 Mon Sep 17 00:00:00 2001 From: Bernard Normier Date: Mon, 9 Jan 2006 21:32:52 +0000 Subject: Reimplement Unicode.h ; new StringConverter class --- cpp/src/IceUtil/Unicode.cpp | 282 +++++++++++++++++++++++--------------------- 1 file changed, 148 insertions(+), 134 deletions(-) (limited to 'cpp/src/IceUtil/Unicode.cpp') diff --git a/cpp/src/IceUtil/Unicode.cpp b/cpp/src/IceUtil/Unicode.cpp index 724139f25bd..287ddbe1e7a 100644 --- a/cpp/src/IceUtil/Unicode.cpp +++ b/cpp/src/IceUtil/Unicode.cpp @@ -8,151 +8,169 @@ // ********************************************************************** #include -#include - -#if defined(_WIN32) -# define SIZEOF_WCHAR_T 2 -#elif (defined(__sun) && defined(__sparc)) || \ - ((defined(__linux) || defined(__FreeBSD__)) && (defined(__i386) || defined(__x86_64))) -# define SIZEOF_WCHAR_T 4 -#endif +#include using namespace std; +using namespace IceUtil; -string -IceUtil::wstringToString(const wstring& str) +namespace +{ +// +// Helper class, base never defined +// Usage: WstringHelper::toUTF8 and fromUTF8. +// +template +struct WstringHelper { - string result; - result.reserve(str.length() * 2); + static ConversionResult toUTF8( + const wchar_t*& sourceStart, const wchar_t* sourceEnd, + Byte*& targetStart, Byte* targetEnd, ConversionFlags flags); - for(unsigned int i = 0; i < str.length(); ++i) + static ConversionResult fromUTF8( + const Byte*& sourceStart, const Byte* sourceEnd, + wchar_t*& targetStart, wchar_t* targetEnd, ConversionFlags flags); +}; + +template<> +struct WstringHelper<2> +{ + static ConversionResult toUTF8( + const wchar_t*& sourceStart, const wchar_t* sourceEnd, + Byte*& targetStart, Byte* targetEnd, ConversionFlags flags) { - wchar_t wc; - wc = str[i]; - - if(wc < 0x80) - { - result += static_cast(wc); - } - else if(wc < 0x800) - { - result += 0xc0 | (wc>>6); - result += 0x80 | (wc & 0x3f); - } - else if(wc < 0x10000) - { - result += 0xe0 | (wc>>12); - result += 0x80 | ((wc>>6) & 0x3f); - result += 0x80 | (wc & 0x3f); - } -#if SIZEOF_WCHAR_T >= 4 - else if(wc < 0x10FFFF) - { - result += 0xf0 | (wc>>18); - result += 0x80 | ((wc>>12) & 0x3f); - result += 0x80 | ((wc>>6) & 0x3f); - result += 0x80 | (wc & 0x3f); - } -#endif - else - { - return result; // Error, not encodable. - } + return ConvertUTF16toUTF8( + reinterpret_cast(&sourceStart), + reinterpret_cast(sourceEnd), + &targetStart, targetEnd, flags); } + + static ConversionResult fromUTF8( + const Byte*& sourceStart, const Byte* sourceEnd, + wchar_t*& targetStart, wchar_t* targetEnd, ConversionFlags flags) + { + return ConvertUTF8toUTF16( + &sourceStart, sourceEnd, + reinterpret_cast(&targetStart), + reinterpret_cast(targetEnd), flags); + } +}; - return result; +template<> +struct WstringHelper<4> +{ + static ConversionResult toUTF8( + const wchar_t*& sourceStart, const wchar_t* sourceEnd, + Byte*& targetStart, Byte* targetEnd, ConversionFlags flags) + { + return ConvertUTF32toUTF8( + reinterpret_cast(&sourceStart), + reinterpret_cast(sourceEnd), + &targetStart, targetEnd, flags); + } + + static ConversionResult fromUTF8( + const Byte*& sourceStart, const Byte* sourceEnd, + wchar_t*& targetStart, wchar_t* targetEnd, ConversionFlags flags) + { + return ConvertUTF8toUTF32( + &sourceStart, sourceEnd, + reinterpret_cast(&targetStart), + reinterpret_cast(targetEnd), flags); + } +}; } -wstring -IceUtil::stringToWstring(const string& str) +// +// convertXXX functions +// + +ConversionResult +IceUtil::convertUTFWstringToUTF8( + const wchar_t*& sourceStart, const wchar_t* sourceEnd, + Byte*& targetStart, Byte* targetEnd, ConversionFlags flags) { - wstring result; - result.reserve(str.length()); + return WstringHelper::toUTF8( + sourceStart, sourceEnd, targetStart, targetEnd, flags); +} + +ConversionResult +IceUtil::convertUTF8ToUTFWstring( + const Byte*& sourceStart, const Byte* sourceEnd, + wchar_t*& targetStart, wchar_t* targetEnd, ConversionFlags flags) +{ + return WstringHelper::fromUTF8( + sourceStart, sourceEnd, targetStart, targetEnd, flags); +} - unsigned int len; - for(unsigned int i = 0; i < str.length(); i += len) +ConversionResult +IceUtil::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd, + std::wstring& target, ConversionFlags flags) +{ + // + // Could be reimplemented without this temporary wchar_t buffer + // + size_t size = static_cast(sourceEnd - sourceStart); + wchar_t* outBuf = new wchar_t[size]; + wchar_t* targetStart = outBuf; + wchar_t* targetEnd = targetStart + size; + + ConversionResult result = + convertUTF8ToUTFWstring(sourceStart, sourceEnd, targetStart, + targetEnd, flags); + + if(result == conversionOK) { - unsigned char c = str[i]; - wchar_t wc; - int minval; - - if(c < 0x80) - { - wc = c; - len = 1; - minval = 0; - } - else if(c < 0xc0) // Lead byte must not be 10xxxxxx - { - return result; // Error, not encodable. - } - else if(c < 0xe0) // 110xxxxx - { - wc = c & 0x1f; - len = 2; - minval = 0x80; - } - else if(c < 0xf0) // 1110xxxx - { - wc = c & 0xf; - len = 3; - minval = 0x800; - } -#if SIZEOF_WCHAR_T >= 4 - else if(c < 0xf8) // 11110xxx - { - wc = c & 7; - len = 4; - minval = 0x10000; - } - else if(c < 0xfc) // 111110xx - { - // Length 5 and 6 is declared invalid in Unicode 3.1 and ISO 10646:2003. - wc = c & 3; - len = 5; - minval = 0x110000; - } - else if(c < 0xfe) // 1111110x - { - // Length 5 and 6 is declared invalid in Unicode 3.1 and ISO 10646:2003. - wc = c & 1; - len = 6; - minval = 0x4000000; - } -#endif - else - { - return result; // Error, not encodable. - } - - if(i + len - 1 < str.length()) - { - for(unsigned int j = 1; j < len; ++j) - { - if((str[i + j] & 0xc0) != 0x80) // All other bytes must be 10xxxxxx - { - return result; // Error, not encodable. - } - - wc <<= 6; - wc |= str[i + j] & 0x3f; - } - - if(wc < minval) - { - return result; // Error, non-shortest form. - } - else - { - result += wc; - } - } - else - { - return result; // Error, not encodable. - } + std::wstring s(outBuf, static_cast(targetStart - outBuf)); + s.swap(target); + } + delete[] outBuf; + return result; +} + + +// +// wstringToString and stringToWstring +// + +string +IceUtil::wstringToString(const wstring& wstr) +{ + string target; + + size_t size = wstr.size() * 3 * (sizeof(wchar_t) / 2); + + Byte* outBuf = new Byte[size]; + Byte* targetStart = outBuf; + Byte* targetEnd = outBuf + size; + + const wchar_t* sourceStart = wstr.data(); + + ConversionResult result = + convertUTFWstringToUTF8( + sourceStart, sourceStart + wstr.size(), + targetStart, targetEnd, lenientConversion); + + if(result == conversionOK) + { + string s(reinterpret_cast(outBuf), + static_cast(targetStart - outBuf)); + s.swap(target); } + delete[] outBuf; + return target; +} +wstring +IceUtil::stringToWstring(const string& str) +{ + wstring result; + const Byte* sourceStart = reinterpret_cast(str.data()); + + convertUTF8ToUTFWstring(sourceStart, sourceStart + str.size(), + result, lenientConversion); + // + // TODO: check the ConversionResult and do something with it! + // return result; } @@ -166,28 +184,24 @@ IceUtil::stringToWstring(const string& str) string IceUtil::wstringToString(const basic_string<__wchar_t>& str) { - assert(sizeof(__wchar_t) == SIZEOF_WCHAR_T); return wstringToString(*reinterpret_cast(&str)); } basic_string<__wchar_t> IceUtil::stringToNativeWstring(const string& str) { - assert(sizeof(__wchar_t) == SIZEOF_WCHAR_T); return reinterpret_cast& >(stringToWstring(str)); } # else string IceUtil::wstringToString(const basic_string& str) { - assert(sizeof(__wchar_t) == SIZEOF_WCHAR_T); return wstringToString(*reinterpret_cast(&str)); } basic_string IceUtil::stringToTypedefWstring(const string& str) { - assert(sizeof(__wchar_t) == SIZEOF_WCHAR_T); return reinterpret_cast& >(stringToWstring(str)); } -- cgit v1.2.3