diff options
Diffstat (limited to 'cpp/test/IceUtil/unicode/Client.cpp')
-rw-r--r-- | cpp/test/IceUtil/unicode/Client.cpp | 365 |
1 files changed, 198 insertions, 167 deletions
diff --git a/cpp/test/IceUtil/unicode/Client.cpp b/cpp/test/IceUtil/unicode/Client.cpp index 3a48db3da86..b340252d010 100644 --- a/cpp/test/IceUtil/unicode/Client.cpp +++ b/cpp/test/IceUtil/unicode/Client.cpp @@ -42,21 +42,20 @@ main(int argc, char* argv[]) if(argc > 1) { -#ifdef _WIN32 +#ifdef _WIN32 # ifdef __MINGW32__ - dir = argv[1]; + dir = argv[1]; # else - dir = IceUtil::wstringToString(argv[1]); + dir = IceUtil::wstringToString(argv[1]); # endif - dir += "\\"; + dir += "\\"; #else - dir = argv[1]; - dir += "/"; -#endif + dir = argv[1]; + dir += "/"; +#endif } - ostringstream os; os << "utf" << sizeof(wchar_t) * 8; #ifdef ICE_LITTLE_ENDIAN @@ -66,177 +65,209 @@ main(int argc, char* argv[]) #endif string wstringEncoding = os.str(); string wcoeurFile = string("coeur.") + wstringEncoding; - + { - cout << "testing UTF-8 to wstring (" << wstringEncoding << ") conversion... "; - ifstream is((dir + "coeur.utf8").c_str()); - test(is.good()); - ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary); - test(bis.good()); - - int lineNumber = 0; - - do - { - string line; - getline(is, line, '\n'); - test(isLegalUTF8Sequence(reinterpret_cast<const Byte*>(line.data()), - reinterpret_cast<const Byte*>(line.data() + line.size()))); - lineNumber++; - wstring wline = stringToWstring(line); - - for(size_t i = 0; i < wline.length(); ++i) - { - wchar_t wc = wline[i]; - const char* buffer = reinterpret_cast<char*>(&wc); - for(size_t j = 0; j < sizeof(wchar_t); ++j) - { - test(bis.good()); - char c; - bis.get(c); - if(buffer[j] != c) - { - cerr << "Error at line " << lineNumber << " column " << i << endl; - cerr << "buffer[j] == " << hex << (int)static_cast<unsigned char>(buffer[j]) << endl; - cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl; - } - test(buffer[j] == c); - } - } - // - // Skip newline character (Unix-style newline) - // - if(is.good()) - { - for(size_t j = 0; j < sizeof(wchar_t); ++j) - { - test(bis.good()); - char c; - bis.get(c); - } - } - else - { - char c; - bis.get(c); - test(bis.eof()); - } - } while(is.good()); - - cout << "ok" << endl; + cout << "testing UTF-8 to wstring (" << wstringEncoding << ") conversion... "; + ifstream is((dir + "coeur.utf8").c_str()); + test(is.good()); + ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary); + test(bis.good()); + + int lineNumber = 0; + + do + { + string line; + getline(is, line, '\n'); + lineNumber++; + wstring wline = stringToWstring(line); + + for(size_t i = 0; i < wline.length(); ++i) + { + wchar_t wc = wline[i]; + const char* buffer = reinterpret_cast<char*>(&wc); + for(size_t j = 0; j < sizeof(wchar_t); ++j) + { + test(bis.good()); + char c; + bis.get(c); + if(buffer[j] != c) + { + cerr << "Error at line " << lineNumber << " column " << i << endl; + cerr << "buffer[j] == " << hex << (int)static_cast<unsigned char>(buffer[j]) << endl; + cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl; + } + test(buffer[j] == c); + } + } + // + // Skip newline character (Unix-style newline) + // + if(is.good()) + { + for(size_t j = 0; j < sizeof(wchar_t); ++j) + { + test(bis.good()); + char c; + bis.get(c); + } + } + else + { + char c; + bis.get(c); + test(bis.eof()); + } + } while(is.good()); + + cout << "ok" << endl; + } + + { + cout << "testing wstring (" << wstringEncoding << ") to UTF-8 conversion... "; + + ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary); + test(bis.good()); + + wstring ws; + char c; + + do + { + wchar_t wc; + char* buffer = reinterpret_cast<char*>(&wc); + + for(size_t j = 0; j < sizeof(wchar_t); ++j) + { + if(!bis.good()) + { + break; + } + bis.get(c); + buffer[j] = c; + } + + if(bis.good()) + { + ws.push_back(wc); + } + } while(bis.good()); + + string s = wstringToString(ws); + + ifstream nbis((dir + "coeur.utf8").c_str(), ios_base::binary); + test(nbis.good()); + + for(size_t i = 0; i < s.size(); ++i) + { + test(nbis.good()); + nbis.get(c); + char ci = s[i]; + + if(c != ci) + { + cerr << "i == " << i << endl; + cerr << "ci == " << hex << (int)static_cast<unsigned char>(ci) << endl; + cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl; + } + test(c == s[i]); + } + test(!nbis.eof()); + nbis.get(c); + test(nbis.eof()); + + cout << "ok" << endl; } { - cout << "testing wstring (" << wstringEncoding << ") to UTF-8 conversion... "; - - ifstream bis((dir + wcoeurFile).c_str(), ios_base::binary); - test(bis.good()); - - wstring ws; - char c; - - do - { - wchar_t wc; - char* buffer = reinterpret_cast<char*>(&wc); - - for(size_t j = 0; j < sizeof(wchar_t); ++j) - { - if(!bis.good()) - { - break; - } - bis.get(c); - buffer[j] = c; - } - - if(bis.good()) - { - ws.push_back(wc); - } - } while(bis.good()); - - string s = wstringToString(ws); - - ifstream nbis((dir + "coeur.utf8").c_str(), ios_base::binary); - test(nbis.good()); - - for(size_t i = 0; i < s.size(); ++i) - { - test(nbis.good()); - nbis.get(c); - char ci = s[i]; - - if(c != ci) - { - cerr << "i == " << i << endl; - cerr << "ci == " << hex << (int)static_cast<unsigned char>(ci) << endl; - cerr << "c == " << hex << (int)static_cast<unsigned char>(c) << endl; - } - test(c == s[i]); - } - test(!nbis.eof()); - nbis.get(c); - test(nbis.eof()); - - cout << "ok" << endl; + cout << "testing wstring with surrogates... "; + + // + // Euro sign (U+20AC) is encoded with 1 UTF-16 code unit, and 3 UTF-8 code units + // U+10437 is a Deseret character, encoded with 2 UTF-16 code units, and 4 UTF-8 code units + // + wstring ws = L"\u20ac\u20ac\U00010437"; + + if(sizeof(wchar_t) == 2) + { + test(ws.length() == 4); + } + else + { + test(sizeof(wchar_t) == 4); + test(ws.length() == 3); + } + + // + // The Unicode string converter implementation allocates an initial buffer + // of size max(2 * (sourceEnd - sourceStart), 4). + // With UTF-16 encoding, that's 8 and the first 2 euros will use the first 6 + // bytes of the initial buffer. + + string ns = IceUtil::wstringToString(ws); + + test(ns.length() == 10); + test(ws == IceUtil::stringToWstring(ns)); + + cout << "ok" << endl; } { - cout << "testing error handling... "; - - // From http://stackoverflow.com/questions/1301402/example-invalid-utf8-string - - string badUTF8[] = { - "\xc3\x28", - "\xa0\xa1", - "\xe2\x28\xa1", - "\xe2\x82\x28", - "\xf0\x28\x8c\xbc", - "\xf0\x90\x28\xbc", - "\xf0\x28\x8c\x28", - "\xf8\xa1\xa1\xa1\xa1", - "\xfc\xa1\xa1\xa1\xa1\xa1", - "" - }; - - for(size_t i = 0; badUTF8[i] != ""; ++i) - { - test(isLegalUTF8Sequence(reinterpret_cast<const Byte*>(badUTF8[i].data()), - reinterpret_cast<const Byte*>(badUTF8[i].data() + badUTF8[i].size())) == false); - - try - { - wstring ws = IceUtil::stringToWstring(badUTF8[i]); - test(false); - } - catch(const IceUtil::IllegalConversionException&) - {} - } + cout << "testing error handling... "; + + // From http://stackoverflow.com/questions/1301402/example-invalid-utf8-string + + string badUTF8[] = { + "\xc3\x28", + "\xa0\xa1", + "\xe2\x28\xa1", + "\xe2\x82\x28", + "\xf0\x28\x8c\xbc", + "\xf0\x90\x28\xbc", + "\xf0\x28\x8c\x28", + "\xf8\xa1\xa1\xa1\xa1", + "\xfc\xa1\xa1\xa1\xa1\xa1", + "" + }; + + for(size_t i = 0; badUTF8[i] != ""; ++i) + { + try + { + wstring ws = IceUtil::stringToWstring(badUTF8[i]); + wcerr << L"Unexpected: " << ws << endl; + test(false); + } + catch(const IceUtil::IllegalConversionException&) + {} + } // TODO: need test for bad UTF-32 strings #ifdef _WIN32 - - wstring badWstring[] = { - wstring(1, wchar_t(0xD800)), - wstring(2, wchar_t(0xDB7F)), - L"" - }; - - for(size_t i = 0; badWstring[i] != L""; ++i) - { - try - { - string s = IceUtil::wstringToString(badWstring[i]); - test(false); - } - catch(const IceUtil::IllegalConversionException&) - {} - } + + // Note: for an unknown reason, the conversion works without + // the extra letter (x below) when using codecvt_utf8_utf16. + + wstring badWstring[] = { + wstring(1, wchar_t(0xD800)) + L"x", + wstring(2, wchar_t(0xDB7F)), + L"" + }; + + for(size_t i = 0; badWstring[i] != L""; ++i) + { + try + { + string s = IceUtil::wstringToString(badWstring[i]); + test(false); + } + catch(const IceUtil::IllegalConversionException&) + {} + } #endif - cout << "ok" << endl; - + cout << "ok" << endl; + } return EXIT_SUCCESS; } |