summary refs log tree commit diff
path: root/cpp/src/IceUtil/StringUtil.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'cpp/src/IceUtil/StringUtil.cpp')
-rw-r--r-- cpp/src/IceUtil/StringUtil.cpp 640
1 files changed, 448 insertions, 192 deletions
diff --git a/cpp/src/IceUtil/StringUtil.cpp b/cpp/src/IceUtil/StringUtil.cpp
index ec33cf8c587..0e7d9162d6c 100644
--- a/cpp/src/IceUtil/StringUtil.cpp
+++ b/cpp/src/IceUtil/StringUtil.cpp
@@ -35,96 +35,89 @@ toOctalString(unsigned int n)
return string(s, charPos, (32 - charPos));
}
+//
+// Return the lower-case hexadecimal digit ('0'-'9' or 'a'-'f') that
+// represents the 4-bit value b. b must be less than 16 (asserted).
+//
+char
+toHexDigit(Byte b)
+{
+    assert(b < 16);
+    return b < 10 ? '0' + b : 'a' - 10 + b;
+}
+
+
+//
+// Consume one UTF-8 continuation byte at p (p is advanced past it) and
+// return codePoint shifted left by 6 bits with the byte's low 6 bits added.
+// Throws IllegalArgumentException if p has reached end, or if the top two
+// bits of the byte are not 10.
+//
+unsigned int
+addContinuationByte(string::iterator& p, string::iterator end, unsigned int codePoint)
+{
+    if(p == end)
+    {
+        throw IllegalArgumentException(__FILE__, __LINE__, "UTF-8 sequence too short");
+    }
+
+    const Byte next = static_cast<Byte>(*p);
+    ++p;
+
+    const bool isContinuation = (next >> 6) == 2;
+    if(!isContinuation)
+    {
+        throw IllegalArgumentException(__FILE__, __LINE__, "Invalid UTF-8 sequence");
+    }
+
+    // The low 6 bits of the shifted value are zero, so OR-ing the payload in is the same as adding it.
+    return (codePoint << 6) | (next & 0x3F);
+}
+
//
-// Write the byte b as an escape sequence if it isn't a printable ASCII
-// character and append the escape sequence to s. Additional characters
-// that should be escaped can be passed in special. If b is any of these
-// characters, b is preceded by a backslash in s.
+// Appends a 2 to 4 byte UTF-8 sequence to result as a universal character name.
+//
+// c is the first byte of the sequence; p points at the byte that follows c and
+// is advanced past every continuation byte that is read; end is the
+// one-past-the-end position of the input.
+//
+// Code points up to 0xFFFF are written as \unnnn, larger ones as \Unnnnnnnn,
+// using lower-case hex digits.
+//
+// Throws IllegalArgumentException if c is not the first byte of a 2, 3 or 4
+// byte sequence, or if a continuation byte is missing or malformed.
+// NOTE(review): overlong encodings and decoded values above 0x10FFFF are not
+// rejected here.
//
void
-encodeChar(string::value_type b, string& s, const string& special)
+appendUniversalName(char c, string::iterator& p, string::iterator end, string& result)
{
- switch(b)
+ unsigned int codePoint;
+
+ Byte b = static_cast<Byte>(c);
+ if((b >> 5) == 0x06)
{
- case '\\':
- {
- s.append("\\\\");
- break;
- }
-
- case '\'':
- {
- s.append("\\'");
- break;
- }
-
- case '"':
- {
- s.append("\\\"");
- break;
- }
-
- case '\b':
- {
- s.append("\\b");
- break;
- }
-
- case '\f':
- {
- s.append("\\f");
- break;
- }
-
- case '\n':
- {
- s.append("\\n");
- break;
- }
-
- case '\r':
- {
- s.append("\\r");
- break;
- }
-
- case '\t':
+ // 2 bytes
+ codePoint = (b & 0x1F);
+ codePoint = addContinuationByte(p, end, codePoint);
+ }
+ else if((b >> 4) == 0x0E)
+ {
+ // 3 bytes
+ codePoint = (b & 0x0F);
+ codePoint = addContinuationByte(p, end, codePoint);
+ codePoint = addContinuationByte(p, end, codePoint);
+ }
+ else if((b >> 3) == 0x1E)
+ {
+ // 4 bytes
+ codePoint = (b & 0x07);
+ codePoint = addContinuationByte(p, end, codePoint);
+ codePoint = addContinuationByte(p, end, codePoint);
+ codePoint = addContinuationByte(p, end, codePoint);
+ }
+ else
+ {
+ // No endl here: the text becomes an exception message and must not end with a newline.
+ ostringstream ostr;
+ ostr << "Invalid first byte 0x" << hex << static_cast<unsigned short>(b) << " in UTF-8 sequence";
+ throw IllegalArgumentException(__FILE__, __LINE__, ostr.str());
+ }
+
+ if(codePoint > 0xFFFF)
+ {
+ result.append("\\U");
+ for(int j = 7; j >= 0; j--)
{
- s.append("\\t");
- break;
+ result.push_back(toHexDigit(static_cast<Byte>((codePoint >> (j * 4)) & 0x0F)));
}
-
- default:
+ }
+ else
+ {
+ result.append("\\u");
+ for(int j = 3; j >= 0; j--)
{
- unsigned char i = static_cast<unsigned char>(b);
- if(!(i >= 32 && i <= 126))
- {
- s.push_back('\\');
- string octal = toOctalString(i);
- //
- // Add leading zeroes so that we avoid problems during
- // decoding. For example, consider the escaped string
- // \0013 (i.e., a character with value 1 followed by the
- // character '3'). If the leading zeroes were omitted, the
- // result would be incorrectly interpreted as a single
- // character with value 11.
- //
- for(string::size_type j = octal.size(); j < 3; j++)
- {
- s.push_back('0');
- }
- s.append(octal);
- }
- else if(special.find(b) != string::npos)
- {
- s.push_back('\\');
- s.push_back(b);
- }
- else
- {
- s.push_back(b);
- }
- break;
+ result.push_back(toHexDigit(static_cast<Byte>((codePoint >> (j * 4)) & 0x0F)));
}
}
}
@@ -132,27 +125,153 @@ encodeChar(string::value_type b, string& s, const string& special)
}
//
-// Add escape sequences (such as "\n", or "\007") to make a string
-// readable in ASCII. Any characters that appear in special are
-// prefixed with a backslash in the returned string.
+// Add escape sequences. Any characters that appear in special are prefixed with a backslash in the returned string.
+//
+// s is first converted to UTF-8 with the process string converter. Backslash, single and double
+// quote, \b, \f, \n, \r and \t always get their two-character escape. Bytes outside 32-126 are
+// handled according to toStringMode:
+//  - Compat: every such byte is written as a 3-digit octal escape;
+//  - otherwise bytes below 32 and byte 127 are written as \u00nn;
+//  - ASCII: each multi-byte UTF-8 sequence is written as \unnnn or \Unnnnnnnn;
+//  - any other mode: the remaining bytes are copied unchanged.
+// In Unicode mode the result is converted back to the native encoding before it is returned.
+//
+// Throws IllegalArgumentException if special contains a character outside ASCII 32-126.
//
string
-IceUtilInternal::escapeString(const string& s, const string& special)
+IceUtilInternal::escapeString(const string& s, const string& special, ToStringMode toStringMode)
{
for(string::size_type i = 0; i < special.size(); ++i)
{
if(static_cast<unsigned char>(special[i]) < 32 || static_cast<unsigned char>(special[i]) > 126)
{
- throw IllegalArgumentException(__FILE__, __LINE__, "special characters must be in ASCII range 32-126");
+ throw IllegalArgumentException(__FILE__, __LINE__, "Special characters must be in ASCII range 32-126");
}
}
-
+
+ //
+ // First convert to UTF-8
+ //
+ string u8s = nativeToUTF8(s, getProcessStringConverter());
+
+ string::iterator p = u8s.begin();
+
string result;
- for(string::size_type i = 0; i < s.size(); ++i)
+
+ while(p != u8s.end())
{
- encodeChar(s[i], result, special);
+ char c = *p++;
+
+ switch(c)
+ {
+ case '\\':
+ {
+ result.append("\\\\");
+ break;
+ }
+
+ case '\'':
+ {
+ result.append("\\'");
+ break;
+ }
+
+ case '"':
+ {
+ result.append("\\\"");
+ break;
+ }
+
+ case '\b':
+ {
+ result.append("\\b");
+ break;
+ }
+
+ case '\f':
+ {
+ result.append("\\f");
+ break;
+ }
+
+ case '\n':
+ {
+ result.append("\\n");
+ break;
+ }
+
+ case '\r':
+ {
+ result.append("\\r");
+ break;
+ }
+
+ case '\t':
+ {
+ result.append("\\t");
+ break;
+ }
+
+ default:
+ {
+ if(special.find(c) != string::npos)
+ {
+ result.push_back('\\');
+ result.push_back(c);
+ }
+ else
+ {
+ unsigned char i = static_cast<unsigned char>(c);
+
+ if(i < 32 || i > 126)
+ {
+ if(toStringMode == ICE_ENUM(ToStringMode, Compat))
+ {
+ // append the byte as a backslash followed by a 3-digit octal string
+
+ result.push_back('\\');
+ string octal = toOctalString(i);
+ //
+ // Add leading zeroes so that we avoid problems during
+ // decoding. For example, consider the escaped string
+ // \0013 (i.e., a character with value 1 followed by the
+ // character '3'). If the leading zeroes were omitted, the
+ // result would be incorrectly interpreted as a single
+ // character with value 11.
+ //
+ for(string::size_type j = octal.size(); j < 3; j++)
+ {
+ result.push_back('0');
+ }
+ result.append(octal);
+ }
+ else if(i < 32 || i == 127)
+ {
+ // ASCII control character: append \u00nn
+ result.append("\\u00");
+ result.push_back(toHexDigit(i >> 4));
+ result.push_back(toHexDigit(i & 0x0F));
+ }
+ else if(toStringMode == ICE_ENUM(ToStringMode, ASCII))
+ {
+ // append \unnnn or \Unnnnnnnn after reading more UTF-8 bytes (p is advanced past them)
+ appendUniversalName(c, p, u8s.end(), result);
+ }
+ else
+ {
+ // keep as is: a byte above 127 in a mode other than Compat and ASCII
+ result.push_back(c);
+ }
+ }
+ else
+ {
+ // printable ASCII character
+ result.push_back(c);
+ }
+ }
+ break;
+ }
+ }
}
-
+
+ if(toStringMode == ICE_ENUM(ToStringMode, Unicode))
+ {
+ //
+ // Convert back to Native
+ //
+ result = UTF8ToNative(result, getProcessStringConverter());
+ }
+ // else it's a pure ASCII string
+
return result;
}
@@ -163,7 +282,7 @@ char
checkChar(const string& s, string::size_type pos)
{
unsigned char c = static_cast<unsigned char>(s[pos]);
- if(!(c >= 32 && c <= 126))
+ if(c < 32 || c == 127)
{
ostringstream ostr;
if(pos > 0)
@@ -174,29 +293,74 @@ checkChar(const string& s, string::size_type pos)
{
ostr << "first character";
}
- ostr << " is not a printable ASCII character (ordinal " << static_cast<int>(c) << ")";
+ ostr << " has invalid ordinal value " << static_cast<int>(c);
throw IllegalArgumentException(__FILE__, __LINE__, ostr.str());
}
return c;
}
//
-// Decode the character or escape sequence starting at start and return it.
+// Append codePoint to result as a UTF-8 sequence of 1 to 4 bytes.
+//
+// inBMP is true when the code point was written with the 4-digit \unnnn form
+// and false for the 8-digit \Unnnnnnnn form; it only selects the error message
+// reported for a surrogate.
+//
+// Throws IllegalArgumentException if codePoint is a surrogate (0xD800-0xDFFF)
+// or is greater than 0x10FFFF.
+//
+void
+appendUTF8(unsigned int codePoint, bool inBMP, string& result)
+{
+    if(codePoint >= 0xD800 && codePoint <= 0xDFFF)
+    {
+        //
+        // Surrogates are not valid code points to encode in UTF-8, whichever
+        // escape form was used. With \unnnn the likely cause is an attempt to
+        // write a surrogate pair, so that case keeps its dedicated message.
+        //
+        throw IllegalArgumentException(__FILE__, __LINE__,
+            inBMP ? "A non-BMP character cannot be encoded with \\unnnn, use \\Unnnnnnnn instead"
+                  : "Invalid universal character name");
+    }
+
+    if(codePoint <= 0x7F)
+    {
+        // ASCII
+        result.push_back(static_cast<char>(codePoint));
+    }
+    else if(codePoint <= 0x7FF)
+    {
+        // 2 bytes
+        result.push_back(static_cast<char>((codePoint >> 6) | 0xC0));
+        result.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
+    }
+    else if(codePoint <= 0xFFFF)
+    {
+        // 3 bytes
+        result.push_back(static_cast<char>((codePoint >> 12) | 0xE0));
+        result.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
+        result.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
+    }
+    else if(codePoint <= 0x10FFFF)
+    {
+        // 4 bytes
+        result.push_back(static_cast<char>((codePoint >> 18) | 0xF0));
+        result.push_back(static_cast<char>(((codePoint >> 12) & 0x3F) | 0x80));
+        result.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
+        result.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
+    }
+    else
+    {
+        throw IllegalArgumentException(__FILE__, __LINE__, "Invalid universal character name");
+    }
+}
+
+//
+// Decode the character or escape sequence starting at start and append it to result;
// end marks the one-past-the-end position of the substring to be scanned.
// nextStart is set to the index of the first character following the decoded
// character or escape sequence.
//
-char
-decodeChar(const string& s, string::size_type start, string::size_type end, string::size_type& nextStart)
+bool
+decodeChar(const string& s, string::size_type start, string::size_type end, string::size_type& nextStart,
+ string& result)
{
assert(start < end);
assert(end <= s.size());
- char c;
+ bool pureASCII = true;
if(s[start] != '\\')
{
- c = checkChar(s, start++);
+ result.push_back(checkChar(s, start++));
}
else
{
@@ -204,43 +368,90 @@ decodeChar(const string& s, string::size_type start, string::size_type end, stri
{
throw IllegalArgumentException(__FILE__, __LINE__, "trailing backslash");
}
- switch(s[++start])
+
+ char c = s[++start];
+
+ switch(c)
{
- case '\\':
- case '\'':
- case '"':
+ case '\\':
+ case '\'':
+ case '"':
+ {
+ ++start;
+ result.push_back(c);
+ break;
+ }
+ case 'b':
{
- c = s[start++];
+ ++start;
+ result.push_back('\b');
break;
}
- case 'b':
+ case 'f':
{
++start;
- c = '\b';
+ result.push_back('\f');
break;
}
- case 'f':
+ case 'n':
{
++start;
- c = '\f';
+ result.push_back('\n');
break;
}
- case 'n':
+ case 'r':
{
++start;
- c = '\n';
+ result.push_back('\r');
break;
}
- case 'r':
+ case 't':
{
++start;
- c = '\r';
+ result.push_back('\t');
break;
}
- case 't':
+ case 'u':
+ case 'U':
{
+ unsigned int codePoint = 0;
+ bool inBMP = (c == 'u');
+ int size = inBMP ? 4 : 8;
++start;
- c = '\t';
+ while(size > 0 && start < end)
+ {
+ c = s[start++];
+ int charVal = 0;
+ if(c >= '0' && c <= '9')
+ {
+ charVal = c - '0';
+ }
+ else if(c >= 'a' && c <= 'f')
+ {
+ charVal = 10 + (c - 'a');
+ }
+ else if(c >= 'A' && c <= 'F')
+ {
+ charVal = 10 + (c - 'A');
+ }
+ else
+ {
+ break; // while
+ }
+ codePoint = codePoint * 16 + static_cast<unsigned int>(charVal);
+ --size;
+ }
+ if(size > 0)
+ {
+ throw IllegalArgumentException(__FILE__, __LINE__,
+ "Invalid universal character name: too few hex digits");
+ }
+
+ appendUTF8(codePoint, inBMP, result);
+ if(codePoint > 127)
+ {
+ pureASCII = false;
+ }
break;
}
case '0':
@@ -269,31 +480,26 @@ decodeChar(const string& s, string::size_type start, string::size_type end, stri
ostr << "octal value \\" << oct << val << dec << " (" << val << ") is out of range";
throw IllegalArgumentException(__FILE__, __LINE__, ostr.str());
}
- c = static_cast<char>(val);
+ result.push_back(static_cast<char>(val));
+ if(val > 127)
+ {
+ pureASCII = false;
+ }
break;
}
default:
{
- c = checkChar(s, start++);
+ if(static_cast<unsigned char>(c) > 127)
+ {
+ pureASCII = false;
+ }
+ result.push_back(checkChar(s, start++));
break;
}
}
}
nextStart = start;
- return c;
-}
-
-//
-// Remove escape sequences from s and append the result to sb.
-// Return true if successful, false otherwise.
-//
-void
-decodeString(const string& s, string::size_type start, string::size_type end, string& sb)
-{
- while(start < end)
- {
- sb.push_back(decodeChar(s, start, end, start));
- }
+ return pureASCII;
}
}
@@ -306,11 +512,61 @@ IceUtilInternal::unescapeString(const string& s, string::size_type start, string
{
assert(start <= end && end <= s.size());
- string result;
- result.reserve(end - start);
- result.clear();
- decodeString(s, start, end, result);
- return result;
+ //
+ // Unescape the characters of s in the range [start, end) and return the result.
+ // checkChar and decodeChar throw IllegalArgumentException on invalid input.
+ //
+ // Optimization for strings without escapes
+ string::size_type p = s.find('\\', start);
+ if(p == string::npos || p >= end)
+ {
+ p = start;
+ while(p < end)
+ {
+ checkChar(s, p++);
+ }
+ // substr takes (position, count), so pass the length of the range, not its end index
+ return s.substr(start, end - start);
+ }
+ else
+ {
+ StringConverterPtr stringConverter = getProcessStringConverter();
+
+ const string* inputStringPtr = &s;
+ string u8s;
+
+ if(stringConverter)
+ {
+ // Only pay for the native to UTF-8 conversion when the range holds a byte above 127
+ bool inputIsPureASCII = true;
+ string::size_type i = start;
+ while(i < end && inputIsPureASCII)
+ {
+ inputIsPureASCII = static_cast<unsigned char>(s[i++]) <= 127;
+ }
+
+ if(!inputIsPureASCII)
+ {
+ // Convert only the [start, end) range; substr takes (position, count)
+ u8s = nativeToUTF8(s.substr(start, end - start), stringConverter);
+ inputStringPtr = &u8s;
+ start = 0;
+ end = u8s.size();
+ }
+ }
+
+ bool resultIsPureASCII = true;
+ string result;
+ result.reserve(end - start);
+ while(start < end)
+ {
+ // decodeChar returns true when what it appended is pure ASCII
+ if(!decodeChar(*inputStringPtr, start, end, start, result))
+ {
+ resultIsPureASCII = false;
+ }
+ }
+
+ if(stringConverter && !resultIsPureASCII)
+ {
+ // Need to convert from UTF-8 to Native
+ result = UTF8ToNative(result, stringConverter);
+ }
+
+ return result;
+ }
}
bool
@@ -328,7 +584,7 @@ IceUtilInternal::splitString(const string& str, const string& delim, vector<stri
quoteChar = str[pos++];
continue; // Skip the quote
}
- else if(quoteChar == '\0' && str[pos] == '\\' && pos + 1 < length &&
+ else if(quoteChar == '\0' && str[pos] == '\\' && pos + 1 < length &&
(str[pos + 1] == '\'' || str[pos + 1] == '"'))
{
++pos;
@@ -356,7 +612,7 @@ IceUtilInternal::splitString(const string& str, const string& delim, vector<stri
continue;
}
}
-
+
if(pos < length)
{
elt += str[pos++];
@@ -437,7 +693,7 @@ IceUtilInternal::checkQuote(const string& s, string::size_type start)
//
// Match `s' against the pattern `pat'. A * in the pattern acts
// as a wildcard: it matches any non-empty sequence of characters.
-// We match by hand here because it's portable across platforms
+// We match by hand here because it's portable across platforms
// (whereas regex() isn't). Only one * per pattern is supported.
//
bool
@@ -500,7 +756,7 @@ IceUtilInternal::errorToString(int error, LPCVOID source)
wstring lpMsgBuf(256, wchar_t());
DWORD stored = 0;
-
+
while(stored == 0)
{
stored = FormatMessageW(
@@ -531,7 +787,7 @@ IceUtilInternal::errorToString(int error, LPCVOID source)
else
{
break;
- }
+ }
}
}
@@ -539,7 +795,7 @@ IceUtilInternal::errorToString(int error, LPCVOID source)
#else
LPWSTR msg = 0;
-
+
DWORD stored = FormatMessageW(
FORMAT_MESSAGE_ALLOCATE_BUFFER |
FORMAT_MESSAGE_FROM_SYSTEM |
@@ -552,7 +808,7 @@ IceUtilInternal::errorToString(int error, LPCVOID source)
0,
NULL);
#endif
-
+
if(stored > 0)
{
assert(msg && wcslen(msg) > 0);
@@ -576,159 +832,159 @@ IceUtilInternal::errorToString(int error, LPCVOID source)
return os.str();
}
}
-
+
switch(error)
{
case WSAEINTR:
return "WSAEINTR";
-
+
case WSAEBADF:
return "WSAEBADF";
-
+
case WSAEACCES:
return "WSAEACCES";
-
+
case WSAEFAULT:
return "WSAEFAULT";
-
+
case WSAEINVAL:
return "WSAEINVAL";
-
+
case WSAEMFILE:
return "WSAEMFILE";
-
+
case WSAEWOULDBLOCK:
return "WSAEWOULDBLOCK";
-
+
case WSAEINPROGRESS:
return "WSAEINPROGRESS";
-
+
case WSAEALREADY:
return "WSAEALREADY";
-
+
case WSAENOTSOCK:
return "WSAENOTSOCK";
-
+
case WSAEDESTADDRREQ:
return "WSAEDESTADDRREQ";
-
+
case WSAEMSGSIZE:
return "WSAEMSGSIZE";
-
+
case WSAEPROTOTYPE:
return "WSAEPROTOTYPE";
-
+
case WSAENOPROTOOPT:
return "WSAENOPROTOOPT";
-
+
case WSAEPROTONOSUPPORT:
return "WSAEPROTONOSUPPORT";
-
+
case WSAESOCKTNOSUPPORT:
return "WSAESOCKTNOSUPPORT";
-
+
case WSAEOPNOTSUPP:
return "WSAEOPNOTSUPP";
-
+
case WSAEPFNOSUPPORT:
return "WSAEPFNOSUPPORT";
-
+
case WSAEAFNOSUPPORT:
return "WSAEAFNOSUPPORT";
-
+
case WSAEADDRINUSE:
return "WSAEADDRINUSE";
-
+
case WSAEADDRNOTAVAIL:
return "WSAEADDRNOTAVAIL";
-
+
case WSAENETDOWN:
return "WSAENETDOWN";
-
+
case WSAENETUNREACH:
return "WSAENETUNREACH";
-
+
case WSAENETRESET:
return "WSAENETRESET";
-
+
case WSAECONNABORTED:
return "WSAECONNABORTED";
-
+
case WSAECONNRESET:
return "WSAECONNRESET";
-
+
case WSAENOBUFS:
return "WSAENOBUFS";
-
+
case WSAEISCONN:
return "WSAEISCONN";
-
+
case WSAENOTCONN:
return "WSAENOTCONN";
-
+
case WSAESHUTDOWN:
return "WSAESHUTDOWN";
-
+
case WSAETOOMANYREFS:
return "WSAETOOMANYREFS";
-
+
case WSAETIMEDOUT:
return "WSAETIMEDOUT";
-
+
case WSAECONNREFUSED:
return "WSAECONNREFUSED";
-
+
case WSAELOOP:
return "WSAELOOP";
-
+
case WSAENAMETOOLONG:
return "WSAENAMETOOLONG";
-
+
case WSAEHOSTDOWN:
return "WSAEHOSTDOWN";
-
+
case WSAEHOSTUNREACH:
return "WSAEHOSTUNREACH";
-
+
case WSAENOTEMPTY:
return "WSAENOTEMPTY";
-
+
case WSAEPROCLIM:
return "WSAEPROCLIM";
-
+
case WSAEUSERS:
return "WSAEUSERS";
-
+
case WSAEDQUOT:
return "WSAEDQUOT";
-
+
case WSAESTALE:
return "WSAESTALE";
-
+
case WSAEREMOTE:
return "WSAEREMOTE";
-
+
case WSAEDISCON:
return "WSAEDISCON";
-
+
case WSASYSNOTREADY:
return "WSASYSNOTREADY";
-
+
case WSAVERNOTSUPPORTED:
return "WSAVERNOTSUPPORTED";
-
+
case WSANOTINITIALISED:
return "WSANOTINITIALISED";
-
+
case WSAHOST_NOT_FOUND:
return "WSAHOST_NOT_FOUND";
-
+
case WSATRY_AGAIN:
return "WSATRY_AGAIN";
-
+
case WSANO_RECOVERY:
return "WSANO_RECOVERY";
-
+
case WSANO_DATA:
return "WSANO_DATA";