summaryrefslogtreecommitdiff
path: root/csharp/src/Ice/StringUtil.cs
diff options
context:
space:
mode:
Diffstat (limited to 'csharp/src/Ice/StringUtil.cs')
-rw-r--r--csharp/src/Ice/StringUtil.cs407
1 files changed, 273 insertions, 134 deletions
diff --git a/csharp/src/Ice/StringUtil.cs b/csharp/src/Ice/StringUtil.cs
index d683cc8b92f..c8c392be86d 100644
--- a/csharp/src/Ice/StringUtil.cs
+++ b/csharp/src/Ice/StringUtil.cs
@@ -25,7 +25,7 @@ namespace IceUtilInternal
{
return findFirstOf(str, match, 0);
}
-
+
//
// Return the index of the first character in str to
// appear in match, starting from start. Returns -1 if none is
@@ -42,10 +42,10 @@ namespace IceUtilInternal
return i;
}
}
-
+
return -1;
}
-
+
//
// Return the index of the first character in str which does
// not appear in match, starting from 0. Returns -1 if none is
@@ -55,7 +55,7 @@ namespace IceUtilInternal
{
return findFirstNotOf(str, match, 0);
}
-
+
//
// Return the index of the first character in str which does
// not appear in match, starting from start. Returns -1 if none is
@@ -72,114 +72,135 @@ namespace IceUtilInternal
return i;
}
}
-
+
return -1;
}
-
- //
- // Write the byte b as an escape sequence if it isn't a printable ASCII
- // character and append the escape sequence to sb. Additional characters
- // that should be escaped can be passed in special. If b is any of these
- // characters, b is preceded by a backslash in sb.
- //
- private static void encodeChar(byte b, StringBuilder sb, string special)
+
+ private static void
+ encodeChar(char c, StringBuilder sb, string special, Ice.ToStringMode toStringMode)
{
- switch((char)b)
+ switch(c)
{
- case '\\':
+ case '\\':
{
sb.Append("\\\\");
break;
}
-
- case '\'':
+
+ case '\'':
{
sb.Append("\\'");
break;
}
-
- case '"':
+
+ case '"':
{
sb.Append("\\\"");
break;
}
-
- case '\b':
+
+ case '\b':
{
sb.Append("\\b");
break;
}
-
- case '\f':
+
+ case '\f':
{
sb.Append("\\f");
break;
}
-
- case '\n':
+
+ case '\n':
{
sb.Append("\\n");
break;
}
-
- case '\r':
+
+ case '\r':
{
sb.Append("\\r");
break;
}
-
- case '\t':
+
+ case '\t':
{
sb.Append("\\t");
break;
}
-
- default:
+ default:
{
- if(!(b >= 32 && b <= 126))
+ if(special != null && special.IndexOf(c) != -1)
{
sb.Append('\\');
- string octal = System.Convert.ToString(b, 8);
- //
- // Add leading zeroes so that we avoid problems during
- // decoding. For example, consider the encoded string
- // \0013 (i.e., a character with value 1 followed by
- // the character '3'). If the leading zeroes were omitted,
- // the result would be incorrectly interpreted by the
- // decoder as a single character with value 11.
- //
- for(int j = octal.Length; j < 3; j++)
- {
- sb.Append('0');
- }
- sb.Append(octal);
- }
- else if(special != null && special.IndexOf((char)b) != -1)
- {
- sb.Append('\\');
- sb.Append((char)b);
+ sb.Append(c);
}
else
{
- sb.Append((char)b);
+ int i = (int)c;
+ if(i < 32 || i > 126)
+ {
+ if(toStringMode == Ice.ToStringMode.Compat)
+ {
+ //
+ // When ToStringMode=Compat, c is a UTF-8 byte
+ //
+ Debug.Assert(i < 256);
+
+ sb.Append('\\');
+ string octal = System.Convert.ToString(i, 8);
+ //
+ // Add leading zeroes so that we avoid problems during
+ // decoding. For example, consider the encoded string
+ // \0013 (i.e., a character with value 1 followed by
+ // the character '3'). If the leading zeroes were omitted,
+ // the result would be incorrectly interpreted by the
+ // decoder as a single character with value 11.
+ //
+ for(int j = octal.Length; j < 3; j++)
+ {
+ sb.Append('0');
+ }
+ sb.Append(octal);
+ }
+ else if(i < 32 || i == 127 || toStringMode == Ice.ToStringMode.ASCII)
+ {
+ // append \\unnnn
+ sb.Append("\\u");
+ string hex = System.Convert.ToString(i, 16);
+ for(int j = hex.Length; j < 4; j++)
+ {
+ sb.Append('0');
+ }
+ sb.Append(hex);
+ }
+ else
+ {
+ // keep as is
+ sb.Append(c);
+ }
+ }
+ else
+ {
+ // printable ASCII character
+ sb.Append(c);
+ }
}
- }
break;
+ }
}
}
-
+
//
- // Add escape sequences (such as "\n", or "\007") to make a string
- // readable in ASCII. Any characters that appear in special are
- // prefixed with a backslash in the returned string.
+ // Add escape sequences (such as "\n", or "\007") to the input string
//
- public static string escapeString(string s, string special)
+ public static string escapeString(string s, string special, Ice.ToStringMode toStringMode)
{
if(special != null)
{
for(int i = 0; i < special.Length; ++i)
{
- if((int)special[i] < 32 || (int)special[i] > 126)
+ if(special[i] < 32 || special[i] > 126)
{
throw new System.ArgumentException("special characters must be in ASCII range 32-126",
"special");
@@ -187,22 +208,64 @@ namespace IceUtilInternal
}
}
- UTF8Encoding utf8 = new UTF8Encoding();
- byte[] bytes = utf8.GetBytes(s);
+ if(toStringMode == Ice.ToStringMode.Compat)
+ {
+ // Encode UTF-8 bytes
+
+ UTF8Encoding utf8 = new UTF8Encoding();
+ byte[] bytes = utf8.GetBytes(s);
- StringBuilder result = new StringBuilder(bytes.Length);
- for(int i = 0; i < bytes.Length; i++)
+ StringBuilder result = new StringBuilder(bytes.Length);
+ for(int i = 0; i < bytes.Length; i++)
+ {
+ encodeChar((char)bytes[i], result, special, toStringMode);
+ }
+
+ return result.ToString();
+ }
+ else
{
- encodeChar(bytes[i], result, special);
+ StringBuilder result = new StringBuilder(s.Length);
+
+ for(int i = 0; i < s.Length; i++)
+ {
+ char c = s[i];
+ if(toStringMode == Ice.ToStringMode.Unicode || !System.Char.IsSurrogate(c))
+ {
+ encodeChar(c, result, special, toStringMode);
+ }
+ else
+ {
+ Debug.Assert(toStringMode == Ice.ToStringMode.ASCII && System.Char.IsSurrogate(c));
+ if(i + 1 == s.Length)
+ {
+ throw new System.ArgumentException("High surrogate without low surrogate");
+ }
+ else
+ {
+ i++;
+ int codePoint = System.Char.ConvertToUtf32(c, s[i]);
+ // append \Unnnnnnnn
+ result.Append("\\U");
+ string hex = System.Convert.ToString(codePoint, 16);
+ for(int j = hex.Length; j < 8; j++)
+ {
+ result.Append('0');
+ }
+ result.Append(hex);
+ }
+ }
+ }
+
+ return result.ToString();
}
-
- return result.ToString();
}
-
- private static char checkChar(string s, int pos)
+
+ private static char
+ checkChar(string s, int pos)
{
char c = s[pos];
- if(!(c >= 32 && c <= 126))
+ if(c < 32 || c == 127)
{
string msg;
if(pos > 0)
@@ -219,23 +282,22 @@ namespace IceUtilInternal
return c;
}
+
//
- // Decode the character or escape sequence starting at start and return it.
- // end marks the one-past-the-end position of the substring to be scanned.
- // nextStart is set to the index of the first character following the decoded
- // character or escape sequence.
+ // Decode the character or escape sequence starting at start and appends it to result;
+ // returns the index of the first character following the decoded character
+ // or escape sequence.
//
- private static char decodeChar(string s, int start, int end, out int nextStart)
+ private static int
+ decodeChar(string s, int start, int end, StringBuilder result, UTF8Encoding utf8Encoding)
{
Debug.Assert(start >= 0);
Debug.Assert(start < end);
Debug.Assert(end <= s.Length);
- char c;
-
if(s[start] != '\\')
{
- c = checkChar(s, start++);
+ result.Append(checkChar(s, start++));
}
else
{
@@ -243,45 +305,98 @@ namespace IceUtilInternal
{
throw new System.ArgumentException("trailing backslash");
}
- switch(s[++start])
+
+ char c = s[++start];
+
+ switch(c)
{
- case '\\':
- case '\'':
- case '"':
+ case '\\':
+ case '\'':
+ case '"':
+ {
+ ++start;
+ result.Append(c);
+ break;
+ }
+ case 'b':
{
- c = s[start++];
+ ++start;
+ result.Append('\b');
break;
}
- case 'b':
+ case 'f':
{
++start;
- c = '\b';
+ result.Append('\f');
break;
}
- case 'f':
+ case 'n':
{
++start;
- c = '\f';
+ result.Append('\n');
break;
}
- case 'n':
+ case 'r':
{
++start;
- c = '\n';
+ result.Append('\r');
break;
}
- case 'r':
+ case 't':
{
++start;
- c = '\r';
+ result.Append('\t');
break;
}
- case 't':
+ case 'u':
+ case 'U':
{
+ int codePoint = 0;
+ bool inBMP = (c == 'u');
+ int size = inBMP ? 4 : 8;
++start;
- c = '\t';
+ while(size > 0 && start < end)
+ {
+ c = s[start++];
+ int charVal = 0;
+ if(c >= '0' && c <= '9')
+ {
+ charVal = c - '0';
+ }
+ else if(c >= 'a' && c <= 'f')
+ {
+ charVal = 10 + (c - 'a');
+ }
+ else if(c >= 'A' && c <= 'F')
+ {
+ charVal = 10 + (c - 'A');
+ }
+ else
+ {
+ break; // while
+ }
+ codePoint = codePoint * 16 + charVal;
+ --size;
+ }
+ if(size > 0)
+ {
+ throw new System.ArgumentException("Invalid universal character name: too few hex digits");
+ }
+ if(inBMP && System.Char.IsSurrogate((char)codePoint))
+ {
+ throw new System.ArgumentException("A non-BMP character cannot be encoded with \\unnnn, use \\Unnnnnnnn instead");
+ }
+ if(inBMP || codePoint <= 0xFFFF)
+ {
+ result.Append((char)codePoint);
+ }
+ else
+ {
+ result.Append(System.Char.ConvertFromUtf32(codePoint));
+ }
break;
}
+
case '0':
case '1':
case '2':
@@ -291,47 +406,63 @@ namespace IceUtilInternal
case '6':
case '7':
{
- int val = 0;
- for(int j = 0; j < 3 && start < end; ++j)
+ // UTF-8 byte sequence encoded with octal escapes
+
+ byte[] arr = new byte[end - start];
+ int i = 0;
+ bool done = false;
+ while(!done)
{
- int charVal = s[start++] - '0';
- if(charVal < 0 || charVal > 7)
+ int val = 0;
+ for(int j = 0; j < 3 && start < end; ++j)
{
- --start;
- break;
+ int charVal = s[start++] - '0';
+ if(charVal < 0 || charVal > 7)
+ {
+ --start;
+ if(j == 0)
+ {
+ // first character after escape is not 0-7:
+ done = true;
+ --start; // go back to the previous backslash
+ }
+ break; // for
+ }
+ val = val * 8 + charVal;
+ }
+
+ if(!done)
+ {
+ if(val > 255)
+ {
+ string msg = "octal value \\" + System.Convert.ToString(val, 8) + " (" + val + ") is out of range";
+ throw new System.ArgumentException(msg);
+ }
+ arr[i++] = (byte)val;
+
+ if((start + 1 < end) && s[start] == '\\')
+ {
+ start++;
+ // loop, read next octal escape sequence
+ }
+ else
+ {
+ done = true;
+ }
}
- val = val * 8 + charVal;
- }
- if(val > 255)
- {
- string msg = "octal value \\" + System.Convert.ToString(val, 8) + " (" + val +
- ") is out of range";
- throw new System.ArgumentException(msg, "s");
}
- c = System.Convert.ToChar(val);
+
+ result.Append(utf8Encoding.GetString(arr, 0, i)); // May raise ArgumentException.
break;
}
default:
{
- c = checkChar(s, start++);
+ result.Append(checkChar(s, start++));
break;
}
}
}
- nextStart = start;
- return c;
- }
-
- //
- // Remove escape sequences from s and append the result to sb.
- // Return true if successful, false otherwise.
- //
- private static void decodeString(string s, int start, int end, StringBuilder sb)
- {
- while(start < end)
- {
- sb.Append(decodeChar(s, start, end, out start));
- }
+ return start;
}
//
@@ -342,18 +473,26 @@ namespace IceUtilInternal
{
Debug.Assert(start >= 0 && start <= end && end <= s.Length);
- StringBuilder sb = new StringBuilder();
- decodeString(s, start, end, sb);
- string decodedString = sb.ToString();
-
- byte[] arr = new byte[decodedString.Length];
- for(int i = 0; i < arr.Length; ++i)
+ // Optimization for strings without escapes
+ if(start == end || s.IndexOf('\\', start, end - start) == -1)
{
- arr[i] = (byte)decodedString[i];
+ int p = start;
+ while(p < end)
+ {
+ checkChar(s, p++);
+ }
+ return s.Substring(start, end - start);
+ }
+ else
+ {
+ StringBuilder sb = new StringBuilder(end - start);
+ UTF8Encoding utf8Encoding = new UTF8Encoding(false, true);
+ while(start < end)
+ {
+ start = decodeChar(s, start, end, sb, utf8Encoding);
+ }
+ return sb.ToString();
}
-
- UTF8Encoding utf8 = new UTF8Encoding(false, true);
- return utf8.GetString(arr, 0, arr.Length); // May raise ArgumentException.
}
//
@@ -402,7 +541,7 @@ namespace IceUtilInternal
continue;
}
}
-
+
if(pos < str.Length)
{
arr[n++] = str[pos++];
@@ -425,7 +564,7 @@ namespace IceUtilInternal
{
return checkQuote(s, 0);
}
-
+
//
// If a single or double quotation mark is found at the start position,
// then the position of the matching closing quote is returned. If no
@@ -507,7 +646,7 @@ namespace IceUtilInternal
public int Compare(string l, string r)
{
return string.CompareOrdinal(l, r);
- }
+ }
}
public static System.Collections.Generic.IComparer<string> OrdinalStringComparer =
new OrdinalStringComparerImpl();