diff options
Diffstat (limited to 'csharp/src/Ice/StringUtil.cs')
-rw-r--r-- | csharp/src/Ice/StringUtil.cs | 497 |
1 files changed, 357 insertions, 140 deletions
diff --git a/csharp/src/Ice/StringUtil.cs b/csharp/src/Ice/StringUtil.cs index 76dbbcda543..8fc7555dd0c 100644 --- a/csharp/src/Ice/StringUtil.cs +++ b/csharp/src/Ice/StringUtil.cs @@ -25,7 +25,7 @@ namespace IceUtilInternal { return findFirstOf(str, match, 0); } - + // // Return the index of the first character in str to // appear in match, starting from start. Returns -1 if none is @@ -42,10 +42,10 @@ namespace IceUtilInternal return i; } } - + return -1; } - + // // Return the index of the first character in str which does // not appear in match, starting from 0. Returns -1 if none is @@ -55,7 +55,7 @@ namespace IceUtilInternal { return findFirstNotOf(str, match, 0); } - + // // Return the index of the first character in str which does // not appear in match, starting from start. Returns -1 if none is @@ -67,119 +67,159 @@ namespace IceUtilInternal for(int i = start; i < len; i++) { char ch = str[i]; - if(match.IndexOf((char) ch) == -1) + if(match.IndexOf(ch) == -1) { return i; } } - + return -1; } - - // - // Write the byte b as an escape sequence if it isn't a printable ASCII - // character and append the escape sequence to sb. Additional characters - // that should be escaped can be passed in special. If b is any of these - // characters, b is preceded by a backslash in sb. - // - private static void encodeChar(byte b, StringBuilder sb, string special) + + private static void + encodeChar(char c, StringBuilder sb, string special, Ice.ToStringMode toStringMode) { - switch((char)b) + switch(c) { - case '\\': + case '\\': { sb.Append("\\\\"); break; } - - case '\'': + case '\'': { sb.Append("\\'"); break; } - - case '"': + case '"': { sb.Append("\\\""); break; } - - case '\b': + case '\a': + { + if(toStringMode == Ice.ToStringMode.Compat) + { + // Octal escape for compatibility with 3.6 and earlier + sb.Append("\\007"); + } + else + { + sb.Append("\\a"); + } + break; + } + case '\b': { sb.Append("\\b"); break; } - - case '\f': + case '\f': { sb.Append("\\f"); break; } - - case '\n': + case '\n': { sb.Append("\\n"); break; } - - case '\r': + case '\r': { sb.Append("\\r"); break; } - - case '\t': + case '\t': { sb.Append("\\t"); break; } - - default: + case '\v': { - if(!(b >= 32 && b <= 126)) + if(toStringMode == Ice.ToStringMode.Compat) { - sb.Append('\\'); - string octal = System.Convert.ToString(b, 8); - // - // Add leading zeroes so that we avoid problems during - // decoding. For example, consider the encoded string - // \0013 (i.e., a character with value 1 followed by - // the character '3'). If the leading zeroes were omitted, - // the result would be incorrectly interpreted by the - // decoder as a single character with value 11. - // - for(int j = octal.Length; j < 3; j++) - { - sb.Append('0'); - } - sb.Append(octal); + // Octal escape for compatibility with 3.6 and earlier + sb.Append("\\013"); } - else if(special != null && special.IndexOf((char)b) != -1) + else + { + sb.Append("\\v"); + } + break; + } + default: + { + if(special != null && special.IndexOf(c) != -1) { sb.Append('\\'); - sb.Append((char)b); + sb.Append(c); } else { - sb.Append((char)b); + int i = (int)c; + if(i < 32 || i > 126) + { + if(toStringMode == Ice.ToStringMode.Compat) + { + // + // When ToStringMode=Compat, c is a UTF-8 byte + // + Debug.Assert(i < 256); + + sb.Append('\\'); + string octal = System.Convert.ToString(i, 8); + // + // Add leading zeroes so that we avoid problems during + // decoding. For example, consider the encoded string + // \0013 (i.e., a character with value 1 followed by + // the character '3'). If the leading zeroes were omitted, + // the result would be incorrectly interpreted by the + // decoder as a single character with value 11. + // + for(int j = octal.Length; j < 3; j++) + { + sb.Append('0'); + } + sb.Append(octal); + } + else if(i < 32 || i == 127 || toStringMode == Ice.ToStringMode.ASCII) + { + // append \\unnnn + sb.Append("\\u"); + string hex = System.Convert.ToString(i, 16); + for(int j = hex.Length; j < 4; j++) + { + sb.Append('0'); + } + sb.Append(hex); + } + else + { + // keep as is + sb.Append(c); + } + } + else + { + // printable ASCII character + sb.Append(c); + } } - } break; + } } } - + // - // Add escape sequences (such as "\n", or "\007") to make a string - // readable in ASCII. Any characters that appear in special are - // prefixed with a backslash in the returned string. + // Add escape sequences (such as "\n", or "\007") to the input string // - public static string escapeString(string s, string special) + public static string escapeString(string s, string special, Ice.ToStringMode toStringMode) { if(special != null) { for(int i = 0; i < special.Length; ++i) { - if((int)special[i] < 32 || (int)special[i] > 126) + if(special[i] < 32 || special[i] > 126) { throw new System.ArgumentException("special characters must be in ASCII range 32-126", "special"); @@ -187,22 +227,64 @@ namespace IceUtilInternal } } - UTF8Encoding utf8 = new UTF8Encoding(); - byte[] bytes = utf8.GetBytes(s); + if(toStringMode == Ice.ToStringMode.Compat) + { + // Encode UTF-8 bytes + + UTF8Encoding utf8 = new UTF8Encoding(); + byte[] bytes = utf8.GetBytes(s); - StringBuilder result = new StringBuilder(bytes.Length); - for(int i = 0; i < bytes.Length; i++) + StringBuilder result = new StringBuilder(bytes.Length); + for(int i = 0; i < bytes.Length; i++) + { + encodeChar((char)bytes[i], result, special, toStringMode); + } + + return result.ToString(); + } + else { - encodeChar(bytes[i], result, special); + StringBuilder result = new StringBuilder(s.Length); + + for(int i = 0; i < s.Length; i++) + { + char c = s[i]; + if(toStringMode == Ice.ToStringMode.Unicode || !char.IsSurrogate(c)) + { + encodeChar(c, result, special, toStringMode); + } + else + { + Debug.Assert(toStringMode == Ice.ToStringMode.ASCII && char.IsSurrogate(c)); + if(i + 1 == s.Length) + { + throw new System.ArgumentException("High surrogate without low surrogate"); + } + else + { + i++; + int codePoint = char.ConvertToUtf32(c, s[i]); + // append \Unnnnnnnn + result.Append("\\U"); + string hex = System.Convert.ToString(codePoint, 16); + for(int j = hex.Length; j < 8; j++) + { + result.Append('0'); + } + result.Append(hex); + } + } + } + + return result.ToString(); } - - return result.ToString(); } - - private static char checkChar(string s, int pos) + + private static char + checkChar(string s, int pos) { char c = s[pos]; - if(!(c >= 32 && c <= 126)) + if(c < 32 || c == 127) { string msg; if(pos > 0) @@ -219,69 +301,134 @@ namespace IceUtilInternal return c; } + // - // Decode the character or escape sequence starting at start and return it. - // end marks the one-past-the-end position of the substring to be scanned. - // nextStart is set to the index of the first character following the decoded - // character or escape sequence. + // Decode the character or escape sequence starting at start and appends it to result; + // returns the index of the first character following the decoded character + // or escape sequence. // - private static char decodeChar(string s, int start, int end, out int nextStart) + private static int + decodeChar(string s, int start, int end, string special, StringBuilder result, UTF8Encoding utf8Encoding) { Debug.Assert(start >= 0); Debug.Assert(start < end); Debug.Assert(end <= s.Length); - char c; - if(s[start] != '\\') { - c = checkChar(s, start++); + result.Append(checkChar(s, start++)); + } + else if(start + 1 == end) + { + ++start; + result.Append('\\'); // trailing backslash } else { - if(start + 1 == end) - { - throw new System.ArgumentException("trailing backslash"); - } - switch(s[++start]) + char c = s[++start]; + + switch(c) { - case '\\': - case '\'': - case '"': + case '\\': + case '\'': + case '"': + case '?': { - c = s[start++]; + ++start; + result.Append(c); break; } - case 'b': + case 'a': { ++start; - c = '\b'; + result.Append('\a'); break; } - case 'f': + case 'b': { ++start; - c = '\f'; + result.Append('\b'); break; } - case 'n': + case 'f': { ++start; - c = '\n'; + result.Append('\f'); break; } - case 'r': + case 'n': { ++start; - c = '\r'; + result.Append('\n'); break; } - case 't': + case 'r': { ++start; - c = '\t'; + result.Append('\r'); break; } + case 't': + { + ++start; + result.Append('\t'); + break; + } + case 'v': + { + ++start; + result.Append('\v'); + break; + } + case 'u': + case 'U': + { + int codePoint = 0; + bool inBMP = (c == 'u'); + int size = inBMP ? 4 : 8; + ++start; + while(size > 0 && start < end) + { + c = s[start++]; + int charVal = 0; + if(c >= '0' && c <= '9') + { + charVal = c - '0'; + } + else if(c >= 'a' && c <= 'f') + { + charVal = 10 + (c - 'a'); + } + else if(c >= 'A' && c <= 'F') + { + charVal = 10 + (c - 'A'); + } + else + { + break; // while + } + codePoint = codePoint * 16 + charVal; + --size; + } + if(size > 0) + { + throw new System.ArgumentException("Invalid universal character name: too few hex digits"); + } + if(codePoint >= 0xD800 && codePoint <= 0xDFFF) + { + throw new System.ArgumentException("A universal character name cannot designate a surrogate"); + } + if(inBMP || codePoint <= 0xFFFF) + { + result.Append((char)codePoint); + } + else + { + result.Append(char.ConvertFromUtf32(codePoint)); + } + break; + } + case '0': case '1': case '2': @@ -290,70 +437,141 @@ namespace IceUtilInternal case '5': case '6': case '7': + case 'x': { - int val = 0; - for(int j = 0; j < 3 && start < end; ++j) + // UTF-8 byte sequence encoded with octal escapes + + byte[] arr = new byte[end - start]; + int i = 0; + bool more = true; + while(more) { - int charVal = s[start++] - '0'; - if(charVal < 0 || charVal > 7) + int val = 0; + if(c == 'x') { - --start; - break; + int size = 2; + ++start; + while(size > 0 && start < end) + { + c = s[start++]; + int charVal = 0; + if(c >= '0' && c <= '9') + { + charVal = c - '0'; + } + else if(c >= 'a' && c <= 'f') + { + charVal = 10 + (c - 'a'); + } + else if(c >= 'A' && c <= 'F') + { + charVal = 10 + (c - 'A'); + } + else + { + --start; // move back + break; // while + } + val = val * 16 + charVal; + --size; + } + if(size == 2) + { + throw new System.ArgumentException("Invalid \\x escape sequence: no hex digit"); + } + } + else + { + for(int j = 0; j < 3 && start < end; ++j) + { + int charVal = s[start++] - '0'; + if(charVal < 0 || charVal > 7) + { + --start; // move back + Debug.Assert(j != 0); // must be at least one digit + break; // for + } + val = val * 8 + charVal; + } + if(val > 255) + { + string msg = "octal value \\" + System.Convert.ToString(val, 8) + " (" + val + ") is out of range"; + throw new System.ArgumentException(msg); + } + } + + arr[i++] = (byte)val; + + more = false; + + if((start + 1 < end) && s[start] == '\\') + { + c = s[start + 1]; + if(c == 'x' || (c >= '0' && c <= '9')) + { + start++; + more = true; + } } - val = val * 8 + charVal; - } - if(val > 255) - { - string msg = "octal value \\" + System.Convert.ToString(val, 8) + " (" + val + - ") is out of range"; - throw new System.ArgumentException(msg, "s"); } - c = System.Convert.ToChar(val); + + result.Append(utf8Encoding.GetString(arr, 0, i)); // May raise ArgumentException. break; } default: { - c = checkChar(s, start++); + if(string.IsNullOrEmpty(special) || special.IndexOf(c) == -1) + { + result.Append('\\'); // not in special, so we keep the backslash + } + result.Append(checkChar(s, start++)); break; } } } - nextStart = start; - return c; - } - - // - // Remove escape sequences from s and append the result to sb. - // Return true if successful, false otherwise. - // - private static void decodeString(string s, int start, int end, StringBuilder sb) - { - while(start < end) - { - sb.Append(decodeChar(s, start, end, out start)); - } + return start; } // // Remove escape sequences added by escapeString. Throws System.ArgumentException // for an invalid input string. // - public static string unescapeString(string s, int start, int end) + public static string unescapeString(string s, int start, int end, string special) { Debug.Assert(start >= 0 && start <= end && end <= s.Length); - StringBuilder sb = new StringBuilder(); - decodeString(s, start, end, sb); - string decodedString = sb.ToString(); - - byte[] arr = new byte[decodedString.Length]; - for(int i = 0; i < arr.Length; ++i) + if(special != null) { - arr[i] = (byte)decodedString[i]; + for(int i = 0; i < special.Length; ++i) + { + if(special[i] < 32 || special[i] > 126) + { + throw new System.ArgumentException("special characters must be in ASCII range 32-126", + "special"); + } + } } - UTF8Encoding utf8 = new UTF8Encoding(false, true); - return utf8.GetString(arr, 0, arr.Length); // May raise ArgumentException. + // Optimization for strings without escapes + if(start == end || s.IndexOf('\\', start, end - start) == -1) + { + int p = start; + while(p < end) + { + checkChar(s, p++); + } + return s.Substring(start, end - start); + } + else + { + StringBuilder sb = new StringBuilder(end - start); + UTF8Encoding utf8Encoding = new UTF8Encoding(false, true); + while(start < end) + { + start = decodeChar(s, start, end, special, sb, utf8Encoding); + } + return sb.ToString(); + } } // @@ -402,7 +620,7 @@ namespace IceUtilInternal continue; } } - + if(pos < str.Length) { arr[n++] = str[pos++]; @@ -425,7 +643,7 @@ namespace IceUtilInternal { return checkQuote(s, 0); } - + // // If a single or double quotation mark is found at the start position, // then the position of the matching closing quote is returned. If no @@ -502,14 +720,13 @@ namespace IceUtilInternal return true; } - private class OrdinalStringComparerImpl : System.Collections.Generic.IComparer<string> + private class OrdinalStringComparerImpl : IComparer<string> { public int Compare(string l, string r) { return string.CompareOrdinal(l, r); - } + } } - public static System.Collections.Generic.IComparer<string> OrdinalStringComparer = - new OrdinalStringComparerImpl(); + public static IComparer<string> OrdinalStringComparer = new OrdinalStringComparerImpl(); } } |