diff options
Diffstat (limited to 'java-compat/src/Ice/src/main/java/IceUtilInternal/StringUtil.java')
-rw-r--r-- | java-compat/src/Ice/src/main/java/IceUtilInternal/StringUtil.java | 362 |
1 files changed, 251 insertions, 111 deletions
diff --git a/java-compat/src/Ice/src/main/java/IceUtilInternal/StringUtil.java b/java-compat/src/Ice/src/main/java/IceUtilInternal/StringUtil.java index 953359c5c4b..5c274db2375 100644 --- a/java-compat/src/Ice/src/main/java/IceUtilInternal/StringUtil.java +++ b/java-compat/src/Ice/src/main/java/IceUtilInternal/StringUtil.java @@ -75,97 +75,119 @@ public final class StringUtil return -1; } - // - // Write the byte b as an escape sequence if it isn't a printable ASCII - // character and append the escape sequence to sb. Additional characters - // that should be escaped can be passed in special. If b is any of these - // characters, b is preceded by a backslash in sb. - // private static void - encodeChar(byte b, StringBuilder sb, String special) + encodeChar(char c, StringBuilder sb, String special, Ice.ToStringMode toStringMode) { - switch(b) + switch(c) { - case (byte)'\\': + case '\\': { sb.append("\\\\"); break; } - case (byte)'\'': + case '\'': { sb.append("\\'"); break; } - case (byte)'"': + case '"': { sb.append("\\\""); break; } - case (byte)'\b': + case '\b': { sb.append("\\b"); break; } - case (byte)'\f': + case '\f': { sb.append("\\f"); break; } - case (byte)'\n': + case '\n': { sb.append("\\n"); break; } - case (byte)'\r': + case '\r': { sb.append("\\r"); break; } - case (byte)'\t': + case '\t': { sb.append("\\t"); break; } default: { - if(!(b >= 32 && b <= 126)) - { - sb.append('\\'); - String octal = Integer.toOctalString(b < 0 ? b + 256 : b); - // - // Add leading zeroes so that we avoid problems during - // decoding. For example, consider the encoded string - // \0013 (i.e., a character with value 1 followed by - // the character '3'). If the leading zeroes were omitted, - // the result would be incorrectly interpreted by the - // decoder as a single character with value 11. - // - for(int j = octal.length(); j < 3; j++) - { - sb.append('0'); - } - sb.append(octal); - } - else if(special != null && special.indexOf((char)b) != -1) + if(special != null && special.indexOf(c) != -1) { sb.append('\\'); - sb.append((char)b); + sb.append(c); } else { - sb.append((char)b); + if(c < 32 || c > 126) + { + if(toStringMode == Ice.ToStringMode.Compat) + { + // + // When ToStringMode=Compat, c is a UTF-8 byte + // + assert(c < 256); + + sb.append('\\'); + String octal = Integer.toOctalString(c); + // + // Add leading zeroes so that we avoid problems during + // decoding. For example, consider the encoded string + // \0013 (i.e., a character with value 1 followed by + // the character '3'). If the leading zeroes were omitted, + // the result would be incorrectly interpreted by the + // decoder as a single character with value 11. + // + for(int j = octal.length(); j < 3; j++) + { + sb.append('0'); + } + sb.append(octal); + } + else if(c < 32 || c == 127 || toStringMode == Ice.ToStringMode.ASCII) + { + // append \\unnnn + sb.append("\\u"); + String hex = Integer.toHexString(c); + for(int j = hex.length(); j < 4; j++) + { + sb.append('0'); + } + sb.append(hex); + } + else + { + // keep as is + sb.append(c); + } + } + else + { + // printable ASCII character + sb.append(c); + } } + break; } } } // - // Add escape sequences (such as "\n", or "\007") to make a string - // readable in ASCII. Any characters that appear in special are - // prefixed with a backlash in the returned string. + // Add escape sequences (like "\n" to the input string) + // The second parameter adds characters to escape, and can be empty. // public static String - escapeString(String s, String special) + escapeString(String s, String special, Ice.ToStringMode toStringMode) { if(special != null) { @@ -178,31 +200,72 @@ public final class StringUtil } } - byte[] bytes = null; - try + if(toStringMode == Ice.ToStringMode.Compat) { - bytes = s.getBytes("UTF8"); + // Encode UTF-8 bytes + + byte[] bytes = null; + try + { + bytes = s.getBytes("UTF8"); + } + catch(java.io.UnsupportedEncodingException ex) + { + assert(false); + return null; + } + + StringBuilder result = new StringBuilder(bytes.length); + for(int i = 0; i < bytes.length; i++) + { + encodeChar((char)(bytes[i] & 0xFF), result, special, toStringMode); + } + + return result.toString(); } - catch(java.io.UnsupportedEncodingException ex) + else { - assert(false); - return null; - } + StringBuilder result = new StringBuilder(s.length()); - StringBuilder result = new StringBuilder(bytes.length); - for(int i = 0; i < bytes.length; i++) - { - encodeChar(bytes[i], result, special); - } + for(int i = 0; i < s.length(); i++) + { + char c = s.charAt(i); + if(toStringMode == Ice.ToStringMode.Unicode || !Character.isSurrogate(c)) + { + encodeChar(c, result, special, toStringMode); + } + else + { + assert(toStringMode == Ice.ToStringMode.ASCII && Character.isSurrogate(c)); + if(i + 1 == s.length()) + { + throw new IllegalArgumentException("High surrogate without low surrogate"); + } + else + { + i++; + int codePoint = Character.toCodePoint(c, s.charAt(i)); + // append \Unnnnnnnn + result.append("\\U"); + String hex = Integer.toHexString(codePoint); + for(int j = hex.length(); j < 8; j++) + { + result.append('0'); + } + result.append(hex); + } + } + } - return result.toString(); + return result.toString(); + } } private static char checkChar(String s, int pos) { char c = s.charAt(pos); - if(!(c >= 32 && c <= 126)) + if(c < 32 || c == 127) { String msg; if(pos > 0) @@ -213,28 +276,27 @@ public final class StringUtil { msg = "first character"; } - msg += " is not a printable ASCII character (ordinal " + (int)c + ")"; + msg += " has invalid ordinal value " + (int)c; throw new IllegalArgumentException(msg); } return c; } // - // Decode the character or escape sequence starting at start and return it. - // newStart is set to the index of the first character following the decoded character + // Decode the character or escape sequence starting at start and appends it to result; + // returns the index of the first character following the decoded character // or escape sequence. // - private static char decodeChar(String s, int start, int end, Ice.Holder<Integer> nextStart) + private static int + decodeChar(String s, int start, int end, StringBuilder result) { assert(start >= 0); assert(start < end); assert(end <= s.length()); - char c; - if(s.charAt(start) != '\\') { - c = checkChar(s, start++); + result.append(checkChar(s, start++)); } else { @@ -242,45 +304,98 @@ public final class StringUtil { throw new IllegalArgumentException("trailing backslash"); } - switch(s.charAt(++start)) + + char c = s.charAt(++start); + + switch(c) { case '\\': case '\'': case '"': { - c = s.charAt(start++); + ++start; + result.append(c); break; } case 'b': { ++start; - c = '\b'; + result.append('\b'); break; } case 'f': { ++start; - c = '\f'; + result.append('\f'); break; } case 'n': { ++start; - c = '\n'; + result.append('\n'); break; } case 'r': { ++start; - c = '\r'; + result.append('\r'); break; } case 't': { ++start; - c = '\t'; + result.append('\t'); break; } + case 'u': + case 'U': + { + int codePoint = 0; + boolean inBMP = (c == 'u'); + int size = inBMP ? 4 : 8; + ++start; + while(size > 0 && start < end) + { + c = s.charAt(start++); + int charVal = 0; + if(c >= '0' && c <= '9') + { + charVal = c - '0'; + } + else if(c >= 'a' && c <= 'f') + { + charVal = 10 + (c - 'a'); + } + else if(c >= 'A' && c <= 'F') + { + charVal = 10 + (c - 'A'); + } + else + { + break; // while + } + codePoint = codePoint * 16 + charVal; + --size; + } + if(size > 0) + { + throw new IllegalArgumentException("Invalid universal character name: too few hex digits"); + } + if(inBMP && Character.isSurrogate((char)codePoint)) + { + throw new IllegalArgumentException("A non-BMP character cannot be encoded with \\unnnn, use \\Unnnnnnnn instead"); + } + if(inBMP || Character.isBmpCodePoint(codePoint)) + { + result.append((char)codePoint); + } + else + { + result.append(Character.toChars(codePoint)); + } + break; + } + case '0': case '1': case '2': @@ -290,49 +405,71 @@ public final class StringUtil case '6': case '7': { - int val = 0; - for(int j = 0; j < 3 && start < end; ++j) + // UTF-8 byte sequence encoded with octal escapes + + byte[] arr = new byte[end - start]; + int i = 0; + boolean done = false; + while(!done) { - int charVal = s.charAt(start++) - '0'; - if(charVal < 0 || charVal > 7) + int val = 0; + for(int j = 0; j < 3 && start < end; ++j) { - --start; - break; + int charVal = s.charAt(start++) - '0'; + if(charVal < 0 || charVal > 7) + { + --start; + if(j == 0) + { + // first character after escape is not 0-7: + done = true; + --start; // go back to the previous backslash + } + break; // for + } + val = val * 8 + charVal; } - val = val * 8 + charVal; + + if(!done) + { + if(val > 255) + { + String msg = "octal value \\" + Integer.toOctalString(val) + " (" + val + ") is out of range"; + throw new IllegalArgumentException(msg); + } + arr[i++] = (byte)val; + + if((start + 1 < end) && s.charAt(start) == '\\') + { + start++; + // loop, read next octal escape sequence + } + else + { + done = true; + } + } + } + + try + { + result.append(new String(arr, 0, i, "UTF8")); } - if(val > 255) + catch(java.io.UnsupportedEncodingException ex) { - String msg = "octal value \\" + Integer.toOctalString(val) + " (" + val + ") is out of range"; - throw new IllegalArgumentException(msg); + throw new IllegalArgumentException("unsupported encoding", ex); } - c = (char)val; break; } default: { - c = checkChar(s, start++); + result.append(checkChar(s, start++)); break; } } } - nextStart.value = start; - return c; - } - // - // Remove escape sequences from s and append the result to sb. - // Return true if successful, false otherwise. - // - private static void - decodeString(String s, int start, int end, StringBuilder sb) - { - Ice.Holder<Integer> nextStart = new Ice.Holder<Integer>(); - while(start < end) - { - sb.append(decodeChar(s, start, end, nextStart)); - start = nextStart.value; - } + return start; } // @@ -344,26 +481,29 @@ public final class StringUtil { assert(start >= 0 && start <= end && end <= s.length()); - StringBuilder sb = new StringBuilder(end - start); - decodeString(s, start, end, sb); - String decodedString = sb.toString(); - - byte[] arr = new byte[decodedString.length()]; - for(int i = 0; i < arr.length; ++i) - { - arr[i] = (byte)decodedString.charAt(i); - } - - try + // Optimization for strings without escapes + int p = s.indexOf('\\', start); + if(p == -1 || p >= end) { - return new String(arr, 0, arr.length, "UTF8"); + p = start; + while(p < end) + { + checkChar(s, p++); + } + return s.substring(start, end); } - catch(java.io.UnsupportedEncodingException ex) + else { - throw new IllegalArgumentException("unsupported encoding", ex); + StringBuilder sb = new StringBuilder(end - start); + while(start < end) + { + start = decodeChar(s, start, end, sb); + } + return sb.toString(); } } + // // Join a list of strings using the given delimiter. // |