diff options
Diffstat (limited to 'js/src/Ice/StringUtil.js')
-rw-r--r-- | js/src/Ice/StringUtil.js | 347 |
1 files changed, 224 insertions, 123 deletions
diff --git a/js/src/Ice/StringUtil.js b/js/src/Ice/StringUtil.js index 1673f4d745a..c23273e6ba8 100644 --- a/js/src/Ice/StringUtil.js +++ b/js/src/Ice/StringUtil.js @@ -7,7 +7,7 @@ // // ********************************************************************** -const Ice = require("../Ice/Debug").Ice; +const Ice = require("../Ice/Debug").Ice; const Debug = Ice.Debug; Ice.StringUtil = class @@ -49,11 +49,9 @@ Ice.StringUtil = class return -1; } // - // Add escape sequences (such as "\n", or "\007") to make a string - // readable in ASCII. Any characters that appear in special are - // prefixed with a backlash in the returned string. + // Add escape sequences (such as "\n", or "\123") to s // - static escapeString(s, special) + static escapeString(s, special, toStringMode) { special = special === undefined ? null : special; if(special !== null) @@ -68,26 +66,51 @@ Ice.StringUtil = class } let result = []; - for(let i = 0; i < s.length; ++i) + + if(toStringMode === Ice.ToStringMode.Compat) { - const c = s.charCodeAt(i); - if(c < 128) - { - encodeChar(c, result, special); - } - else if(c > 127 && c < 2048) + // Encode UTF-8 bytes + var bytes = unescape(encodeURIComponent(s)); + for(let i = 0; i < bytes.length; ++i) { - encodeChar((c >> 6) | 192, result, special); - encodeChar((c & 63) | 128, result, special); + const c = bytes.charCodeAt(i); + encodeChar(c, result, special, toStringMode); } - else + } + else + { + for(let i = 0; i < s.length; ++i) { - encodeChar((c >> 12) | 224, result, special); - encodeChar(((c >> 6) & 63) | 128, result, special); - encodeChar((c & 63) | 128, result, special); + const c = s.charCodeAt(i); + if(toStringMode === Ice.ToStringMode.Unicode || c < 0xD800 || c > 0xDFFF) + { + encodeChar(c, result, special, toStringMode); + } + else + { + Debug.assert(toStringMode === Ice.ToStringMode.ASCII && c >= 0xD800 && c <= 0xDFFF); + if(i + 1 === s.length) + { + throw new Error("High surrogate without low surrogate"); + } + else + { + const codePoint = s.codePointAt(i); + Debug.assert(codePoint > 0xFFFF); + i++; + + // append \Unnnnnnnn + result.push("\\U"); + const hex = codePoint.toString(16); + for(let j = hex.length; j < 8; j++) + { + result.push('0'); + } + result.push(hex); + } + } } } - return result.join(""); } // @@ -101,10 +124,26 @@ Ice.StringUtil = class Debug.assert(start >= 0 && start <= end && end <= s.length); - const arr = []; - decodeString(s, start, end, arr); - - return arr.join(""); + // Optimization for strings without escapes + let p = s.indexOf('\\', start); + if(p == -1 || p >= end) + { + p = start; + while(p < end) + { + checkChar(s, p++); + } + return s.substring(start, end); + } + else + { + const arr = []; + while(start < end) + { + start = decodeChar(s, start, end, arr); + } + return arr.join(""); + } } // // Split string helper; returns null for unmatched quotes @@ -217,15 +256,10 @@ Ice.StringUtil = class }; module.exports.Ice = Ice; -// -// Write the byte b as an escape sequence if it isn't a printable ASCII -// character and append the escape sequence to sb. Additional characters -// that should be escaped can be passed in special. If b is any of these -// characters, b is preceded by a backslash in sb. -// -function encodeChar(b, sb, special) + +function encodeChar(c, sb, special, toStringMode) { - switch(b) + switch(c) { case 92: // '\\' { @@ -269,44 +303,71 @@ function encodeChar(b, sb, special) } default: { - if(!(b >= 32 && b <= 126)) + var s = String.fromCharCode(c); + + if(special !== null && special.indexOf(s) !== -1) { sb.push('\\'); - const octal = b.toString(8); - // - // Add leading zeroes so that we avoid problems during - // decoding. For example, consider the encoded string - // \0013 (i.e., a character with value 1 followed by - // the character '3'). If the leading zeroes were omitted, - // the result would be incorrectly interpreted by the - // decoder as a single character with value 11. - // - for(let j = octal.length; j < 3; j++) - { - sb.push('0'); - } - sb.push(octal); + sb.push(s); } else { - const c = String.fromCharCode(b); - if(special !== null && special.indexOf(c) !== -1) + if(c < 32 || c > 126) { - sb.push('\\'); - sb.push(c); + if(toStringMode === Ice.ToStringMode.Compat) + { + // + // When ToStringMode=Compat, c is a UTF-8 byte + // + Debug.assert(c < 256); + sb.push('\\'); + const octal = c.toString(8); + // + // Add leading zeroes so that we avoid problems during + // decoding. For example, consider the encoded string + // \0013 (i.e., a character with value 1 followed by + // the character '3'). If the leading zeroes were omitted, + // the result would be incorrectly interpreted by the + // decoder as a single character with value 11. + // + for(let j = octal.length; j < 3; j++) + { + sb.push('0'); + } + sb.push(octal); + } + else if(c < 32 || c == 127 || toStringMode === Ice.ToStringMode.ASCII) + { + // append \\unnnn + sb.push("\\u"); + const hex = c.toString(16); + for(let j = hex.length; j < 4; j++) + { + sb.push('0'); + } + sb.push(hex); + } + else + { + // keep as is + sb.push(s); + } } else { - sb.push(c); + // printable ASCII character + sb.push(s); } } + break; } } } + function checkChar(s, pos) { - const n = s.charCodeAt(pos); - if(!(n >= 32 && n <= 126)) + const c = s.charCodeAt(pos); + if(c < 32 || c === 127) { let msg; if(pos > 0) @@ -317,32 +378,25 @@ function checkChar(s, pos) { msg = "first character"; } - msg += " is not a printable ASCII character (ordinal " + n + ")"; + msg += " has invalid ordinal value" + c; throw new Error(msg); } - return n; + return s.charAt(pos) } - // -// Decode the character or escape sequence starting at start and return it. -// nextStart is set to the index of the first character following the decoded -// character or escape sequence. +// Decode the character or escape sequence starting at start and appends it to result; +// returns the index of the first character following the decoded character +// or escape sequence. // -function decodeChar(s, start, end, nextStart) +function decodeChar(s, start, end, result) { Debug.assert(start >= 0); + Debug.assert(start < end); Debug.assert(end <= s.length); - if(start >= end) - { - throw new Error("EOF while decoding string"); - } - - let c; - if(s.charAt(start) != '\\') { - c = checkChar(s, start++); + result.push(checkChar(s, start++)); } else { @@ -350,43 +404,94 @@ function decodeChar(s, start, end, nextStart) { throw new Error("trailing backslash"); } - switch(s.charAt(++start)) + + const c = s.charAt(++start); + + switch(c) { case '\\': case '\'': case '"': { - c = s.charCodeAt(start++); + ++start; + result.push(c); break; } case 'b': { ++start; - c = "\b".charCodeAt(0); + result.push("\b"); break; } case 'f': { ++start; - c = "\f".charCodeAt(0); + result.push("\f"); break; } case 'n': { ++start; - c = "\n".charCodeAt(0); + result.push("\n"); break; } case 'r': { ++start; - c = "\r".charCodeAt(0); + result.push("\r") break; } case 't': { ++start; - c = "\t".charCodeAt(0); + result.push("\t") + break; + } + case 'u': + case 'U': + { + let codePoint = 0; + const inBMP = (c === 'u'); + let size = inBMP ? 4 : 8; + ++start; + while(size > 0 && start < end) + { + let charVal = s.charCodeAt(start++); + if(charVal >= 0x30 && charVal <= 0x39) + { + charVal -= 0x30; + } + else if(charVal >= 0x61 && charVal <= 0x66) + { + charVal += 10 - 0x61; + } + else if(charVal >= 0x41 && charVal <= 0x46) + { + charVal += 10 - 0x41; + } + else + { + break; // while + } + codePoint = codePoint * 16 + charVal; + --size; + } + if(size > 0) + { + throw new Error("Invalid universal character name: too few hex digits"); + } + if(inBMP && codePoint >= 0xD800 && codePoint <= 0xDFFF) + { + throw new Error("A non-BMP character cannot be encoded with \\unnnn, use \\Unnnnnnnn instead"); + } + if(inBMP || codePoint <= 0xFFFF) + { + result.push(String.fromCharCode(codePoint)); + } + else + { + result.push(String.fromCodePoint(codePoint)); + } break; } case '0': @@ -398,65 +503,61 @@ function decodeChar(s, start, end, nextStart) case '6': case '7': { - const octalChars = "01234567"; - let val = 0; - for(let j = 0; j < 3 && start < end; ++j) + // UTF-8 byte sequence encoded with octal escapes + + let arr = []; + let done = false; + while(!done) { - const ch = s.charAt(start++); - if(octalChars.indexOf(ch) == -1) + let val = 0; + for(let j = 0; j < 3 && start < end; ++j) { - --start; - break; + let charVal = s.charCodeAt(start++) - '0'.charCodeAt(0); + if(charVal < 0 || charVal > 7) + { + --start; + if(j === 0) + { + // first character after escape is not 0-7: + done = true; + --start; // go back to the previous backslash + } + break; // for + } + val = val * 8 + charVal; + } + + if(!done) + { + if(val > 255) + { + throw new Error("octal value \\" + val.toString(8) + " (" + val + ") is out of range"); + } + arr.push(String.fromCharCode(val)); + + if((start + 1 < end) && s.charAt(start) === '\\') + { + start++; + // loop, read next octal escape sequence + } + else + { + done = true; + } } - val = val * 8 + parseInt(ch); - } - if(val > 255) - { - throw new Error("octal value \\" + val.toString(8) + " (" + val + ") is out of range"); } - c = val; + + // Decode UTF-8 arr into string + result.push(decodeURIComponent(escape(arr.join("")))); break; } default: { - c = checkChar(s, start++); + result.push(checkChar(s, start++)); break; } } } - nextStart.value = start; - return c; -} -// -// Remove escape sequences from s and append the result to sb. -// Return true if successful, false otherwise. -// -function decodeString(s, start, end, arr) -{ - let nextStart = { 'value': 0 }; - while(start < end) - { - const c = decodeChar(s, start, end, nextStart); - start = nextStart.value; - - if(c < 128) - { - arr.push(String.fromCharCode(c)); - } - else if(c > 191 && c < 224) - { - const c2 = decodeChar(s, start, end, nextStart); - start = nextStart.value; - arr.push(String.fromCharCode(((c & 31) << 6) | (c2 & 63))); - } - else - { - const c2 = decodeChar(s, start, end, nextStart); - start = nextStart.value; - const c3 = decodeChar(s, start, end, nextStart); - start = nextStart.value; - arr.push(String.fromCharCode(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63))); - } - } + return start; } |