summary refs log tree commit diff
path: root/js/src/Ice/StringUtil.js
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/Ice/StringUtil.js')
-rw-r--r-- js/src/Ice/StringUtil.js 347
1 files changed, 224 insertions, 123 deletions
diff --git a/js/src/Ice/StringUtil.js b/js/src/Ice/StringUtil.js
index 1673f4d745a..c23273e6ba8 100644
--- a/js/src/Ice/StringUtil.js
+++ b/js/src/Ice/StringUtil.js
@@ -7,7 +7,7 @@
//
// **********************************************************************
-const Ice = require("../Ice/Debug").Ice;
+const Ice = require("../Ice/Debug").Ice;
const Debug = Ice.Debug;
Ice.StringUtil = class
@@ -49,11 +49,9 @@ Ice.StringUtil = class
return -1;
}
//
- // Add escape sequences (such as "\n", or "\007") to make a string
- // readable in ASCII. Any characters that appear in special are
- // prefixed with a backlash in the returned string.
+ // Add escape sequences (such as "\n", or "\123") to s
//
- static escapeString(s, special)
+ static escapeString(s, special, toStringMode)
{
special = special === undefined ? null : special;
if(special !== null)
@@ -68,26 +66,51 @@ Ice.StringUtil = class
}
let result = [];
- for(let i = 0; i < s.length; ++i)
+
+ if(toStringMode === Ice.ToStringMode.Compat)
{
- const c = s.charCodeAt(i);
- if(c < 128)
- {
- encodeChar(c, result, special);
- }
- else if(c > 127 && c < 2048)
+ // Encode UTF-8 bytes
+ var bytes = unescape(encodeURIComponent(s));
+ for(let i = 0; i < bytes.length; ++i)
{
- encodeChar((c >> 6) | 192, result, special);
- encodeChar((c & 63) | 128, result, special);
+ const c = bytes.charCodeAt(i);
+ encodeChar(c, result, special, toStringMode);
}
- else
+ }
+ else
+ {
+ for(let i = 0; i < s.length; ++i)
{
- encodeChar((c >> 12) | 224, result, special);
- encodeChar(((c >> 6) & 63) | 128, result, special);
- encodeChar((c & 63) | 128, result, special);
+ const c = s.charCodeAt(i);
+ if(toStringMode === Ice.ToStringMode.Unicode || c < 0xD800 || c > 0xDFFF)
+ {
+ encodeChar(c, result, special, toStringMode);
+ }
+ else
+ {
+ Debug.assert(toStringMode === Ice.ToStringMode.ASCII && c >= 0xD800 && c <= 0xDFFF);
+ if(i + 1 === s.length)
+ {
+ throw new Error("High surrogate without low surrogate");
+ }
+ else
+ {
+ const codePoint = s.codePointAt(i);
+ Debug.assert(codePoint > 0xFFFF);
+ i++;
+
+ // append \Unnnnnnnn
+ result.push("\\U");
+ const hex = codePoint.toString(16);
+ for(let j = hex.length; j < 8; j++)
+ {
+ result.push('0');
+ }
+ result.push(hex);
+ }
+ }
}
}
-
return result.join("");
}
//
@@ -101,10 +124,26 @@ Ice.StringUtil = class
Debug.assert(start >= 0 && start <= end && end <= s.length);
- const arr = [];
- decodeString(s, start, end, arr);
-
- return arr.join("");
+ // Optimization for strings without escapes
+ let p = s.indexOf('\\', start);
+ if(p == -1 || p >= end)
+ {
+ p = start;
+ while(p < end)
+ {
+ checkChar(s, p++);
+ }
+ return s.substring(start, end);
+ }
+ else
+ {
+ const arr = [];
+ while(start < end)
+ {
+ start = decodeChar(s, start, end, arr);
+ }
+ return arr.join("");
+ }
}
//
// Split string helper; returns null for unmatched quotes
@@ -217,15 +256,10 @@ Ice.StringUtil = class
};
module.exports.Ice = Ice;
-//
-// Write the byte b as an escape sequence if it isn't a printable ASCII
-// character and append the escape sequence to sb. Additional characters
-// that should be escaped can be passed in special. If b is any of these
-// characters, b is preceded by a backslash in sb.
-//
-function encodeChar(b, sb, special)
+
+function encodeChar(c, sb, special, toStringMode)
{
- switch(b)
+ switch(c)
{
case 92: // '\\'
{
@@ -269,44 +303,71 @@ function encodeChar(b, sb, special)
}
default:
{
- if(!(b >= 32 && b <= 126))
+ var s = String.fromCharCode(c);
+
+ if(special !== null && special.indexOf(s) !== -1)
{
sb.push('\\');
- const octal = b.toString(8);
- //
- // Add leading zeroes so that we avoid problems during
- // decoding. For example, consider the encoded string
- // \0013 (i.e., a character with value 1 followed by
- // the character '3'). If the leading zeroes were omitted,
- // the result would be incorrectly interpreted by the
- // decoder as a single character with value 11.
- //
- for(let j = octal.length; j < 3; j++)
- {
- sb.push('0');
- }
- sb.push(octal);
+ sb.push(s);
}
else
{
- const c = String.fromCharCode(b);
- if(special !== null && special.indexOf(c) !== -1)
+ if(c < 32 || c > 126)
{
- sb.push('\\');
- sb.push(c);
+ if(toStringMode === Ice.ToStringMode.Compat)
+ {
+ //
+ // When ToStringMode=Compat, c is a UTF-8 byte
+ //
+ Debug.assert(c < 256);
+ sb.push('\\');
+ const octal = c.toString(8);
+ //
+ // Add leading zeroes so that we avoid problems during
+ // decoding. For example, consider the encoded string
+ // \0013 (i.e., a character with value 1 followed by
+ // the character '3'). If the leading zeroes were omitted,
+ // the result would be incorrectly interpreted by the
+ // decoder as a single character with value 11.
+ //
+ for(let j = octal.length; j < 3; j++)
+ {
+ sb.push('0');
+ }
+ sb.push(octal);
+ }
+ else if(c < 32 || c == 127 || toStringMode === Ice.ToStringMode.ASCII)
+ {
+ // append \unnnn
+ sb.push("\\u");
+ const hex = c.toString(16);
+ for(let j = hex.length; j < 4; j++)
+ {
+ sb.push('0');
+ }
+ sb.push(hex);
+ }
+ else
+ {
+ // keep as is
+ sb.push(s);
+ }
}
else
{
- sb.push(c);
+ // printable ASCII character
+ sb.push(s);
}
}
+ break;
}
}
}
+
function checkChar(s, pos)
{
- const n = s.charCodeAt(pos);
- if(!(n >= 32 && n <= 126))
+ const c = s.charCodeAt(pos);
+ if(c < 32 || c === 127)
{
let msg;
if(pos > 0)
@@ -317,32 +378,25 @@ function checkChar(s, pos)
{
msg = "first character";
}
- msg += " is not a printable ASCII character (ordinal " + n + ")";
+ msg += " has invalid ordinal value" + c;
throw new Error(msg);
}
- return n;
+ return s.charAt(pos)
}
-
//
-// Decode the character or escape sequence starting at start and return it.
-// nextStart is set to the index of the first character following the decoded
-// character or escape sequence.
+// Decodes the character or escape sequence starting at start and appends it to result;
+// returns the index of the first character following the decoded character
+// or escape sequence.
//
-function decodeChar(s, start, end, nextStart)
+function decodeChar(s, start, end, result)
{
Debug.assert(start >= 0);
+ Debug.assert(start < end);
Debug.assert(end <= s.length);
- if(start >= end)
- {
- throw new Error("EOF while decoding string");
- }
-
- let c;
-
if(s.charAt(start) != '\\')
{
- c = checkChar(s, start++);
+ result.push(checkChar(s, start++));
}
else
{
@@ -350,43 +404,94 @@ function decodeChar(s, start, end, nextStart)
{
throw new Error("trailing backslash");
}
- switch(s.charAt(++start))
+
+ const c = s.charAt(++start);
+
+ switch(c)
{
case '\\':
case '\'':
case '"':
{
- c = s.charCodeAt(start++);
+ ++start;
+ result.push(c);
break;
}
case 'b':
{
++start;
- c = "\b".charCodeAt(0);
+ result.push("\b");
break;
}
case 'f':
{
++start;
- c = "\f".charCodeAt(0);
+ result.push("\f");
break;
}
case 'n':
{
++start;
- c = "\n".charCodeAt(0);
+ result.push("\n");
break;
}
case 'r':
{
++start;
- c = "\r".charCodeAt(0);
+ result.push("\r")
break;
}
case 't':
{
++start;
- c = "\t".charCodeAt(0);
+ result.push("\t")
+ break;
+ }
+ case 'u':
+ case 'U':
+ {
+ let codePoint = 0;
+ const inBMP = (c === 'u');
+ let size = inBMP ? 4 : 8;
+ ++start;
+ while(size > 0 && start < end)
+ {
+ let charVal = s.charCodeAt(start++);
+ if(charVal >= 0x30 && charVal <= 0x39)
+ {
+ charVal -= 0x30;
+ }
+ else if(charVal >= 0x61 && charVal <= 0x66)
+ {
+ charVal += 10 - 0x61;
+ }
+ else if(charVal >= 0x41 && charVal <= 0x46)
+ {
+ charVal += 10 - 0x41;
+ }
+ else
+ {
+ break; // while
+ }
+ codePoint = codePoint * 16 + charVal;
+ --size;
+ }
+ if(size > 0)
+ {
+ throw new Error("Invalid universal character name: too few hex digits");
+ }
+ if(inBMP && codePoint >= 0xD800 && codePoint <= 0xDFFF)
+ {
+ throw new Error("A non-BMP character cannot be encoded with \\unnnn, use \\Unnnnnnnn instead");
+ }
+ if(inBMP || codePoint <= 0xFFFF)
+ {
+ result.push(String.fromCharCode(codePoint));
+ }
+ else
+ {
+ result.push(String.fromCodePoint(codePoint));
+ }
break;
}
case '0':
@@ -398,65 +503,61 @@ function decodeChar(s, start, end, nextStart)
case '6':
case '7':
{
- const octalChars = "01234567";
- let val = 0;
- for(let j = 0; j < 3 && start < end; ++j)
+ // UTF-8 byte sequence encoded with octal escapes
+
+ let arr = [];
+ let done = false;
+ while(!done)
{
- const ch = s.charAt(start++);
- if(octalChars.indexOf(ch) == -1)
+ let val = 0;
+ for(let j = 0; j < 3 && start < end; ++j)
{
- --start;
- break;
+ let charVal = s.charCodeAt(start++) - '0'.charCodeAt(0);
+ if(charVal < 0 || charVal > 7)
+ {
+ --start;
+ if(j === 0)
+ {
+ // first character after escape is not 0-7:
+ done = true;
+ --start; // go back to the previous backslash
+ }
+ break; // for
+ }
+ val = val * 8 + charVal;
+ }
+
+ if(!done)
+ {
+ if(val > 255)
+ {
+ throw new Error("octal value \\" + val.toString(8) + " (" + val + ") is out of range");
+ }
+ arr.push(String.fromCharCode(val));
+
+ if((start + 1 < end) && s.charAt(start) === '\\')
+ {
+ start++;
+ // loop, read next octal escape sequence
+ }
+ else
+ {
+ done = true;
+ }
}
- val = val * 8 + parseInt(ch);
- }
- if(val > 255)
- {
- throw new Error("octal value \\" + val.toString(8) + " (" + val + ") is out of range");
}
- c = val;
+
+ // Decode UTF-8 arr into string
+ result.push(decodeURIComponent(escape(arr.join(""))));
break;
}
default:
{
- c = checkChar(s, start++);
+ result.push(checkChar(s, start++));
break;
}
}
}
- nextStart.value = start;
- return c;
-}
-//
-// Remove escape sequences from s and append the result to sb.
-// Return true if successful, false otherwise.
-//
-function decodeString(s, start, end, arr)
-{
- let nextStart = { 'value': 0 };
- while(start < end)
- {
- const c = decodeChar(s, start, end, nextStart);
- start = nextStart.value;
-
- if(c < 128)
- {
- arr.push(String.fromCharCode(c));
- }
- else if(c > 191 && c < 224)
- {
- const c2 = decodeChar(s, start, end, nextStart);
- start = nextStart.value;
- arr.push(String.fromCharCode(((c & 31) << 6) | (c2 & 63)));
- }
- else
- {
- const c2 = decodeChar(s, start, end, nextStart);
- start = nextStart.value;
- const c3 = decodeChar(s, start, end, nextStart);
- start = nextStart.value;
- arr.push(String.fromCharCode(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)));
- }
- }
+ return start;
}