summaryrefslogtreecommitdiff
path: root/cpp/src/Slice/Scanner.l
diff options
context:
space:
mode:
authorJose <jose@zeroc.com>2016-03-08 13:46:55 +0100
committerJose <jose@zeroc.com>2016-03-08 13:46:55 +0100
commit2bd402833bfdb54c1940dd0038be8af05d6f5e6f (patch)
treeeb7be3853dc45452397b730e586434f6e859efb3 /cpp/src/Slice/Scanner.l
parentWindows fixes for icegriddb/icestormdb (diff)
downloadice-2bd402833bfdb54c1940dd0038be8af05d6f5e6f.tar.bz2
ice-2bd402833bfdb54c1940dd0038be8af05d6f5e6f.tar.xz
ice-2bd402833bfdb54c1940dd0038be8af05d6f5e6f.zip
ICE-6991 - Add support for unicode escape sequences
Diffstat (limited to 'cpp/src/Slice/Scanner.l')
-rw-r--r--cpp/src/Slice/Scanner.l410
1 files changed, 264 insertions, 146 deletions
diff --git a/cpp/src/Slice/Scanner.l b/cpp/src/Slice/Scanner.l
index a5f8d439a2d..9054c1e90ed 100644
--- a/cpp/src/Slice/Scanner.l
+++ b/cpp/src/Slice/Scanner.l
@@ -13,6 +13,8 @@
#include <Slice/Grammar.h>
#include <IceUtil/InputUtil.h>
+#include <iomanip>
+
#include <stdlib.h>
#include <math.h>
@@ -239,156 +241,272 @@ floating_literal (({fractional_constant}{exponent_part}?)|((\+|-)?[[:digit:]]+{e
str->literal = "\"";
while(true)
{
- char c = static_cast<char>(yyinput());
+ char c = static_cast<char>(yyinput());
str->literal += c;
- if(c == '"')
- {
- break;
- }
- else if(c == EOF)
- {
- unit->error("EOF in string");
- break;
- }
- else if(c == '\n')
- {
- unit->error("newline in string");
- }
- else if(c == '\\')
- {
- char next = static_cast<char>(yyinput());
+ if(c == '"')
+ {
+ break;
+ }
+ else if(c == EOF)
+ {
+ unit->error("EOF in string");
+ break;
+ }
+ else if(c == '\n')
+ {
+ unit->error("newline in string");
+ }
+ else if(c == '\\')
+ {
+ char next = static_cast<char>(yyinput());
str->literal += next;
- switch(next)
- {
- case '\\':
- case '"':
- case '\'':
- {
- str->v += next;
- break;
- }
-
- case 'n':
- {
- str->v += '\n';
- break;
- }
-
- case 'r':
- {
- str->v += '\r';
- break;
- }
-
- case 't':
- {
- str->v += '\t';
- break;
- }
-
- case 'v':
- {
- str->v += '\v';
- break;
- }
-
- case 'f':
- {
- str->v += '\f';
- break;
- }
-
- case 'a':
- {
- str->v += '\a';
- break;
- }
-
- case 'b':
- {
- str->v += '\b';
- break;
- }
-
- case '?':
- {
- str->v += '\?';
- break;
- }
-
- case '0':
- case '1':
- case '2':
- case '3':
- {
- static string octalDigits = "01234567";
- unsigned short us = next - '0';
- if(octalDigits.find_first_of(next = static_cast<char>(yyinput())) != string::npos)
- {
+ switch(next)
+ {
+ case '\\':
+ {
+ str->v += '\\';
+ str->v += '\\';
+ break;
+ }
+ case '"':
+ case '\'':
+ {
+ str->v += next;
+ break;
+ }
+
+ case 'n':
+ {
+ str->v += '\n';
+ break;
+ }
+
+ case 'r':
+ {
+ str->v += '\r';
+ break;
+ }
+
+ case 't':
+ {
+ str->v += '\t';
+ break;
+ }
+
+ case 'v':
+ {
+ str->v += '\v';
+ break;
+ }
+
+ case 'f':
+ {
+ str->v += '\f';
+ break;
+ }
+
+ case 'a':
+ {
+ str->v += '\a';
+ break;
+ }
+
+ case 'b':
+ {
+ str->v += '\b';
+ break;
+ }
+
+ case '?':
+ {
+ str->v += '\?';
+ break;
+ }
+
+ //
+ // Octal value \nnn limited to three octal digits but terminate at the first character
+ // that is not a valid octal digit if encountered sooner.
+ //
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ {
+ //
+ // Every octal digit must be listed as a case label above; a missing
+ // label sends that escape to the "unknown escape sequence" default branch.
+ // `next` holds the first digit; at most two more octal digits are consumed.
+ //
+ static string octalDigits = "01234567";
+ IceUtil::Int64 value = 0;
+ string escape;
+ escape += next;
+ for(int i = 0; i < 2; ++i)
+ {
+ next = static_cast<char>(yyinput());
+ if(octalDigits.find_first_of(next) == string::npos)
+ {
+ // Not part of the escape: push it back so the main loop sees it.
+ unput(next);
+ break;
+ }
+ escape += next;
+ }
+ // NOTE(review): the first digit was already appended to str->literal right
+ // after the backslash was read, so appending all of `escape` here looks like
+ // it records that digit twice - confirm against the users of `literal`.
+ str->literal += escape;
+ value = IceUtilInternal::strToInt64(escape.c_str(), 0, 8);
+
+ if(value == 0)
+ {
+ unit->error("illegal NUL character in string constant");
+ }
+ else if(value > 255)
+ {
+ // Three octal digits can encode up to 0777; only warn, the value is
+ // still narrowed to a single char below.
+ ostringstream os;
+ os << "octal escape sequence out of range: '\\" << oct << value << "'";
+ unit->warning(os.str());
+ }
+ str->v += static_cast<char>(value);
+ break;
+ }
+ case 'x':
+ {
+ IceUtil::Int64 value = 0;
+ string escape = "";
+ while(isxdigit(static_cast<unsigned char>(next = static_cast<char>(yyinput()))))
+ {
+ escape += next;
+ }
+ unput(next);
+
+ str->literal += escape;
+ value = IceUtilInternal::strToInt64(escape.c_str(), 0, 16);
+
+ if(value == 0)
+ {
+ unit->error("illegal NUL character in string constant");
+ }
+ else if(value > 255)
+ {
+ ostringstream os;
+ os << "hex escape sequence out of range: '\\x" << hex << value << "'";
+ unit->warning(os.str());
+ }
+ str->v += static_cast<char>(value);
+ break;
+ }
+
+ //
+ // Universal character name \unnnn code point U+nnnn
+ //
+ case 'u':
+ {
+ IceUtil::Int64 value = 0;
+ string escape = "";
+
+ for(int i = 0; i < 4; ++i)
+ {
+ next = static_cast<char>(yyinput());
str->literal += next;
- us = us * 8 + next - '0';
- if(octalDigits.find_first_of(next = static_cast<char>(yyinput())) != string::npos)
- {
- us = us * 8 + next - '0';
- }
- else
- {
- unput(next);
- }
- }
- else
- {
- unput(next);
- }
- if(us == 0)
- {
- unit->error("illegal NUL character in string constant");
- }
- str->v += static_cast<char>(us);
- break;
- }
- case 'x':
- {
- IceUtil::Int64 ull = 0;
- while(isxdigit(static_cast<unsigned char>(next = static_cast<char>(yyinput()))))
- {
+ if(!isxdigit(static_cast<unsigned char>(next)))
+ {
+ unit->error("unknown escape sequence in string constant: " + str->literal);
+ break;
+ }
+ escape += next;
+ }
+
+ value = escape.size() == 4 ? IceUtilInternal::strToInt64(escape.c_str(), 0, 16) : -1;
+
+ ostringstream os;
+ os << '\\' << 'u';
+ os.fill('0');
+ os.width(4);
+ os << hex << value;
+
+ if(value == 0)
+ {
+ unit->error("illegal NUL character in string constant");
+ }
+
+
+ //
+ // Determine if a character is a surrogate:
+ //
+ // * High surrogate code point, ranging from 0xd800 to 0xdbff, inclusive
+ // * Low surrogate code point, ranging from 0xdc00 to 0xdfff, inclusive.
+ //
+ else if((value >= 0xd800 && value <= 0xdbff) || (value >= 0xdc00 && value <= 0xdfff))
+ {
+ unit->error("unknown escape sequence in string constant: '" + os.str() + "'");
+ }
+
+ str->v += os.str();
+
+ break;
+ }
+
+ case 'U':
+ {
+ IceUtil::Int64 value = 0;
+ string escape = "";
+
+ for(int i = 0; i < 8; ++i)
+ {
+ next = static_cast<char>(yyinput());
str->literal += next;
- ull *= 16;
- if(isdigit(static_cast<unsigned char>(next)))
- {
- ull += next - '0';
- }
- else if(islower(static_cast<unsigned char>(next)))
- {
- ull += next - 'a' + 10;
- }
- else
- {
- ull += next - 'A' + 10;
- }
- }
- unput(next);
- if(ull == 0)
- {
- unit->error("illegal NUL character in string constant");
- }
- str->v += static_cast<char>(ull);
- break;
- }
-
- // TODO: add universal character names
-
- default:
- {
- str->v += c;
- unput(next);
- }
- }
- }
- else
- {
- str->v += c;
- }
+ if(!isxdigit(static_cast<unsigned char>(next)))
+ {
+
+ unit->error("unknown escape sequence in string constant: " + str->literal);
+ break;
+ }
+ escape += next;
+ }
+
+ value = escape.size() == 8 ? IceUtilInternal::strToInt64(escape.c_str(), 0, 16) : -1;
+
+ ostringstream os;
+ os << '\\' << 'U';
+ os.fill('0');
+ os.width(8);
+ os << hex << value;
+
+ if(value == 0)
+ {
+ unit->error("illegal NUL character in string constant");
+ }
+
+ //
+ // Determine if a character is a surrogate:
+ //
+ // * High surrogate code point, ranging from 0xd800 to 0xdbff, inclusive
+ // * Low surrogate code point, ranging from 0xdc00 to 0xdfff, inclusive.
+ //
+ else if((value >= 0xd800 && value <= 0xdbff) || (value >= 0xdc00 && value <= 0xdfff))
+ {
+ unit->error("unknown escape sequence in string constant: '" + os.str() + "'");
+ }
+
+ str->v += os.str();
+ break;
+ }
+
+ default:
+ {
+ ostringstream os;
+ os << "unknown escape sequence '\\" << next << "'";
+ unit->warning(os.str());
+ //
+ // We escape the backslash in an unknown escape sequence
+ // to keep compatibility with 3.6.
+ //
+ str->v += '\\';
+ str->v += c;
+ unput(next);
+ }
+ }
+ }
+ else
+ {
+ str->v += c;
+ }
}
*yylvalp = str;
return ICE_STRING_LITERAL;