diff options
author | Jose <jose@zeroc.com> | 2016-03-08 13:46:55 +0100 |
---|---|---|
committer | Jose <jose@zeroc.com> | 2016-03-08 13:46:55 +0100 |
commit | 2bd402833bfdb54c1940dd0038be8af05d6f5e6f (patch) | |
tree | eb7be3853dc45452397b730e586434f6e859efb3 /cpp/src/Slice/Scanner.l | |
parent | Windows fixes for icegriddb/icestormdb (diff) | |
download | ice-2bd402833bfdb54c1940dd0038be8af05d6f5e6f.tar.bz2 ice-2bd402833bfdb54c1940dd0038be8af05d6f5e6f.tar.xz ice-2bd402833bfdb54c1940dd0038be8af05d6f5e6f.zip |
ICE-6991 - Add support for unicode escape sequences
Diffstat (limited to 'cpp/src/Slice/Scanner.l')
-rw-r--r-- | cpp/src/Slice/Scanner.l | 410 |
1 files changed, 264 insertions, 146 deletions
diff --git a/cpp/src/Slice/Scanner.l b/cpp/src/Slice/Scanner.l index a5f8d439a2d..9054c1e90ed 100644 --- a/cpp/src/Slice/Scanner.l +++ b/cpp/src/Slice/Scanner.l @@ -13,6 +13,8 @@ #include <Slice/Grammar.h> #include <IceUtil/InputUtil.h> +#include <iomanip> + #include <stdlib.h> #include <math.h> @@ -239,156 +241,272 @@ floating_literal (({fractional_constant}{exponent_part}?)|((\+|-)?[[:digit:]]+{e str->literal = "\""; while(true) { - char c = static_cast<char>(yyinput()); + char c = static_cast<char>(yyinput()); str->literal += c; - if(c == '"') - { - break; - } - else if(c == EOF) - { - unit->error("EOF in string"); - break; - } - else if(c == '\n') - { - unit->error("newline in string"); - } - else if(c == '\\') - { - char next = static_cast<char>(yyinput()); + if(c == '"') + { + break; + } + else if(c == EOF) + { + unit->error("EOF in string"); + break; + } + else if(c == '\n') + { + unit->error("newline in string"); + } + else if(c == '\\') + { + char next = static_cast<char>(yyinput()); str->literal += next; - switch(next) - { - case '\\': - case '"': - case '\'': - { - str->v += next; - break; - } - - case 'n': - { - str->v += '\n'; - break; - } - - case 'r': - { - str->v += '\r'; - break; - } - - case 't': - { - str->v += '\t'; - break; - } - - case 'v': - { - str->v += '\v'; - break; - } - - case 'f': - { - str->v += '\f'; - break; - } - - case 'a': - { - str->v += '\a'; - break; - } - - case 'b': - { - str->v += '\b'; - break; - } - - case '?': - { - str->v += '\?'; - break; - } - - case '0': - case '1': - case '2': - case '3': - { - static string octalDigits = "01234567"; - unsigned short us = next - '0'; - if(octalDigits.find_first_of(next = static_cast<char>(yyinput())) != string::npos) - { + switch(next) + { + case '\\': + { + str->v += '\\'; + str->v += '\\'; + break; + } + case '"': + case '\'': + { + str->v += next; + break; + } + + case 'n': + { + str->v += '\n'; + break; + } + + case 'r': + { + str->v += '\r'; + break; + } + + case 't': + { + str->v += '\t'; + break; + } + + case 'v': + { + str->v += '\v'; + break; + } + + case 'f': + { + str->v += '\f'; + break; + } + + case 'a': + { + str->v += '\a'; + break; + } + + case 'b': + { + str->v += '\b'; + break; + } + + case '?': + { + str->v += '\?'; + break; + } + + // + // Octal value \nnn limited to three octal digits but terminate at the first character + // that is not a valid octal digit if encountered sooner. + // + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '7': + { + static string octalDigits = "01234567"; + IceUtil::Int64 value = 0; + string escape; + escape += next; + for(int i = 0; i < 2; ++i) + { + next = static_cast<char>(yyinput()); + if(octalDigits.find_first_of(next) == string::npos) + { + unput(next); + break; + } + escape += next; + } + str->literal += escape; + value = IceUtilInternal::strToInt64(escape.c_str(), 0, 8); + + if(value == 0) + { + unit->error("illegal NUL character in string constant"); + } + else if(value > 255) + { + ostringstream os; + os << "octal escape sequence out of range: '\\" << oct << value << "'"; + unit->warning(os.str()); + } + str->v += static_cast<char>(value); + break; + } + case 'x': + { + IceUtil::Int64 value = 0; + string escape = ""; + while(isxdigit(static_cast<unsigned char>(next = static_cast<char>(yyinput())))) + { + escape += next; + } + unput(next); + + str->literal += escape; + value = IceUtilInternal::strToInt64(escape.c_str(), 0, 16); + + if(value == 0) + { + unit->error("illegal NUL character in string constant"); + } + else if(value > 255) + { + ostringstream os; + os << "hex escape sequence out of range: '\\x" << hex << value << "'"; + unit->warning(os.str()); + } + str->v += static_cast<char>(value); + break; + } + + // + // Universal character name \unnnn code point U+nnnn + // + case 'u': + { + IceUtil::Int64 value = 0; + string escape = ""; + + for(int i = 0; i < 4; ++i) + { + next = static_cast<char>(yyinput()); str->literal += next; - us = us * 8 + next - '0'; - if(octalDigits.find_first_of(next = static_cast<char>(yyinput())) != string::npos) - { - us = us * 8 + next - '0'; - } - else - { - unput(next); - } - } - else - { - unput(next); - } - if(us == 0) - { - unit->error("illegal NUL character in string constant"); - } - str->v += static_cast<char>(us); - break; - } - case 'x': - { - IceUtil::Int64 ull = 0; - while(isxdigit(static_cast<unsigned char>(next = static_cast<char>(yyinput())))) - { + if(!isxdigit(static_cast<unsigned char>(next))) + { + unit->error("unknown escape sequence in string constant: " + str->literal); + break; + } + escape += next; + } + + value = escape.size() == 4 ? IceUtilInternal::strToInt64(escape.c_str(), 0, 16) : -1; + + ostringstream os; + os << '\\' << 'u'; + os.fill('0'); + os.width(4); + os << hex << value; + + if(value == 0) + { + unit->error("illegal NUL character in string constant"); + } + + + // + // Determine if a character is a surrogate: + // + // * High surrogate code point, ranging from 0xd800 to 0xdbff, inclusive + // * Low surrogate code point, ranging from 0xdc00 to 0xdfff, inclusive. + // + else if((value >= 0xd800 && value <= 0xdbff) || (value >= 0xdc00 && value <= 0xdfff)) + { + unit->error("unknown escape sequence in string constant: '" + os.str() + "'"); + } + + str->v += os.str(); + + break; + } + + case 'U': + { + IceUtil::Int64 value = 0; + string escape = ""; + + for(int i = 0; i < 8; ++i) + { + next = static_cast<char>(yyinput()); str->literal += next; - ull *= 16; - if(isdigit(static_cast<unsigned char>(next))) - { - ull += next - '0'; - } - else if(islower(static_cast<unsigned char>(next))) - { - ull += next - 'a' + 10; - } - else - { - ull += next - 'A' + 10; - } - } - unput(next); - if(ull == 0) - { - unit->error("illegal NUL character in string constant"); - } - str->v += static_cast<char>(ull); - break; - } - - // TODO: add universal character names - - default: - { - str->v += c; - unput(next); - } - } - } - else - { - str->v += c; - } + if(!isxdigit(static_cast<unsigned char>(next))) + { + + unit->error("unknown escape sequence in string constant: " + str->literal); + break; + } + escape += next; + } + + value = escape.size() == 8 ? IceUtilInternal::strToInt64(escape.c_str(), 0, 16) : -1; + + ostringstream os; + os << '\\' << 'U'; + os.fill('0'); + os.width(8); + os << hex << value; + + if(value == 0) + { + unit->error("illegal NUL character in string constant"); + } + + // + // Determine if a character is a surrogate: + // + // * High surrogate code point, ranging from 0xd800 to 0xdbff, inclusive + // * Low surrogate code point, ranging from 0xdc00 to 0xdfff, inclusive. + // + else if((value >= 0xd800 && value <= 0xdbff) || (value >= 0xdc00 && value <= 0xdfff)) + { + unit->error("unknown escape sequence in string constant: '" + os.str() + "'"); + } + + str->v += os.str(); + break; + } + + default: + { + ostringstream os; + os << "unknown escape sequence '\\" << next << "'"; + unit->warning(os.str()); + // + // We escape the backslack in a unknown escape sequence + // to keep compativility with 3.6" + // + str->v += '\\'; + str->v += c; + unput(next); + } + } + } + else + { + str->v += c; + } } *yylvalp = str; return ICE_STRING_LITERAL; |