diff options
author | Jose <jose@zeroc.com> | 2016-03-08 22:09:37 +0100 |
---|---|---|
committer | Jose <jose@zeroc.com> | 2016-03-08 22:09:37 +0100 |
commit | cfa587bc0be11012b9ab4e8fd333e96080eb71e2 (patch) | |
tree | 31ecc86af1a9a84d09eb3c62cd302830d02a74ce /cpp/src | |
parent | minor fixes to icegriddb/icestormdb (diff) | |
parent | more icegriddb/icestormdb fixes (diff) | |
download | ice-cfa587bc0be11012b9ab4e8fd333e96080eb71e2.tar.bz2 ice-cfa587bc0be11012b9ab4e8fd333e96080eb71e2.tar.xz ice-cfa587bc0be11012b9ab4e8fd333e96080eb71e2.zip |
Merge remote-tracking branch 'origin/3.6'
Diffstat (limited to 'cpp/src')
-rw-r--r-- | cpp/src/IceGrid/.gitignore | 2 | ||||
-rw-r--r-- | cpp/src/IceStorm/.gitignore | 2 | ||||
-rw-r--r-- | cpp/src/IceUtil/Unicode.cpp | 38 | ||||
-rw-r--r-- | cpp/src/IceUtil/Unicode.h | 9 | ||||
-rw-r--r-- | cpp/src/Slice/PythonUtil.cpp | 177 | ||||
-rw-r--r-- | cpp/src/Slice/RubyUtil.cpp | 176 | ||||
-rw-r--r-- | cpp/src/Slice/Scanner.cpp | 231 | ||||
-rw-r--r-- | cpp/src/Slice/Scanner.l | 181 | ||||
-rw-r--r-- | cpp/src/slice2cpp/Gen.cpp | 88 | ||||
-rw-r--r-- | cpp/src/slice2cpp/Makefile | 2 | ||||
-rw-r--r-- | cpp/src/slice2cs/Gen.cpp | 153 | ||||
-rw-r--r-- | cpp/src/slice2cs/Makefile | 2 | ||||
-rw-r--r-- | cpp/src/slice2java/Gen.cpp | 192 | ||||
-rw-r--r-- | cpp/src/slice2java/Makefile | 2 | ||||
-rw-r--r-- | cpp/src/slice2js/Gen.cpp | 141 | ||||
-rw-r--r-- | cpp/src/slice2js/Makefile | 2 | ||||
-rw-r--r-- | cpp/src/slice2php/Main.cpp | 93 | ||||
-rw-r--r-- | cpp/src/slice2php/Makefile | 2 |
18 files changed, 1229 insertions, 264 deletions
diff --git a/cpp/src/IceGrid/.gitignore b/cpp/src/IceGrid/.gitignore index dd791e621d9..41d881d4e8a 100644 --- a/cpp/src/IceGrid/.gitignore +++ b/cpp/src/IceGrid/.gitignore @@ -1,6 +1,8 @@ // Generated by makegitignore.py // IMPORTANT: Do not edit this file -- any edits made here will be lost! +DBTypes.cpp +DBTypes.h IceLocatorDiscovery.cpp IceLocatorDiscovery.h Internal.cpp diff --git a/cpp/src/IceStorm/.gitignore b/cpp/src/IceStorm/.gitignore index 6ede462d77f..4fad7b4af9d 100644 --- a/cpp/src/IceStorm/.gitignore +++ b/cpp/src/IceStorm/.gitignore @@ -1,6 +1,8 @@ // Generated by makegitignore.py // IMPORTANT: Do not edit this file -- any edits made here will be lost! +DBTypes.h +DBTypes.cpp Instrumentation.cpp Election.cpp IceStormInternal.cpp diff --git a/cpp/src/IceUtil/Unicode.cpp b/cpp/src/IceUtil/Unicode.cpp index cae3476e277..7bad1d67c17 100644 --- a/cpp/src/IceUtil/Unicode.cpp +++ b/cpp/src/IceUtil/Unicode.cpp @@ -128,4 +128,42 @@ IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* s return result; } +ConversionResult +IceUtilInternal::convertUTF8ToUTF16(const vector<unsigned char>& source, vector<unsigned short>& target, ConversionFlags flags) +{ + target.resize(source.size()); + const unsigned char* sourceStart = &source[0]; + const unsigned char* sourceEnd = &source[0] + source.size(); + + unsigned short* targetStart = &target[0]; + unsigned short* targetEnd = &target[0] + target.size(); + ConversionResult result = ConvertUTF8toUTF16(&sourceStart, sourceEnd, &targetStart, targetEnd, flags); + + if(result == conversionOK) + { + target.resize(targetStart - &target[0]); + } + return result; +} + +ConversionResult +IceUtilInternal::convertUTF32ToUTF8(const vector<unsigned int>& source, vector<unsigned char>& target, ConversionFlags flags) +{ + target.resize(source.size() * 4); + + const unsigned int* sourceStart = &source[0]; + const unsigned int* sourceEnd = &source[0] + source.size(); + + unsigned char* targetStart = &target[0]; + unsigned char* targetEnd = &target[0] + target.size(); + ConversionResult result = ConvertUTF32toUTF8(&sourceStart, sourceEnd, &targetStart, targetEnd, flags); + + if(result == conversionOK) + { + target.resize(targetStart - &target[0]); + } + return result; +} + + diff --git a/cpp/src/IceUtil/Unicode.h b/cpp/src/IceUtil/Unicode.h index 00333ce8a44..2c96d6c6448 100644 --- a/cpp/src/IceUtil/Unicode.h +++ b/cpp/src/IceUtil/Unicode.h @@ -44,6 +44,15 @@ ConversionResult convertUTF8ToUTFWstring(const IceUtil::Byte*& sourceStart, const IceUtil::Byte* sourceEnd, std::wstring& target, IceUtil::ConversionFlags flags); + +ICE_UTIL_API ConversionResult +convertUTF8ToUTF16(const std::vector<unsigned char>&, std::vector<unsigned short>&, + IceUtil::ConversionFlags); + +ICE_UTIL_API ConversionResult +convertUTF32ToUTF8(const std::vector<unsigned int>&, std::vector<unsigned char>&, + IceUtil::ConversionFlags); + } #endif diff --git a/cpp/src/Slice/PythonUtil.cpp b/cpp/src/Slice/PythonUtil.cpp index a3abf0de03a..23b95898317 100644 --- a/cpp/src/Slice/PythonUtil.cpp +++ b/cpp/src/Slice/PythonUtil.cpp @@ -13,6 +13,7 @@ #include <IceUtil/IceUtil.h> #include <IceUtil/StringUtil.h> #include <IceUtil/InputUtil.h> +#include <IceUtil/Unicode.h> #include <climits> #include <iterator> @@ -1881,68 +1882,138 @@ Slice::Python::CodeVisitor::writeConstantValue(const TypePtr& type, const Syntax _out << "\""; // Opening " - for(string::const_iterator c = value.begin(); c != value.end(); ++c) + for(size_t i = 0; i < value.size();) { - switch(*c) + char c = value[i]; + switch(c) { - case '"': - { - _out << "\\\""; - break; - } - case '\\': - { - _out << "\\\\"; - break; - } - case '\r': - { - _out << "\\r"; - break; - } - case '\n': - { - _out << "\\n"; - break; - } - case '\t': - { - _out << "\\t"; - break; - } - case '\b': - { - _out << "\\b"; - break; - } - case '\f': - { - _out << "\\f"; - break; - } - default: - { - if(charSet.find(*c) == charSet.end()) + case '"': { - unsigned char uc = *c; // Char may be signed, so make it positive. - stringstream s; - s << "\\"; // Print as octal if not in basic source character set. - s.flags(ios_base::oct); - s.width(3); - s.fill('0'); - s << static_cast<unsigned>(uc); - _out << s.str(); + _out << "\\\""; + break; } - else + case '\\': { - _out << *c; // Print normally if in basic source character set. + string s = "\\"; + size_t j = i + 1; + for(; j < value.size(); ++j) + { + if(value[j] != '\\') + { + break; + } + s += "\\"; + } + + // + // An even number of slash \ will escape the backslash and + // the codepoint will be interpreted as its charaters + // + // \\u00000041 - ['\\', 'u', '0', '0', '0', '0', '0', '0', '4', '1'] + // \\\u00000041 - ['\\', 'A'] (41 is the codepoint for 'A') + // + if(s.size() % 2 != 0 && (value[j] == 'U' || value[j] == 'u')) + { + // + // Convert codepoint to UTF8 bytes and write the escaped bytes + // + _out << s.substr(0, s.size() - 1); + + size_t sz = value[j] == 'U' ? 8 : 4; + string codepoint = value.substr(j + 1, sz); + assert(codepoint.size() == sz); + + IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16); + + vector<unsigned int> u32buffer; + u32buffer.push_back(static_cast<unsigned int>(v)); + + vector<unsigned char> u8buffer; + IceUtilInternal::ConversionResult result = convertUTF32ToUTF8(u32buffer, u8buffer, IceUtil::lenientConversion); + switch(result) + { + case conversionOK: + break; + case sourceExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted"); + case sourceIllegal: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal"); + default: + { + assert(0); + throw IceUtil::IllegalConversionException(__FILE__, __LINE__); + } + } + + ostringstream s; + for(vector<unsigned char>::const_iterator q = u8buffer.begin(); q != u8buffer.end(); ++q) + { + s << "\\"; + s.fill('0'); + s.width(3); + s << oct; + s << static_cast<unsigned int>(*q); + } + _out << s.str(); + + i = j + 1 + sz; + } + else + { + _out << s; + i = j; + } + continue; + } + case '\r': + { + _out << "\\r"; + break; + } + case '\n': + { + _out << "\\n"; + break; + } + case '\t': + { + _out << "\\t"; + break; + } + case '\b': + { + _out << "\\b"; + break; + } + case '\f': + { + _out << "\\f"; + break; + } + default: + { + if(charSet.find(c) == charSet.end()) + { + unsigned char uc = c; // Char may be signed, so make it positive. + stringstream s; + s << "\\"; // Print as octal if not in basic source character set. + s.flags(ios_base::oct); + s.width(3); + s.fill('0'); + s << static_cast<unsigned>(uc); + _out << s.str(); + } + else + { + _out << c; // Print normally if in basic source character set. + } + break; } - break; - } } + ++i; } - _out << "\""; // Closing " + _out << "\""; // Closing " break; } case Slice::Builtin::KindValue: diff --git a/cpp/src/Slice/RubyUtil.cpp b/cpp/src/Slice/RubyUtil.cpp index 0c9cfd49bd5..2c066a8efbe 100644 --- a/cpp/src/Slice/RubyUtil.cpp +++ b/cpp/src/Slice/RubyUtil.cpp @@ -12,6 +12,7 @@ #include <Slice/Util.h> #include <IceUtil/Functional.h> #include <IceUtil/InputUtil.h> +#include <IceUtil/Unicode.h> #include <iterator> using namespace std; @@ -1472,68 +1473,137 @@ Slice::Ruby::CodeVisitor::writeConstantValue(const TypePtr& type, const SyntaxTr _out << "\""; // Opening " - for(string::const_iterator c = value.begin(); c != value.end(); ++c) + for(size_t i = 0; i < value.size();) { - switch(*c) + char c = value[i]; + switch(c) { - case '"': - { - _out << "\\\""; - break; - } - case '\\': - { - _out << "\\\\"; - break; - } - case '\r': - { - _out << "\\r"; - break; - } - case '\n': - { - _out << "\\n"; - break; - } - case '\t': - { - _out << "\\t"; - break; - } - case '\b': - { - _out << "\\b"; - break; - } - case '\f': - { - _out << "\\f"; - break; - } - default: - { - if(charSet.find(*c) == charSet.end()) + case '"': { - unsigned char uc = *c; // Char may be signed, so make it positive. - stringstream s; - s << "\\"; // Print as octal if not in basic source character set. - s.flags(ios_base::oct); - s.width(3); - s.fill('0'); - s << static_cast<unsigned>(uc); - _out << s.str(); + _out << "\\\""; + break; } - else + case '\\': { - _out << *c; // Print normally if in basic source character set. + string s = "\\"; + size_t j = i + 1; + for(; j < value.size(); ++j) + { + if(value[j] != '\\') + { + break; + } + s += "\\"; + } + + // + // An even number of slash \ will escape the backslash and + // the codepoint will be interpreted as its charaters + // + // \\u00000041 - ['\\', 'u', '0', '0', '0', '0', '0', '0', '4', '1'] + // \\\u00000041 - ['\\', 'A'] (41 is the codepoint for 'A') + // + if(s.size() % 2 != 0 && (value[j] == 'U' || value[j] == 'u')) + { + // + // Convert codepoint to UTF8 bytes and write the escaped bytes + // + _out << s.substr(0, s.size() - 1); + + size_t sz = value[j] == 'U' ? 8 : 4; + string codepoint = value.substr(j + 1, sz); + assert(codepoint.size() == sz); + IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16); + + vector<unsigned int> u32buffer; + u32buffer.push_back(static_cast<unsigned int>(v)); + + vector<unsigned char> u8buffer; + IceUtilInternal::ConversionResult result = convertUTF32ToUTF8(u32buffer, u8buffer, IceUtil::lenientConversion); + switch(result) + { + case conversionOK: + break; + case sourceExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted"); + case sourceIllegal: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal"); + default: + { + assert(0); + throw IceUtil::IllegalConversionException(__FILE__, __LINE__); + } + } + + ostringstream s; + for(vector<unsigned char>::const_iterator q = u8buffer.begin(); q != u8buffer.end(); ++q) + { + s << "\\"; + s.fill('0'); + s.width(3); + s << oct; + s << static_cast<unsigned int>(*q); + } + _out << s.str(); + + i = j + 1 + sz; + } + else + { + _out << s; + i = j; + } + continue; + } + case '\r': + { + _out << "\\r"; + break; + } + case '\n': + { + _out << "\\n"; + break; + } + case '\t': + { + _out << "\\t"; + break; + } + case '\b': + { + _out << "\\b"; + break; + } + case '\f': + { + _out << "\\f"; + break; + } + default: + { + if(charSet.find(c) == charSet.end()) + { + unsigned char uc = c; // Char may be signed, so make it positive. + stringstream s; + s << "\\"; // Print as octal if not in basic source character set. + s.flags(ios_base::oct); + s.width(3); + s.fill('0'); + s << static_cast<unsigned>(uc); + _out << s.str(); + } + else + { + _out << c; // Print normally if in basic source character set. + } + break; } - break; - } } + ++i; } - _out << "\""; // Closing " + _out << "\".force_encoding(\"utf-8\")"; // Closing " break; } diff --git a/cpp/src/Slice/Scanner.cpp b/cpp/src/Slice/Scanner.cpp index a49c6c0433a..27d362e04c9 100644 --- a/cpp/src/Slice/Scanner.cpp +++ b/cpp/src/Slice/Scanner.cpp @@ -568,6 +568,8 @@ char *slice_text; #include <Slice/Grammar.h> #include <IceUtil/InputUtil.h> +#include <iomanip> + #include <stdlib.h> #include <math.h> @@ -630,7 +632,7 @@ int checkKeyword(string&); -#line 633 "lex.yy.c" +#line 635 "lex.yy.c" #define INITIAL 0 #define BOMSCAN 1 @@ -849,10 +851,10 @@ YY_DECL } { -#line 92 "Scanner.l" +#line 94 "Scanner.l" -#line 855 "lex.yy.c" +#line 857 "lex.yy.c" while ( 1 ) /* loops until end-of-file is reached */ { @@ -911,7 +913,7 @@ case 1: (yy_c_buf_p) = yy_cp -= 1; YY_DO_BEFORE_ACTION; /* set up slice_text again */ YY_RULE_SETUP -#line 94 "Scanner.l" +#line 96 "Scanner.l" { if(unit->scanPosition(slice_text)) { @@ -926,7 +928,7 @@ YY_LINENO_REWIND_TO(yy_cp - 1); (yy_c_buf_p) = yy_cp -= 1; YY_DO_BEFORE_ACTION; /* set up slice_text again */ YY_RULE_SETUP -#line 101 "Scanner.l" +#line 103 "Scanner.l" { if(unit->scanPosition(slice_text)) { @@ -939,7 +941,7 @@ case 3: (yy_c_buf_p) = yy_cp -= 1; YY_DO_BEFORE_ACTION; /* set up slice_text again */ YY_RULE_SETUP -#line 108 "Scanner.l" +#line 110 "Scanner.l" { if(unit->scanPosition(slice_text)) { @@ -954,7 +956,7 @@ YY_LINENO_REWIND_TO(yy_cp - 1); (yy_c_buf_p) = yy_cp -= 1; YY_DO_BEFORE_ACTION; /* set up slice_text again */ YY_RULE_SETUP -#line 115 "Scanner.l" +#line 117 "Scanner.l" { if(unit->scanPosition(slice_text)) { @@ -964,7 +966,7 @@ YY_RULE_SETUP YY_BREAK case 5: YY_RULE_SETUP -#line 122 "Scanner.l" +#line 124 "Scanner.l" { // C++-style comment BEGIN(MAINSCAN); @@ -982,7 +984,7 @@ YY_RULE_SETUP YY_BREAK case 6: YY_RULE_SETUP -#line 137 "Scanner.l" +#line 139 "Scanner.l" { // C-style comment BEGIN(MAINSCAN); @@ -1026,7 +1028,7 @@ YY_RULE_SETUP YY_BREAK case 7: YY_RULE_SETUP -#line 178 "Scanner.l" +#line 180 "Scanner.l" { BEGIN(MAINSCAN); return ICE_SCOPE_DELIMITER; @@ -1034,7 +1036,7 @@ YY_RULE_SETUP YY_BREAK case 8: YY_RULE_SETUP -#line 183 "Scanner.l" +#line 185 "Scanner.l" { BEGIN(MAINSCAN); return ICE_METADATA_OPEN; @@ -1042,7 +1044,7 @@ YY_RULE_SETUP YY_BREAK case 9: YY_RULE_SETUP -#line 188 "Scanner.l" +#line 190 "Scanner.l" { BEGIN(MAINSCAN); return ICE_METADATA_CLOSE; @@ -1050,7 +1052,7 @@ YY_RULE_SETUP YY_BREAK case 10: YY_RULE_SETUP -#line 193 "Scanner.l" +#line 195 "Scanner.l" { BEGIN(MAINSCAN); return ICE_GLOBAL_METADATA_OPEN; @@ -1058,7 +1060,7 @@ YY_RULE_SETUP YY_BREAK case 11: YY_RULE_SETUP -#line 198 "Scanner.l" +#line 200 "Scanner.l" { BEGIN(MAINSCAN); return ICE_GLOBAL_METADATA_CLOSE; @@ -1067,7 +1069,7 @@ YY_RULE_SETUP case 12: /* rule 12 can match eol */ YY_RULE_SETUP -#line 203 "Scanner.l" +#line 205 "Scanner.l" { BEGIN(MAINSCAN); StringTokPtr ident = new StringTok; @@ -1095,7 +1097,7 @@ YY_RULE_SETUP YY_BREAK case 13: YY_RULE_SETUP -#line 228 "Scanner.l" +#line 230 "Scanner.l" { BEGIN(MAINSCAN); StringTokPtr ident = new StringTok; @@ -1106,7 +1108,7 @@ YY_RULE_SETUP YY_BREAK case 14: YY_RULE_SETUP -#line 236 "Scanner.l" +#line 238 "Scanner.l" { BEGIN(MAINSCAN); StringTokPtr str = new StringTok; @@ -1135,32 +1137,42 @@ YY_RULE_SETUP switch(next) { case '\\': + { + str->v += '\\'; + str->v += '\\'; + break; + } case '"': case '\'': { str->v += next; break; } + case 'n': { str->v += '\n'; break; } + case 'r': { str->v += '\r'; break; } + case 't': { str->v += '\t'; break; } + case 'v': { str->v += '\v'; break; } + case 'f': { str->v += '\f'; @@ -1185,68 +1197,181 @@ YY_RULE_SETUP break; } + // + // Octal value \nnn limited to three octal digits but terminate at the first character + // that is not a valid octal digit if encountered sooner. + // case '0': case '1': case '2': case '3': + case '4': + case '5': + case '7': { static string octalDigits = "01234567"; - unsigned short us = next - '0'; - if(octalDigits.find_first_of(next = static_cast<char>(yyinput())) != string::npos) + IceUtil::Int64 value = 0; + string escape; + escape += next; + for(int i = 0; i < 2; ++i) { - str->literal += next; - us = us * 8 + next - '0'; - if(octalDigits.find_first_of(next = static_cast<char>(yyinput())) != string::npos) - { - us = us * 8 + next - '0'; - } - else + next = static_cast<char>(yyinput()); + if(octalDigits.find_first_of(next) == string::npos) { unput(next); + break; } + escape += next; } - else + str->literal += escape; + value = IceUtilInternal::strToInt64(escape.c_str(), 0, 8); + + if(value == 0) { - unput(next); + unit->error("illegal NUL character in string constant"); } - if(us == 0) + else if(value > 255) { - unit->error("illegal NUL character in string constant"); + ostringstream os; + os << "octal escape sequence out of range: '\\" << oct << value << "'"; + unit->warning(os.str()); } - str->v += static_cast<char>(us); + str->v += static_cast<char>(value); break; } case 'x': { - IceUtil::Int64 ull = 0; + IceUtil::Int64 value = 0; + string escape = ""; while(isxdigit(static_cast<unsigned char>(next = static_cast<char>(yyinput())))) { + escape += next; + } + unput(next); + + str->literal += escape; + value = IceUtilInternal::strToInt64(escape.c_str(), 0, 16); + + if(value == 0) + { + unit->error("illegal NUL character in string constant"); + } + else if(value > 255) + { + ostringstream os; + os << "hex escape sequence out of range: '\\x" << hex << value << "'"; + unit->warning(os.str()); + } + str->v += static_cast<char>(value); + break; + } + + // + // Universal character name \unnnn code point U+nnnn + // + case 'u': + { + IceUtil::Int64 value = 0; + string escape = ""; + + for(int i = 0; i < 4; ++i) + { + next = static_cast<char>(yyinput()); str->literal += next; - ull *= 16; - if(isdigit(static_cast<unsigned char>(next))) + if(!isxdigit(static_cast<unsigned char>(next))) { - ull += next - '0'; + unit->error("unknown escape sequence in string constant: " + str->literal); + break; } - else if(islower(static_cast<unsigned char>(next))) - { - ull += next - 'a' + 10; - } - else + escape += next; + } + + value = escape.size() == 4 ? IceUtilInternal::strToInt64(escape.c_str(), 0, 16) : -1; + + ostringstream os; + os << '\\' << 'u'; + os.fill('0'); + os.width(4); + os << hex << value; + + if(value == 0) + { + unit->error("illegal NUL character in string constant"); + } + + + // + // Determine if a character is a surrogate: + // + // * High surrogate code point, ranging from 0xd800 to 0xdbff, inclusive + // * Low surrogate code point, ranging from 0xdc00 to 0xdfff, inclusive. + // + else if((value >= 0xd800 && value <= 0xdbff) || (value >= 0xdc00 && value <= 0xdfff)) + { + unit->error("unknown escape sequence in string constant: '" + os.str() + "'"); + } + + str->v += os.str(); + + break; + } + + case 'U': + { + IceUtil::Int64 value = 0; + string escape = ""; + + for(int i = 0; i < 8; ++i) + { + next = static_cast<char>(yyinput()); + str->literal += next; + if(!isxdigit(static_cast<unsigned char>(next))) { - ull += next - 'A' + 10; + + unit->error("unknown escape sequence in string constant: " + str->literal); + break; } + escape += next; } - unput(next); - if(ull == 0) + + value = escape.size() == 8 ? IceUtilInternal::strToInt64(escape.c_str(), 0, 16) : -1; + + ostringstream os; + os << '\\' << 'U'; + os.fill('0'); + os.width(8); + os << hex << value; + + if(value == 0) { unit->error("illegal NUL character in string constant"); } - str->v += static_cast<char>(ull); + + // + // Determine if a character is a surrogate: + // + // * High surrogate code point, ranging from 0xd800 to 0xdbff, inclusive + // * Low surrogate code point, ranging from 0xdc00 to 0xdfff, inclusive. + // + else if((value >= 0xd800 && value <= 0xdbff) || (value >= 0xdc00 && value <= 0xdfff)) + { + unit->error("unknown escape sequence in string constant: '" + os.str() + "'"); + } + + str->v += os.str(); break; } - // TODO: add universal character names + default: { + ostringstream os; + os << "unknown escape sequence '\\" << next << "'"; + unit->warning(os.str()); + // + // We escape the backslack in a unknown escape sequence + // to keep compativility with 3.6" + // + str->v += '\\'; str->v += c; unput(next); } @@ -1263,7 +1388,7 @@ YY_RULE_SETUP YY_BREAK case 15: YY_RULE_SETUP -#line 390 "Scanner.l" +#line 515 "Scanner.l" { BEGIN(MAINSCAN); IntegerTokPtr itp = new IntegerTok; @@ -1282,7 +1407,7 @@ YY_RULE_SETUP YY_BREAK case 16: YY_RULE_SETUP -#line 406 "Scanner.l" +#line 531 "Scanner.l" { BEGIN(MAINSCAN); errno = 0; @@ -1316,7 +1441,7 @@ YY_RULE_SETUP case 17: /* rule 17 can match eol */ YY_RULE_SETUP -#line 436 "Scanner.l" +#line 561 "Scanner.l" { // Ignore white-space @@ -1332,7 +1457,7 @@ YY_RULE_SETUP YY_BREAK case 18: YY_RULE_SETUP -#line 449 "Scanner.l" +#line 574 "Scanner.l" { // Ignore UTF-8 BOM, rule only active when parsing start of file. @@ -1341,7 +1466,7 @@ YY_RULE_SETUP YY_BREAK case 19: YY_RULE_SETUP -#line 455 "Scanner.l" +#line 580 "Scanner.l" { BEGIN(MAINSCAN); if(slice_text[0] < 32 || slice_text[0] > 126) @@ -1360,10 +1485,10 @@ YY_RULE_SETUP YY_BREAK case 20: YY_RULE_SETUP -#line 471 "Scanner.l" +#line 596 "Scanner.l" ECHO; YY_BREAK -#line 1366 "lex.yy.c" +#line 1491 "lex.yy.c" case YY_STATE_EOF(INITIAL): case YY_STATE_EOF(BOMSCAN): case YY_STATE_EOF(MAINSCAN): @@ -2364,7 +2489,7 @@ void slice_free (void * ptr ) #define YYTABLES_NAME "yytables" -#line 470 "Scanner.l" +#line 595 "Scanner.l" diff --git a/cpp/src/Slice/Scanner.l b/cpp/src/Slice/Scanner.l index 190c00bcf5c..a9c381b7260 100644 --- a/cpp/src/Slice/Scanner.l +++ b/cpp/src/Slice/Scanner.l @@ -13,6 +13,8 @@ #include <Slice/Grammar.h> #include <IceUtil/InputUtil.h> +#include <iomanip> + #include <stdlib.h> #include <math.h> @@ -261,32 +263,42 @@ floating_literal (({fractional_constant}{exponent_part}?)|((\+|-)?[[:digit:]] switch(next) { case '\\': + { + str->v += '\\'; + str->v += '\\'; + break; + } case '"': case '\'': { str->v += next; break; } + case 'n': { str->v += '\n'; break; } + case 'r': { str->v += '\r'; break; } + case 't': { str->v += '\t'; break; } + case 'v': { str->v += '\v'; break; } + case 'f': { str->v += '\f'; @@ -311,68 +323,181 @@ floating_literal (({fractional_constant}{exponent_part}?)|((\+|-)?[[:digit:]] break; } + // + // Octal value \nnn limited to three octal digits but terminate at the first character + // that is not a valid octal digit if encountered sooner. + // case '0': case '1': case '2': case '3': + case '4': + case '5': + case '7': { static string octalDigits = "01234567"; - unsigned short us = next - '0'; - if(octalDigits.find_first_of(next = static_cast<char>(yyinput())) != string::npos) + IceUtil::Int64 value = 0; + string escape; + escape += next; + for(int i = 0; i < 2; ++i) { - str->literal += next; - us = us * 8 + next - '0'; - if(octalDigits.find_first_of(next = static_cast<char>(yyinput())) != string::npos) - { - us = us * 8 + next - '0'; - } - else + next = static_cast<char>(yyinput()); + if(octalDigits.find_first_of(next) == string::npos) { unput(next); + break; } + escape += next; } - else + str->literal += escape; + value = IceUtilInternal::strToInt64(escape.c_str(), 0, 8); + + if(value == 0) { - unput(next); + unit->error("illegal NUL character in string constant"); } - if(us == 0) + else if(value > 255) { - unit->error("illegal NUL character in string constant"); + ostringstream os; + os << "octal escape sequence out of range: '\\" << oct << value << "'"; + unit->warning(os.str()); } - str->v += static_cast<char>(us); + str->v += static_cast<char>(value); break; } case 'x': { - IceUtil::Int64 ull = 0; + IceUtil::Int64 value = 0; + string escape = ""; while(isxdigit(static_cast<unsigned char>(next = static_cast<char>(yyinput())))) { + escape += next; + } + unput(next); + + str->literal += escape; + value = IceUtilInternal::strToInt64(escape.c_str(), 0, 16); + + if(value == 0) + { + unit->error("illegal NUL character in string constant"); + } + else if(value > 255) + { + ostringstream os; + os << "hex escape sequence out of range: '\\x" << hex << value << "'"; + unit->warning(os.str()); + } + str->v += static_cast<char>(value); + break; + } + + // + // Universal character name \unnnn code point U+nnnn + // + case 'u': + { + IceUtil::Int64 value = 0; + string escape = ""; + + for(int i = 0; i < 4; ++i) + { + next = static_cast<char>(yyinput()); str->literal += next; - ull *= 16; - if(isdigit(static_cast<unsigned char>(next))) - { - ull += next - '0'; - } - else if(islower(static_cast<unsigned char>(next))) + if(!isxdigit(static_cast<unsigned char>(next))) { - ull += next - 'a' + 10; + unit->error("unknown escape sequence in string constant: " + str->literal); + break; } - else + escape += next; + } + + value = escape.size() == 4 ? IceUtilInternal::strToInt64(escape.c_str(), 0, 16) : -1; + + ostringstream os; + os << '\\' << 'u'; + os.fill('0'); + os.width(4); + os << hex << value; + + if(value == 0) + { + unit->error("illegal NUL character in string constant"); + } + + + // + // Determine if a character is a surrogate: + // + // * High surrogate code point, ranging from 0xd800 to 0xdbff, inclusive + // * Low surrogate code point, ranging from 0xdc00 to 0xdfff, inclusive. + // + else if((value >= 0xd800 && value <= 0xdbff) || (value >= 0xdc00 && value <= 0xdfff)) + { + unit->error("unknown escape sequence in string constant: '" + os.str() + "'"); + } + + str->v += os.str(); + + break; + } + + case 'U': + { + IceUtil::Int64 value = 0; + string escape = ""; + + for(int i = 0; i < 8; ++i) + { + next = static_cast<char>(yyinput()); + str->literal += next; + if(!isxdigit(static_cast<unsigned char>(next))) { - ull += next - 'A' + 10; + + unit->error("unknown escape sequence in string constant: " + str->literal); + break; } + escape += next; } - unput(next); - if(ull == 0) + + value = escape.size() == 8 ? IceUtilInternal::strToInt64(escape.c_str(), 0, 16) : -1; + + ostringstream os; + os << '\\' << 'U'; + os.fill('0'); + os.width(8); + os << hex << value; + + if(value == 0) { unit->error("illegal NUL character in string constant"); } - str->v += static_cast<char>(ull); + + // + // Determine if a character is a surrogate: + // + // * High surrogate code point, ranging from 0xd800 to 0xdbff, inclusive + // * Low surrogate code point, ranging from 0xdc00 to 0xdfff, inclusive. + // + else if((value >= 0xd800 && value <= 0xdbff) || (value >= 0xdc00 && value <= 0xdfff)) + { + unit->error("unknown escape sequence in string constant: '" + os.str() + "'"); + } + + str->v += os.str(); break; } - // TODO: add universal character names + default: { + ostringstream os; + os << "unknown escape sequence '\\" << next << "'"; + unit->warning(os.str()); + // + // We escape the backslack in a unknown escape sequence + // to keep compativility with 3.6" + // + str->v += '\\'; str->v += c; unput(next); } diff --git a/cpp/src/slice2cpp/Gen.cpp b/cpp/src/slice2cpp/Gen.cpp index 98dc43e28aa..c90dc87088b 100644 --- a/cpp/src/slice2cpp/Gen.cpp +++ b/cpp/src/slice2cpp/Gen.cpp @@ -13,6 +13,8 @@ #include <Slice/CPlusPlusUtil.h> #include <IceUtil/Functional.h> #include <IceUtil/Iterator.h> +#include <IceUtil/InputUtil.h> +#include <IceUtil/Unicode.h> #include <Slice/Checksum.h> #include <Slice/FileTracker.h> @@ -126,11 +128,11 @@ writeConstantValue(IceUtilInternal::Output& out, const TypePtr& type, const Synt } out << "\""; // Opening " - for(string::const_iterator c = value.begin(); c != value.end(); ++c) + for(size_t i = 0; i < value.size();) { - if(charSet.find(*c) == charSet.end()) + if(charSet.find(value[i]) == charSet.end()) { - unsigned char uc = *c; // char may be signed, so make it positive + unsigned char uc = value[i]; // char may be signed, so make it positive ostringstream s; s << "\\"; // Print as octal if not in basic source character set s.width(3); @@ -141,17 +143,93 @@ writeConstantValue(IceUtilInternal::Output& out, const TypePtr& type, const Synt } else { - switch(*c) + switch(value[i]) { case '\\': + { + string s = "\\"; + size_t j = i + 1; + for(; j < value.size(); ++j) + { + if(value[j] != '\\') + { + break; + } + s += "\\"; + } + + // + // An even number of slash \ will escape the backslash and + // the codepoint will be interpreted as its charaters + // + // \\U00000041 - ['\\', 'U', '0', '0', '0', '0', '0', '0', '4', '1'] + // \\\U00000041 - ['\\', 'A'] (41 is the codepoint for 'A') + // + if(s.size() % 2 != 0 && (value[j] == 'U' || value[j] == 'u')) + { + // + // Convert codepoint to UTF8 bytes and write the escaped bytes + // + out << s.substr(0, s.size() - 1); + + size_t sz = value[j] == 'U' ? 8 : 4; + string codepoint = value.substr(j + 1, sz); + assert(codepoint.size() == sz); + + IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16); + + + vector<unsigned int> u32buffer; + u32buffer.push_back(static_cast<unsigned int>(v)); + + vector<unsigned char> u8buffer; + + IceUtilInternal::ConversionResult result = convertUTF32ToUTF8(u32buffer, u8buffer, IceUtil::lenientConversion); + switch(result) + { + case conversionOK: + break; + case sourceExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted"); + case sourceIllegal: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal"); + default: + { + assert(0); + throw IceUtil::IllegalConversionException(__FILE__, __LINE__); + } + } + + ostringstream s; + for(vector<unsigned char>::const_iterator q = u8buffer.begin(); q != u8buffer.end(); ++q) + { + s << "\\"; + s.fill('0'); + s.width(3); + s << oct; + s << static_cast<unsigned int>(*q); + } + out << s.str(); + + i = j + 1 + sz; + } + else + { + out << s; + i = j; + } + continue; + } case '"': { out << "\\"; break; } } - out << *c; // Print normally if in basic source character set + + out << value[i]; // Print normally if in basic source character set } + ++i; } out << "\""; // Closing " diff --git a/cpp/src/slice2cpp/Makefile b/cpp/src/slice2cpp/Makefile index 790ad0aecc3..3af754207f8 100644 --- a/cpp/src/slice2cpp/Makefile +++ b/cpp/src/slice2cpp/Makefile @@ -20,7 +20,7 @@ RPATH_DIR = $(LOADER_PATH)/../$(libsubdir) include $(top_srcdir)/config/Make.rules -CPPFLAGS := -I. $(CPPFLAGS) +CPPFLAGS := -I. -I.. $(CPPFLAGS) $(NAME): $(OBJS) rm -f $@ diff --git a/cpp/src/slice2cs/Gen.cpp b/cpp/src/slice2cs/Gen.cpp index 50076dc087c..85f10179740 100644 --- a/cpp/src/slice2cs/Gen.cpp +++ b/cpp/src/slice2cs/Gen.cpp @@ -10,7 +10,9 @@ #include <IceUtil/DisableWarnings.h> #include <IceUtil/Functional.h> #include <IceUtil/StringUtil.h> -#include "Gen.h" +#include <IceUtil/InputUtil.h> +#include <Gen.h> + #include <limits> #include <sys/stat.h> #ifndef _WIN32 @@ -20,6 +22,7 @@ #endif #include <IceUtil/Iterator.h> #include <IceUtil/UUID.h> +#include <IceUtil/Unicode.h> #include <Slice/Checksum.h> #include <Slice/DotNetNames.h> #include <Slice/FileTracker.h> @@ -35,6 +38,45 @@ namespace { string +u16CodePoint(unsigned short value) +{ + ostringstream s; + s << "\\u"; + s << hex; + s.width(4); + s.fill('0'); + s << value; + return s.str(); +} + + +void +writeU8Buffer(const vector<unsigned char>& u8buffer, ::IceUtilInternal::Output& out) +{ + vector<unsigned short> u16buffer; + IceUtilInternal::ConversionResult result = convertUTF8ToUTF16(u8buffer, u16buffer, IceUtil::lenientConversion); + switch(result) + { + case conversionOK: + break; + case sourceExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted"); + case sourceIllegal: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal"); + default: + { + assert(0); + throw IceUtil::IllegalConversionException(__FILE__, __LINE__); + } + } + + for(vector<unsigned short>::const_iterator c = u16buffer.begin(); c != u16buffer.end(); ++c) + { + out << u16CodePoint(*c); + } +} + +string sliceModeToIceMode(Operation::Mode opMode) { string mode; @@ -1544,39 +1586,118 @@ Slice::CsVisitor::writeConstantValue(const TypePtr& type, const SyntaxTreeBasePt // here because they are sensitive to the current locale. // static const string basicSourceChars = "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "0123456789" - "_{}[]#()<>%:;.?*+-/^&|~!=,\\\"' "; + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789" + "_{}[]#()<>%:;.?*+-/^&|~!=,\\\"' "; + static const set<char> charSet(basicSourceChars.begin(), basicSourceChars.end()); _out << "\""; // Opening " - for(string::const_iterator c = value.begin(); c != value.end(); ++c) + vector<unsigned char> u8buffer; // Buffer to convert multibyte characters + + for(size_t i = 0; i < value.size();) { - if(charSet.find(*c) == charSet.end()) + if(charSet.find(value[i]) == charSet.end()) { - unsigned char uc = *c; // char may be signed, so make it positive - ostringstream s; - s << "\\u"; // Print as unicode if not in basic source character set - s << hex; - s.width(4); - s.fill('0'); - s << static_cast<unsigned>(uc); - _out << s.str(); + if(static_cast<unsigned char>(value[i]) < 128) // Single byte character + { + // + // Print as unicode if not in basic source character set + // + _out << u16CodePoint(static_cast<unsigned int>(value[i])); + } + else + { + u8buffer.push_back(value[i]); + } } else { - switch(*c) + // + // Write any pedding characters in the utf8 buffer + // + if(!u8buffer.empty()) + { + writeU8Buffer(u8buffer, _out); + u8buffer.clear(); + } + switch(value[i]) { case '\\': + { + string s = "\\"; + size_t j = i + 1; + for(; j < value.size(); ++j) + { + if(value[j] != '\\') + { + break; + } + s += "\\"; + } + + // + // An even number of slash \ will escape the backslash and + // the codepoint will be interpreted as its charaters + // + // \\U00000041 - ['\\', 'U', '0', '0', '0', '0', '0', '0', '4', '1'] + // \\\U00000041 - ['\\', 'A'] (41 is the codepoint for 'A') + // + if(s.size() % 2 != 0 && value[j] == 'U') + { + _out << s.substr(0, s.size() - 1); + i = j + 1; + + string codepoint = value.substr(j + 1, 8); + assert(codepoint.size() == 8); + + IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16); + + + // + // Unicode character in the range U+10000 to U+10FFFF is not permitted in a character literal + // and is represented using a Unicode surrogate pair. + // + if(v > 0xFFFF) + { + unsigned int high = ((static_cast<unsigned int>(v) - 0x10000) / 0x400) + 0xD800; + unsigned int low = ((static_cast<unsigned int>(v) - 0x10000) % 0x400) + 0xDC00; + _out << u16CodePoint(high); + _out << u16CodePoint(low); + } + else + { + _out << "\\U" << codepoint; + } + + i = j + 1 + 8; + } + else + { + _out << s; + i = j; + } + continue; + } case '"': { _out << "\\"; break; } } - _out << *c; // Print normally if in basic source character set + _out << value[i]; // Print normally if in basic source character set } + i++; + } + + // + // Write any pedding characters in the utf8 buffer + // + if(!u8buffer.empty()) + { + writeU8Buffer(u8buffer, _out); + u8buffer.clear(); } _out << "\""; // Closing " diff --git a/cpp/src/slice2cs/Makefile b/cpp/src/slice2cs/Makefile index e51e24c0445..e46c1005dd6 100644 --- a/cpp/src/slice2cs/Makefile +++ b/cpp/src/slice2cs/Makefile @@ -20,7 +20,7 @@ RPATH_DIR = $(LOADER_PATH)/../$(libsubdir) include $(top_srcdir)/config/Make.rules -CPPFLAGS := -I. $(CPPFLAGS) +CPPFLAGS := -I. -I.. $(CPPFLAGS) $(NAME): $(OBJS) rm -f $@ diff --git a/cpp/src/slice2java/Gen.cpp b/cpp/src/slice2java/Gen.cpp index 4c5cf6f2477..038568cf64c 100644 --- a/cpp/src/slice2java/Gen.cpp +++ b/cpp/src/slice2java/Gen.cpp @@ -14,6 +14,7 @@ #include <IceUtil/Iterator.h> #include <IceUtil/StringUtil.h> #include <IceUtil/InputUtil.h> +#include <IceUtil/Unicode.h> #include <cstring> #include <limits> @@ -23,6 +24,44 @@ using namespace Slice; using namespace IceUtil; using namespace IceUtilInternal; +string +u16CodePoint(unsigned short value) +{ + ostringstream s; + s << "\\u"; + s << hex; + s.width(4); + s.fill('0'); + s << value; + return s.str(); +} + +void +writeU8Buffer(const vector<unsigned char>& u8buffer, ::IceUtilInternal::Output& out) +{ + vector<unsigned short> u16buffer; + IceUtilInternal::ConversionResult result = convertUTF8ToUTF16(u8buffer, u16buffer, IceUtil::lenientConversion); + switch(result) + { + case conversionOK: + break; + case sourceExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted"); + case sourceIllegal: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal"); + default: + { + assert(0); + throw IceUtil::IllegalConversionException(__FILE__, __LINE__); + } + } + + for(vector<unsigned short>::const_iterator c = u16buffer.begin(); c != u16buffer.end(); ++c) + { + out << u16CodePoint(*c); + } +} + static string sliceModeToIceMode(Operation::Mode opMode) { @@ -1729,53 +1768,152 @@ Slice::JavaVisitor::writeConstantValue(Output& out, const TypePtr& type, const S static const set<char> charSet(basicSourceChars.begin(), basicSourceChars.end()); out << "\""; - for(string::const_iterator c = value.begin(); c != value.end(); ++c) + vector<unsigned char> u8buffer; // Buffer to convert multibyte characters + + for(size_t i = 0; i < value.size();) { - if(charSet.find(*c) == charSet.end()) + if(charSet.find(value[i]) == charSet.end()) { - switch(*c) + char c = value[i]; + if(static_cast<unsigned char>(c) < 128) // Single byte character { // - // Java doesn't want '\n' or '\r\n' encoded as universal - // characters, that gives an error "unclosed string literal" + // Print as unicode if not in basic source character set // - case '\r': + switch(c) { - out << "\\r"; - break; - } - case '\n': - { - out << "\\n"; - break; - } - default: - { - unsigned char uc = *c; - ostringstream s; - s << "\\u"; - s.flags(ios_base::hex); - s.width(4); - s.fill('0'); - s << static_cast<unsigned>(uc); - out << s.str(); - break; + // + // Java doesn't want '\n' or '\r\n' encoded as universal + // characters, that gives an error "unclosed string literal" + // + case '\r': + { + out << "\\r"; + break; + } + case '\n': + { + out << "\\n"; + break; + } + default: + { + out << u16CodePoint(c); + break; + } } } + else + { + u8buffer.push_back(value[i]); + } } else { - switch(*c) + // + // Write any pedding characters in the utf8 buffer + // + if(!u8buffer.empty()) + { + writeU8Buffer(u8buffer, out); + u8buffer.clear(); + } + switch(value[i]) { case '\\': + { + string s = "\\"; + size_t j = i + 1; + for(; j < value.size(); ++j) + { + if(value[j] != '\\') + { + break; + } + s += "\\"; + } + + // + // An even number of slash \ will escape the backslash and + // the codepoint will be interpreted as its charaters + // + // \\U00000041 - ['\\', 'U', '0', '0', '0', '0', '0', '0', '4', '1'] + // \\\U00000041 - ['\\', 'A'] (41 is the codepoint for 'A') + // + if(s.size() % 2 != 0 && (value[j] == 'U' || value[j] == 'u')) + { + size_t sz = value[j] == 'U' ? 8 : 4; + out << s.substr(0, s.size() - 1); + i = j + 1; + + string codepoint = value.substr(j + 1, sz); + assert(codepoint.size() == sz); + + IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16); + + + // + // Java doesn't like this special characters encoded as universal characters + // + if(v == 0x5c) + { + out << "\\\\"; + } + else if(v == 0xa) + { + out << "\\n"; + } + else if(v == 0xd) + { + out << "\\r"; + } + else if(v == 0x22) + { + out << "\\\""; + } + // + // Unicode character in the range U+10000 to U+10FFFF is not permitted in a character literal + // and is represented using a Unicode surrogate pair. + // + else if(v > 0xFFFF) + { + unsigned int high = ((static_cast<unsigned int>(v) - 0x10000) / 0x400) + 0xD800; + unsigned int low = ((static_cast<unsigned int>(v) - 0x10000) % 0x400) + 0xDC00; + out << u16CodePoint(high); + out << u16CodePoint(low); + } + else + { + out << u16CodePoint(static_cast<unsigned int>(v)); + } + + i = j + 1 + sz; + } + else + { + out << s; + i = j; + } + continue; + } case '"': { out << "\\"; break; } } - out << *c; + out << value[i]; // Print normally if in basic source character set } + i++; + } + + // + // Write any pedding characters in the utf8 buffer + // + if(!u8buffer.empty()) + { + writeU8Buffer(u8buffer, out); + u8buffer.clear(); } out << "\""; diff --git a/cpp/src/slice2java/Makefile b/cpp/src/slice2java/Makefile index 010554d8e12..e2a882bfb6f 100644 --- a/cpp/src/slice2java/Makefile +++ b/cpp/src/slice2java/Makefile @@ -20,7 +20,7 @@ RPATH_DIR = $(LOADER_PATH)/../$(libsubdir) include $(top_srcdir)/config/Make.rules -CPPFLAGS := -I. $(CPPFLAGS) +CPPFLAGS := -I. -I.. $(CPPFLAGS) $(NAME): $(OBJS) rm -f $@ diff --git a/cpp/src/slice2js/Gen.cpp b/cpp/src/slice2js/Gen.cpp index 11bd0f608e5..573d0180c67 100644 --- a/cpp/src/slice2js/Gen.cpp +++ b/cpp/src/slice2js/Gen.cpp @@ -20,6 +20,7 @@ #include <direct.h> #endif #include <IceUtil/Iterator.h> +#include <IceUtil/Unicode.h> #include <IceUtil/UUID.h> #include <Slice/Checksum.h> #include <Slice/FileTracker.h> @@ -35,6 +36,44 @@ namespace { string +u16CodePoint(unsigned short value) +{ + ostringstream s; + s << "\\u"; + s << hex; + s.width(4); + s.fill('0'); + s << value; + return s.str(); +} + +void +writeU8Buffer(const vector<unsigned char>& u8buffer, ::IceUtilInternal::Output& out) +{ + vector<unsigned short> u16buffer; + IceUtilInternal::ConversionResult result = convertUTF8ToUTF16(u8buffer, u16buffer, IceUtil::lenientConversion); + switch(result) + { + case conversionOK: + break; + case sourceExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted"); + case sourceIllegal: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal"); + default: + { + assert(0); + throw IceUtil::IllegalConversionException(__FILE__, __LINE__); + } + } + + for(vector<unsigned short>::const_iterator c = u16buffer.begin(); c != u16buffer.end(); ++c) + { + out << u16CodePoint(*c); + } +} + +string sliceModeToIceMode(Operation::Mode opMode) { switch(opMode) @@ -479,32 +518,110 @@ Slice::JsVisitor::writeConstantValue(const string& scope, const TypePtr& type, c _out << "\""; // Opening " - for(string::const_iterator c = value.begin(); c != value.end(); ++c) + vector<unsigned char> u8buffer; // Buffer to convert multibyte characters + + for(size_t i = 0; i < value.size();) { - if(charSet.find(*c) == charSet.end()) + if(charSet.find(value[i]) == charSet.end()) { - unsigned char uc = *c; // char may be signed, so make it positive - ostringstream s; - s << "\\u"; // Print as unicode if not in basic source character set - s << hex; - s.width(4); - s.fill('0'); - s << static_cast<unsigned>(uc); - _out << s.str(); + if(static_cast<unsigned char>(value[i]) < 128) // Single byte character + { + // + // Print as unicode if not in basic source character set + // + _out << u16CodePoint(static_cast<unsigned int>(value[i])); + } + else + { + u8buffer.push_back(value[i]); + } } else { - switch(*c) + // + // Write any pedding characters in the utf8 buffer + // + if(!u8buffer.empty()) + { + writeU8Buffer(u8buffer, _out); + u8buffer.clear(); + } + switch(value[i]) { case '\\': + { + string s = "\\"; + size_t j = i + 1; + for(; j < value.size(); ++j) + { + if(value[j] != '\\') + { + break; + } + s += "\\"; + } + + // + // An even number of slash \ will escape the backslash and + // the codepoint will be interpreted as its charaters + // + // \\U00000041 - ['\\', 'U', '0', '0', '0', '0', '0', '0', '4', '1'] + // \\\U00000041 - ['\\', 'A'] (41 is the codepoint for 'A') + // + if(s.size() % 2 != 0 && value[j] == 'U') + { + _out << s.substr(0, s.size() - 1); + i = j + 1; + + string codepoint = value.substr(j + 1, 8); + assert(codepoint.size() == 8); + + IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16); + + + // + // Unicode character in the range U+10000 to U+10FFFF is not permitted in a character literal + // and is represented using a Unicode surrogate pair. + // + if(v > 0xFFFF) + { + unsigned int high = ((static_cast<unsigned int>(v) - 0x10000) / 0x400) + 0xD800; + unsigned int low = ((static_cast<unsigned int>(v) - 0x10000) % 0x400) + 0xDC00; + _out << u16CodePoint(high); + _out << u16CodePoint(low); + } + else + { + _out << u16CodePoint(static_cast<unsigned int>(v)); + } + + i = j + 1 + 8; + } + else + { + _out << s; + i = j; + } + continue; + } case '"': { _out << "\\"; break; } } - _out << *c; // Print normally if in basic source character set + _out << value[i]; // Print normally if in basic source character set } + i++; + } + + // + // Write any pedding characters in the utf8 buffer + // + if(!u8buffer.empty()) + { + writeU8Buffer(u8buffer, _out); + u8buffer.clear(); } _out << "\""; // Closing " diff --git a/cpp/src/slice2js/Makefile b/cpp/src/slice2js/Makefile index bd1bbe967f8..0aaf14a7a0d 100644 --- a/cpp/src/slice2js/Makefile +++ b/cpp/src/slice2js/Makefile @@ -21,7 +21,7 @@ RPATH_DIR = $(LOADER_PATH)/../$(libsubdir) include $(top_srcdir)/config/Make.rules -CPPFLAGS := -I. $(CPPFLAGS) +CPPFLAGS := -I. -I.. $(CPPFLAGS) $(NAME): $(OBJS) rm -f $@ diff --git a/cpp/src/slice2php/Main.cpp b/cpp/src/slice2php/Main.cpp index 2838863454a..35117e68f36 100644 --- a/cpp/src/slice2php/Main.cpp +++ b/cpp/src/slice2php/Main.cpp @@ -16,6 +16,7 @@ #include <IceUtil/StringUtil.h> #include <IceUtil/Mutex.h> #include <IceUtil/MutexPtrLock.h> +#include <IceUtil/Unicode.h> #include <Slice/Checksum.h> #include <Slice/Preprocessor.h> #include <Slice/FileTracker.h> @@ -1273,9 +1274,10 @@ CodeVisitor::writeConstantValue(const TypePtr& type, const SyntaxTreeBasePtr& va _out << "\""; // Opening " - for(string::const_iterator c = value.begin(); c != value.end(); ++c) + for(size_t i = 0; i < value.size();) { - switch(*c) + char c = value[i]; + switch(c) { case '$': { @@ -1289,8 +1291,79 @@ CodeVisitor::writeConstantValue(const TypePtr& type, const SyntaxTreeBasePtr& va } case '\\': { - _out << "\\\\"; - break; + + string s = "\\"; + size_t j = i + 1; + for(; j < value.size(); ++j) + { + if(value[j] != '\\') + { + break; + } + s += "\\"; + } + + // + // An even number of slash \ will escape the backslash and + // the codepoint will be interpreted as its charaters + // + // \\u00000041 - ['\\', 'u', '0', '0', '0', '0', '0', '0', '4', '1'] + // \\\u00000041 - ['\\', 'A'] (41 is the codepoint for 'A') + // + if(s.size() % 2 != 0 && (value[j] == 'U' || value[j] == 'u')) + { + // + // Convert codepoint to UTF8 bytes and write the escaped bytes + // + _out << s.substr(0, s.size() - 1); + + size_t sz = value[j] == 'U' ? 8 : 4; + string codepoint = value.substr(j + 1, sz); + assert(codepoint.size() == sz); + + IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16); + + + vector<unsigned int> u32buffer; + u32buffer.push_back(static_cast<unsigned int>(v)); + + vector<unsigned char> u8buffer; + + IceUtilInternal::ConversionResult result = convertUTF32ToUTF8(u32buffer, u8buffer, IceUtil::lenientConversion); + switch(result) + { + case conversionOK: + break; + case sourceExhausted: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted"); + case sourceIllegal: + throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal"); + default: + { + assert(0); + throw IceUtil::IllegalConversionException(__FILE__, __LINE__); + } + } + + ostringstream s; + for(vector<unsigned char>::const_iterator q = u8buffer.begin(); q != u8buffer.end(); ++q) + { + s << "\\"; + s.fill('0'); + s.width(3); + s << oct; + s << static_cast<unsigned int>(*q); + } + _out << s.str(); + + i = j + 1 + sz; + } + else + { + _out << s; + i = j; + } + continue; } case '\r': { @@ -1307,11 +1380,6 @@ CodeVisitor::writeConstantValue(const TypePtr& type, const SyntaxTreeBasePtr& va _out << "\\t"; break; } - case '\b': - { - _out << "\\b"; - break; - } case '\f': { _out << "\\f"; @@ -1319,9 +1387,9 @@ CodeVisitor::writeConstantValue(const TypePtr& type, const SyntaxTreeBasePtr& va } default: { - if(charSet.find(*c) == charSet.end()) + if(charSet.find(c) == charSet.end()) { - unsigned char uc = *c; // Char may be signed, so make it positive. + unsigned char uc = c; // Char may be signed, so make it positive. stringstream s; s << "\\"; // Print as octal if not in basic source character set. s.flags(ios_base::oct); @@ -1332,11 +1400,12 @@ CodeVisitor::writeConstantValue(const TypePtr& type, const SyntaxTreeBasePtr& va } else { - _out << *c; // Print normally if in basic source character set. + _out << c; // Print normally if in basic source character set. } break; } } + ++i; } _out << "\""; // Closing " diff --git a/cpp/src/slice2php/Makefile b/cpp/src/slice2php/Makefile index 8bc91a8cd66..62f82531201 100644 --- a/cpp/src/slice2php/Makefile +++ b/cpp/src/slice2php/Makefile @@ -19,7 +19,7 @@ RPATH_DIR = $(LOADER_PATH)/../$(libsubdir) include $(top_srcdir)/config/Make.rules -CPPFLAGS := -I. $(CPPFLAGS) +CPPFLAGS := -I. -I.. $(CPPFLAGS) $(NAME): $(OBJS) rm -f $@ |