6 files changed, 296 insertions, 88 deletions
diff --git a/cpp/src/IceUtil/Unicode.cpp b/cpp/src/IceUtil/Unicode.cpp
index 7bad1d67c17..ca36a912b47 100644
--- a/cpp/src/IceUtil/Unicode.cpp
+++ b/cpp/src/IceUtil/Unicode.cpp
@@ -147,6 +147,24 @@ IceUtilInternal::convertUTF8ToUTF16(const vector<unsigned char>& source, vector<
 }
 
 ConversionResult
+IceUtilInternal::convertUTF8ToUTF32(const vector<unsigned char>& source, vector<unsigned int>& target, ConversionFlags flags)
+{
+    target.resize(source.size()); // One slot per input byte; a UTF-8 sequence never yields more code points than bytes
+    if(source.empty())
+    {
+        return conversionOK; // Nothing to convert; &source[0] / &target[0] are undefined on an empty vector
+    }
+    const unsigned char* sourceStart = &source[0];
+    unsigned int* targetStart = &target[0];
+    ConversionResult result = ConvertUTF8toUTF32(&sourceStart, &source[0] + source.size(), &targetStart, &target[0] + target.size(), flags);
+    if(result == conversionOK)
+    {
+        target.resize(targetStart - &target[0]); // Shrink to the distance targetStart moved from the buffer start
+    }
+    return result;
+}
+
+ConversionResult
 IceUtilInternal::convertUTF32ToUTF8(const vector<unsigned int>& source, vector<unsigned char>& target, ConversionFlags flags)
 {
     target.resize(source.size() * 4);
diff --git a/cpp/src/IceUtil/Unicode.h b/cpp/src/IceUtil/Unicode.h
index 2c96d6c6448..d5c3b235ddb 100644
--- a/cpp/src/IceUtil/Unicode.h
+++ b/cpp/src/IceUtil/Unicode.h
@@ -50,6 +50,10 @@ convertUTF8ToUTF16(const std::vector<unsigned char>&, std::vector<unsigned short
                    IceUtil::ConversionFlags);
 
 ICE_UTIL_API ConversionResult
+convertUTF8ToUTF32(const std::vector<unsigned char>&, std::vector<unsigned int>&,
+                   IceUtil::ConversionFlags);
+
+ICE_UTIL_API ConversionResult
 convertUTF32ToUTF8(const std::vector<unsigned int>&, std::vector<unsigned char>&,
                    IceUtil::ConversionFlags);
 
diff --git a/cpp/src/Slice/Ruby.cpp b/cpp/src/Slice/Ruby.cpp
index 62daa345cbf..209711a679d 100644
--- a/cpp/src/Slice/Ruby.cpp
+++ b/cpp/src/Slice/Ruby.cpp
@@ -301,7 +301,10 @@ Slice::Ruby::compile(int argc, char* argv[])
                             throw FileException(__FILE__, __LINE__, os.str());
                         }
                         FileTracker::instance()->addFile(file);
-
+                        //
+                        // Ruby magic comment to set the file encoding; it must be on the first or second line
+                        //
+                        out << "# encoding: utf-8\n";
                         printHeader(out);
                         printGeneratedHeader(out, base + ".ice", "#");
 
diff --git a/cpp/src/Slice/RubyUtil.cpp b/cpp/src/Slice/RubyUtil.cpp
index 38bbeb5da19..3639a53b185 100644
--- a/cpp/src/Slice/RubyUtil.cpp
+++ b/cpp/src/Slice/RubyUtil.cpp
@@ -1601,7 +1601,7 @@ Slice::Ruby::CodeVisitor::writeConstantValue(const TypePtr& type, const SyntaxTr
                     ++i;
                 }
 
-                _out << "\".force_encoding(\"utf-8\")";                                   // Closing "
+                _out << "\"";                                      // Closing "
                 break;
             }
 
diff --git a/cpp/src/slice2cpp/Gen.cpp b/cpp/src/slice2cpp/Gen.cpp
index d93c457bf9e..5df28c48df2 100644
--- a/cpp/src/slice2cpp/Gen.cpp
+++ b/cpp/src/slice2cpp/Gen.cpp
@@ -32,6 +32,45 @@ namespace
 {
 
 string
+u32CodePoint(unsigned int value)
+{
+    // Formats value as \U followed by eight zero-padded hexadecimal digits
+    ostringstream os;
+    os.fill('0');
+    os << "\\U" << hex;
+    os.width(8);
+    os << value;
+    return os.str();
+}
+
+
+void
+writeU8Buffer(const vector<unsigned char>& u8buffer, ::IceUtilInternal::Output& out)
+{
+    // Converts u8buffer from UTF-8 to UTF-32 and writes each resulting code point to out via u32CodePoint.
+    // Throws IceUtil::IllegalConversionException when the conversion does not report conversionOK.
+    vector<unsigned int> codePoints;
+    const IceUtilInternal::ConversionResult status = convertUTF8ToUTF32(u8buffer, codePoints, IceUtil::lenientConversion);
+    if(status == sourceExhausted)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted");
+    }
+    if(status == sourceIllegal)
+    {
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal");
+    }
+    if(status != conversionOK)
+    {
+        assert(0); // Any other result is unexpected here
+        throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
+    }
+    for(size_t k = 0; k < codePoints.size(); ++k)
+    {
+        out << u32CodePoint(codePoints[k]);
+    }
+}
+
+string
 getDeprecateSymbol(const ContainedPtr& p1, const ContainedPtr& p2)
 {
     string deprecateMetadata, deprecateSymbol;
@@ -74,115 +113,173 @@ writeConstantValue(IceUtilInternal::Output& out, const TypePtr& type, const Synt
 
             if((useWstring & TypeContextUseWstring) || findMetaData(metaData) == "wstring")
             {
-                out << 'L';
-            }
-            out << "\"";                                    // Opening "
+                //
+                // Wide strings
+                //
+                vector<unsigned char> u8buffer;                  // Buffer to convert multibyte characters
 
-            for(size_t i = 0; i < value.size();)
-            {
-                if(charSet.find(value[i]) == charSet.end())
+                out << "L\"";
+                for(size_t i = 0; i < value.size();)
+                {
+                    if(charSet.find(value[i]) == charSet.end())
+                    {
+                        if(static_cast<unsigned char>(value[i]) < 128) // Single byte character
+                        {
+                            //
+                            // Print as unicode if not in basic source character set
+                            //
+                            out << u32CodePoint(static_cast<unsigned int>(value[i]));
+                        }
+                        else
+                        {
+                            u8buffer.push_back(value[i]);
+                        }
+                    }
+                    else
+                    {
+                        //
+                        // Write any pending characters in the utf8 buffer
+                        //
+                        if(!u8buffer.empty())
+                        {
+                            writeU8Buffer(u8buffer, out);
+                            u8buffer.clear();
+                        }
+                        
+                        switch(value[i])
+                        {
+                            case '"':
+                            {
+                                out << "\\";
+                                break;
+                            }
+                        }
+                        
+                        out << value[i];                              // Print normally if in basic source character set
+                    }
+                    i++;
+                    
+                }
+                
+                //
+                // Write any pending characters in the utf8 buffer
+                //
+                if(!u8buffer.empty())
                 {
-                    unsigned char uc = value[i];                  // char may be signed, so make it positive
-                    ostringstream s;
-                    s << "\\";                              // Print as octal if not in basic source character set
-                    s.width(3);
-                    s.fill('0');
-                    s << oct;
-                    s << static_cast<unsigned>(uc);
-                    out << s.str();
+                    writeU8Buffer(u8buffer, out);
+                    u8buffer.clear();
                 }
-                else
+                out << "\"";
+            }
+            else // narrow strings
+            {
+                out << "\"";                                    // Opening "
+
+                for(size_t i = 0; i < value.size();)
                 {
-                    switch(value[i])
+                    if(charSet.find(value[i]) == charSet.end())
+                    {
+                        unsigned char uc = value[i];                  // char may be signed, so make it positive
+                        ostringstream s;
+                        s << "\\";                                    // Print as octal if not in basic source character set
+                        s.width(3);
+                        s.fill('0');
+                        s << oct;
+                        s << static_cast<unsigned>(uc);
+                        out << s.str();
+                    }
+                    else
                     {
-                        case '\\':
+                        switch(value[i])
                         {
-                            string s = "\\";
-                            size_t j = i + 1;
-                            for(; j < value.size(); ++j)
+                            case '\\':
                             {
-                                if(value[j] != '\\')
+                                string s = "\\";
+                                size_t j = i + 1;
+                                for(; j < value.size(); ++j)
                                 {
-                                    break;
+                                    if(value[j] != '\\')
+                                    {
+                                        break;
+                                    }
+                                    s += "\\";
                                 }
-                                s += "\\";
-                            }
 
-                            //
-                            // An even number of slash \ will escape the backslash and
-                            // the codepoint will be interpreted as its charaters
-                            //
-                            // \\U00000041  - ['\\', 'U', '0', '0', '0', '0', '0', '0', '4', '1']
-                            // \\\U00000041 - ['\\', 'A'] (41 is the codepoint for 'A')
-                            //
-                            if(s.size() % 2 != 0 && (value[j] == 'U' || value[j] == 'u'))
-                            {
                                 //
-                                // Convert codepoint to UTF8 bytes and write the escaped bytes
+                                // An even number of slash \ will escape the backslash and
+                                // the codepoint will be interpreted as its characters
+                                //
+                                // \\U00000041  - ['\\', 'U', '0', '0', '0', '0', '0', '0', '4', '1']
+                                // \\\U00000041 - ['\\', 'A'] (41 is the codepoint for 'A')
                                 //
-                                out << s.substr(0, s.size() - 1);
+                                if(s.size() % 2 != 0 && (value[j] == 'U' || value[j] == 'u'))
+                                {
+                                    //
+                                    // Convert codepoint to UTF8 bytes and write the escaped bytes
+                                    //
+                                    out << s.substr(0, s.size() - 1);
 
-                                size_t sz = value[j] == 'U' ? 8 : 4;
-                                string codepoint = value.substr(j + 1, sz);
-                                assert(codepoint.size() ==  sz);
+                                    size_t sz = value[j] == 'U' ? 8 : 4;
+                                    string codepoint = value.substr(j + 1, sz);
+                                    assert(codepoint.size() ==  sz);
 
-                                IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16);
+                                    IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16);
 
 
-                                vector<unsigned int> u32buffer;
-                                u32buffer.push_back(static_cast<unsigned int>(v));
+                                    vector<unsigned int> u32buffer;
+                                    u32buffer.push_back(static_cast<unsigned int>(v));
 
-                                vector<unsigned char> u8buffer;
+                                    vector<unsigned char> u8buffer;
 
-                                IceUtilInternal::ConversionResult result = convertUTF32ToUTF8(u32buffer, u8buffer, IceUtil::lenientConversion);
-                                switch(result)
-                                {
-                                    case conversionOK:
-                                        break;
-                                    case sourceExhausted:
-                                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted");
-                                    case sourceIllegal:
-                                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal");
-                                    default:
+                                    IceUtilInternal::ConversionResult result = convertUTF32ToUTF8(u32buffer, u8buffer, IceUtil::lenientConversion);
+                                    switch(result)
                                     {
-                                        assert(0);
-                                        throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
+                                        case conversionOK:
+                                            break;
+                                        case sourceExhausted:
+                                            throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted");
+                                        case sourceIllegal:
+                                            throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal");
+                                        default:
+                                        {
+                                            assert(0);
+                                            throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
+                                        }
                                     }
-                                }
 
-                                ostringstream s;
-                                for(vector<unsigned char>::const_iterator q = u8buffer.begin(); q != u8buffer.end(); ++q)
+                                    ostringstream s;
+                                    for(vector<unsigned char>::const_iterator q = u8buffer.begin(); q != u8buffer.end(); ++q)
+                                    {
+                                        s << "\\";
+                                        s.fill('0');
+                                        s.width(3);
+                                        s << oct;
+                                        s << static_cast<unsigned int>(*q);
+                                    }
+                                    out << s.str();
+
+                                    i = j + 1 + sz;
+                                }
+                                else
                                 {
-                                    s << "\\";
-                                    s.fill('0');
-                                    s.width(3);
-                                    s << oct;
-                                    s << static_cast<unsigned int>(*q);
+                                    out << s;
+                                    i = j;
                                 }
-                                out << s.str();
-
-                                i = j + 1 + sz;
+                                continue;
                             }
-                            else
+                            case '"':
                             {
-                                out << s;
-                                i = j;
+                                out << "\\";
+                                break;
                             }
-                            continue;
-                        }
-                        case '"':
-                        {
-                            out << "\\";
-                            break;
                         }
+                        
+                        out << value[i];                              // Print normally if in basic source character set
                     }
-                    
-                    out << value[i];                              // Print normally if in basic source character set
+                    ++i;
                 }
-                ++i;
+                out << "\"";                                    // Closing "
             }
-
-            out << "\"";                                    // Closing "
         }
         else if(bp && bp->kind() == Builtin::KindLong)
         {
diff --git a/cpp/src/slice2objc/Gen.cpp b/cpp/src/slice2objc/Gen.cpp
index 283efc935e3..1363779f8e9 100644
--- a/cpp/src/slice2objc/Gen.cpp
+++ b/cpp/src/slice2objc/Gen.cpp
@@ -17,6 +17,8 @@
 #include <direct.h>
 #endif
 #include <IceUtil/Iterator.h>
+#include <IceUtil/Unicode.h>
+#include <IceUtil/InputUtil.h>
 #include <IceUtil/UUID.h>
 #include <Slice/Checksum.h>
 #include <Slice/FileTracker.h>
@@ -1492,13 +1494,13 @@ Slice::Gen::TypesVisitor::writeConstantValue(IceUtilInternal::Output& out, const
 
         out << "@\"";                                      // Opening @"
 
-        for(string::const_iterator c = val.begin(); c != val.end(); ++c)
+        for(size_t i = 0; i < val.size();)
         {
-            if(charSet.find(*c) == charSet.end())
+            if(charSet.find(val[i]) == charSet.end())
             {
-                unsigned char uc = *c;                  // char may be signed, so make it positive
+                unsigned char uc = val[i];                  // char may be signed, so make it positive
                 ostringstream s;
-                s << "\\";                              // Print as octal if not in basic source character set
+                s << "\\";                                    // Print as octal if not in basic source character set
                 s.width(3);
                 s.fill('0');
                 s << oct;
@@ -1507,11 +1509,95 @@ Slice::Gen::TypesVisitor::writeConstantValue(IceUtilInternal::Output& out, const
             }
             else
             {
-                out << *c;                                // Print normally if in basic source character set
+                switch(val[i])
+                {
+                    case '\\':
+                    {
+                        string s = "\\";
+                        size_t j = i + 1;
+                        for(; j < val.size(); ++j)
+                        {
+                            if(val[j] != '\\')
+                            {
+                                break;
+                            }
+                            s += "\\";
+                        }
+
+                        //
+                        // An even number of slash \ will escape the backslash and
+                        // the codepoint will be interpreted as its characters
+                        //
+                        // \\U00000041  - ['\\', 'U', '0', '0', '0', '0', '0', '0', '4', '1']
+                        // \\\U00000041 - ['\\', 'A'] (41 is the codepoint for 'A')
+                        //
+                        if(s.size() % 2 != 0 && (val[j] == 'U' || val[j] == 'u'))
+                        {
+                            //
+                            // Convert codepoint to UTF8 bytes and write the escaped bytes
+                            //
+                            out << s.substr(0, s.size() - 1);
+
+                            size_t sz = val[j] == 'U' ? 8 : 4;
+                            string codepoint = val.substr(j + 1, sz);
+                            assert(codepoint.size() ==  sz);
+
+                            IceUtil::Int64 v = IceUtilInternal::strToInt64(codepoint.c_str(), 0, 16);
+
+
+                            vector<unsigned int> u32buffer;
+                            u32buffer.push_back(static_cast<unsigned int>(v));
+
+                            vector<unsigned char> u8buffer;
+
+                            IceUtilInternal::ConversionResult result = convertUTF32ToUTF8(u32buffer, u8buffer, IceUtil::lenientConversion);
+                            switch(result)
+                            {
+                                case conversionOK:
+                                    break;
+                                case sourceExhausted:
+                                    throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source exhausted");
+                                case sourceIllegal:
+                                    throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "string source illegal");
+                                default:
+                                {
+                                    assert(0);
+                                    throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
+                                }
+                            }
+
+                            ostringstream s;
+                            for(vector<unsigned char>::const_iterator q = u8buffer.begin(); q != u8buffer.end(); ++q)
+                            {
+                                s << "\\";
+                                s.fill('0');
+                                s.width(3);
+                                s << oct;
+                                s << static_cast<unsigned int>(*q);
+                            }
+                            out << s.str();
+
+                            i = j + 1 + sz;
+                        }
+                        else
+                        {
+                            out << s;
+                            i = j;
+                        }
+                        continue;
+                    }
+                    case '"':
+                    {
+                        out << "\\";
+                        break;
+                    }
+                }
+                
+                out << val[i];                              // Print normally if in basic source character set
             }
+            ++i;
         }
-
-        out << "\"";                                      // Closing "
+        out << "\"";                                    // Closing "
     }
     else
     {