summary refs log tree commit diff
path: root/cpp/src
diff options
context:
space:
mode:
author Bernard Normier <bernard@zeroc.com> 2016-06-04 16:18:18 -0400
committer Bernard Normier <bernard@zeroc.com> 2016-06-04 16:18:18 -0400
commit a59bb01921429e8d6963d63c22b91c995d1c4631 (patch)
tree e37190c02823e28edbd4a133dbf5b1e11f53cd0c /cpp/src
parent More UTF tests (diff)
download ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.bz2
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.tar.xz
ice-a59bb01921429e8d6963d63c22b91c995d1c4631.zip
UnicodeWstringConverter performance improvement and cleanup
Diffstat (limited to 'cpp/src')
-rw-r--r-- cpp/src/IceUtil/StringConverter.cpp 123
-rw-r--r-- cpp/src/IceUtil/Unicode.cpp 187
2 files changed, 129 insertions, 181 deletions
diff --git a/cpp/src/IceUtil/StringConverter.cpp b/cpp/src/IceUtil/StringConverter.cpp
index 8b60c48d53f..cb15037ecfa 100644
--- a/cpp/src/IceUtil/StringConverter.cpp
+++ b/cpp/src/IceUtil/StringConverter.cpp
@@ -43,11 +43,7 @@ struct SelectCodeCvt;
template<>
struct SelectCodeCvt<2> // specialization for template argument 2 (presumably sizeof(wchar_t) -- confirm at the use site)
{
-#ifdef ICE_LITTLE_ENDIAN
- typedef std::codecvt_utf8_utf16<wchar_t, 0x10ffff, little_endian> Type;
-#else
typedef std::codecvt_utf8_utf16<wchar_t> Type; // UTF-8 <-> UTF-16 facet with the default max code point and mode
-#endif
};
template<>
@@ -155,28 +151,35 @@ public:
virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const // decodes the UTF-8 range [sourceStart, sourceEnd) into target
{
- if(sourceStart == sourceEnd)
+ const size_t sourceSize = sourceEnd - sourceStart; // number of UTF-8 bytes to decode
+
+ if(sourceSize == 0)
{
target = L"";
}
else
{
- //
- // TODO: consider reimplementing without the wstring_convert helper
- // to improve performance
- // Note that wstring_convert is "stateful" and cannot be a shared data member
- //
- wstring_convert<CodeCvt> convert;
-
- try
- {
- target = convert.from_bytes(reinterpret_cast<const char*>(sourceStart),
- reinterpret_cast<const char*>(sourceEnd));
- }
- catch(const std::range_error& ex)
+ target.resize(sourceSize); // over-allocate one wchar_t per input byte; trimmed to the converted length below
+ wchar_t* targetStart = const_cast<wchar_t*>(target.data());
+ wchar_t* targetEnd = targetStart + sourceSize;
+ wchar_t* targetNext = targetStart;
+
+ const char* sourceNext = reinterpret_cast<const char*>(sourceStart);
+
+ mbstate_t state = mbstate_t(); // fresh conversion state for each call
+
+ codecvt_base::result result = _codecvt.in(state,
+ reinterpret_cast<const char*>(sourceStart),
+ reinterpret_cast<const char*>(sourceEnd),
+ sourceNext,
+ targetStart, targetEnd, targetNext);
+
+ if(result != codecvt_base::ok) // any result other than ok is reported as an illegal conversion
{
- throw IllegalConversionException(__FILE__, __LINE__, ex.what());
+ throw IllegalConversionException(__FILE__, __LINE__, "codecvt.in failure");
}
+
+ target.resize(targetNext - targetStart); // shrink to the wchar_t actually written by _codecvt.in
}
}
@@ -215,14 +218,12 @@ public:
targetStart = buffer.getMoreBytes(chunkSize, targetStart);
targetEnd = targetStart + chunkSize;
-
}
while(convertUTFWstringToUTF8(sourceStart, sourceEnd, targetStart, targetEnd) == false);
return targetStart;
}
-
virtual void fromUTF8(const Byte* sourceStart, const Byte* sourceEnd, wstring& target) const
{
if(sourceStart == sourceEnd)
@@ -290,67 +291,38 @@ getUnicodeWstringConverter()
return unicodeWstringConverter;
}
-
class UTF8BufferI : public UTF8Buffer
{
public:
- UTF8BufferI() :
- _buffer(0),
- _offset(0)
- {
- }
-
- ~UTF8BufferI()
- {
- free(_buffer);
- }
-
+ //
+ // Ensures howMany bytes are available after firstUnused (0 = nothing written yet); returns the first unused byte
+ //
Byte* getMoreBytes(size_t howMany, Byte* firstUnused)
{
- if(_buffer == 0)
- {
- _buffer = static_cast<Byte*>(malloc(howMany));
- if(!_buffer)
- {
- throw std::bad_alloc();
- }
- }
- else
+ size_t bytesUsed = 0;
+ if(firstUnused != 0)
{
- assert(firstUnused != 0);
- _offset = firstUnused - _buffer;
- Byte* newBuffer = static_cast<Byte*>(realloc(_buffer, _offset + howMany));
- if(!newBuffer)
- {
- reset();
- throw std::bad_alloc();
- }
- else
- {
- _buffer = newBuffer;
- }
+ bytesUsed = firstUnused - reinterpret_cast<const Byte*>(_buffer.data()); // offset of firstUnused from the buffer start
}
- return _buffer + _offset;
- }
-
- Byte* getBuffer()
- {
- return _buffer;
+ if(_buffer.size() < howMany + bytesUsed) // grow only when the current storage is too small
+ {
+ _buffer.resize(bytesUsed + howMany);
+ }
+
+ return const_cast<Byte*>(reinterpret_cast<const Byte*>(_buffer.data())) + bytesUsed;
}
- void reset()
+ void swap(string& other, const Byte* tail) // trims _buffer to end at tail, then exchanges its contents with other
{
- free(_buffer);
- _buffer = 0;
- _offset = 0;
+ assert(tail >= reinterpret_cast<const Byte*>(_buffer.data()));
+ _buffer.resize(tail - reinterpret_cast<const Byte*>(_buffer.data()));
+ other.swap(_buffer);
}
private:
-
- Byte* _buffer;
- size_t _offset;
+ string _buffer; // backing storage for the bytes handed out by getMoreBytes
};
#ifdef _WIN32
@@ -516,8 +488,8 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
//
UTF8BufferI buffer;
Byte* last = wConverterWithDefault->toUTF8(v.data(), v.data() + v.size(), buffer);
- target = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
-
+ buffer.swap(target, last);
+
//
// If narrow string converter is present convert to the native narrow string encoding, otherwise
// native narrow string encoding is UTF8 and we are done.
@@ -534,8 +506,7 @@ IceUtil::wstringToString(const wstring& v, const StringConverterPtr& converter,
}
wstring
-IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
- const WstringConverterPtr& wConverter)
+IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter, const WstringConverterPtr& wConverter)
{
wstring target;
if(!v.empty())
@@ -549,7 +520,7 @@ IceUtil::stringToWstring(const string& v, const StringConverterPtr& converter,
{
UTF8BufferI buffer;
Byte* last = converter->toUTF8(v.data(), v.data() + v.size(), buffer);
- tmp = string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+ buffer.swap(tmp, last);
}
else
{
@@ -577,7 +548,9 @@ IceUtil::nativeToUTF8(const string& str, const IceUtil::StringConverterPtr& conv
}
UTF8BufferI buffer;
Byte* last = converter->toUTF8(str.data(), str.data() + str.size(), buffer);
- return string(reinterpret_cast<const char*>(buffer.getBuffer()), last - buffer.getBuffer());
+ string result;
+ buffer.swap(result, last);
+ return result;
}
string
@@ -620,11 +593,7 @@ IceUtilInternal::toUTF16(const vector<Byte>& source)
#ifdef ICE_HAS_CODECVT_UTF8
assert(sizeof(Char16T) == sizeof(unsigned short));
-#ifdef ICE_LITTLE_ENDIAN
- typedef wstring_convert<codecvt_utf8_utf16<Char16T, 0x10ffff, little_endian>, Char16T> Convert;
-#else
typedef wstring_convert<codecvt_utf8_utf16<Char16T>, Char16T> Convert;
-#endif
Convert convert;
diff --git a/cpp/src/IceUtil/Unicode.cpp b/cpp/src/IceUtil/Unicode.cpp
index 22ced7e61b2..4db36d29e9d 100644
--- a/cpp/src/IceUtil/Unicode.cpp
+++ b/cpp/src/IceUtil/Unicode.cpp
@@ -26,90 +26,80 @@ using namespace IceUtilInternal;
namespace
{
- //
- // Helper class, base never defined
- // Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8.
- //
- template<size_t wcharSize>
- struct WstringHelper
- {
- static ConversionResult toUTF8(
- const wchar_t*& sourceStart, const wchar_t* sourceEnd,
- Byte*& targetStart, Byte* targetEnd);
+//
+// Conversion helper keyed on the size of wchar_t; the primary template is only declared, never defined.
+// Usage: WstringHelper<sizeof(wchar_t)>::toUTF8 and fromUTF8 (specializations exist for 2 and 4).
+//
+template<size_t wcharSize> struct WstringHelper;
- static ConversionResult fromUTF8(
- const Byte*& sourceStart, const Byte* sourceEnd,
- wchar_t*& targetStart, wchar_t* targetEnd);
- };
- template<>
- struct WstringHelper<2>
- {
- static ConversionResult toUTF8(
- const wchar_t*& sourceStart, const wchar_t* sourceEnd,
- Byte*& targetStart, Byte* targetEnd)
- {
- return ConvertUTF16toUTF8(
- reinterpret_cast<const UTF16**>(&sourceStart),
- reinterpret_cast<const UTF16*>(sourceEnd),
- &targetStart, targetEnd, lenientConversion);
- }
-
- static ConversionResult fromUTF8(
- const Byte*& sourceStart, const Byte* sourceEnd,
- wchar_t*& targetStart, wchar_t* targetEnd)
- {
- return ConvertUTF8toUTF16(
- &sourceStart, sourceEnd,
- reinterpret_cast<UTF16**>(&targetStart),
- reinterpret_cast<UTF16*>(targetEnd), lenientConversion);
- }
- };
-
- template<>
- struct WstringHelper<4>
- {
- static ConversionResult toUTF8(
- const wchar_t*& sourceStart, const wchar_t* sourceEnd,
- Byte*& targetStart, Byte* targetEnd)
- {
- return ConvertUTF32toUTF8(
- reinterpret_cast<const UTF32**>(&sourceStart),
- reinterpret_cast<const UTF32*>(sourceEnd),
- &targetStart, targetEnd, lenientConversion);
- }
-
- static ConversionResult fromUTF8(
- const Byte*& sourceStart, const Byte* sourceEnd,
- wchar_t*& targetStart, wchar_t* targetEnd)
- {
- return ConvertUTF8toUTF32(
- &sourceStart, sourceEnd,
- reinterpret_cast<UTF32**>(&targetStart),
- reinterpret_cast<UTF32*>(targetEnd), lenientConversion);
- }
- };
-
- void
- checkResult(ConversionResult result)
+template<>
+struct WstringHelper<2> // wchar_t pointers are reinterpreted as UTF16 pointers
+{
+ static ConversionResult toUTF8(
+ const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+ Byte*& targetStart, Byte* targetEnd)
+ { // wide string -> UTF-8; forwards to ConvertUTF16toUTF8, passing both start pointers by address
+ return ConvertUTF16toUTF8(
+ reinterpret_cast<const UTF16**>(&sourceStart),
+ reinterpret_cast<const UTF16*>(sourceEnd),
+ &targetStart, targetEnd, lenientConversion);
+ }
+
+ static ConversionResult fromUTF8(
+ const Byte*& sourceStart, const Byte* sourceEnd,
+ wchar_t*& targetStart, wchar_t* targetEnd)
+ { // UTF-8 -> wide string; forwards to ConvertUTF8toUTF16, passing both start pointers by address
+ return ConvertUTF8toUTF16(
+ &sourceStart, sourceEnd,
+ reinterpret_cast<UTF16**>(&targetStart),
+ reinterpret_cast<UTF16*>(targetEnd), lenientConversion);
+ }
+};
+
+template<>
+struct WstringHelper<4> // wchar_t pointers are reinterpreted as UTF32 pointers
+{
+ static ConversionResult toUTF8(
+ const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+ Byte*& targetStart, Byte* targetEnd)
+ { // wide string -> UTF-8; forwards to ConvertUTF32toUTF8, passing both start pointers by address
+ return ConvertUTF32toUTF8(
+ reinterpret_cast<const UTF32**>(&sourceStart),
+ reinterpret_cast<const UTF32*>(sourceEnd),
+ &targetStart, targetEnd, lenientConversion);
+ }
+
+ static ConversionResult fromUTF8(
+ const Byte*& sourceStart, const Byte* sourceEnd,
+ wchar_t*& targetStart, wchar_t* targetEnd)
+ { // UTF-8 -> wide string; forwards to ConvertUTF8toUTF32, passing both start pointers by address
+ return ConvertUTF8toUTF32(
+ &sourceStart, sourceEnd,
+ reinterpret_cast<UTF32**>(&targetStart),
+ reinterpret_cast<UTF32*>(targetEnd), lenientConversion);
+ }
+};
+
+void checkResult(ConversionResult result) // returns normally for conversionOK; throws IllegalConversionException otherwise
+{
+ switch (result)
+ {
+ case conversionOK:
+ break;
+ case sourceExhausted:
+ throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted");
+ case sourceIllegal:
+ throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
+ case targetExhausted:
+ throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "target exhausted"); // message names the failing case
+ default:
{
- switch (result)
- {
- case conversionOK:
- break;
- case sourceExhausted:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source exhausted");
- case sourceIllegal:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
- case targetExhausted:
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__, "source illegal");
- default:
- {
- assert(0);
- throw IceUtil::IllegalConversionException(__FILE__, __LINE__);
- }
- }
+ assert(0); // unexpected ConversionResult value: trips in debug builds
+ throw IceUtil::IllegalConversionException(__FILE__, __LINE__); // still fails (without a reason string) when asserts are compiled out
}
+ }
+}
}
//
@@ -117,9 +107,8 @@ namespace
//
bool
-IceUtilInternal::convertUTFWstringToUTF8(
- const wchar_t*& sourceStart, const wchar_t* sourceEnd,
- Byte*& targetStart, Byte* targetEnd)
+IceUtilInternal::convertUTFWstringToUTF8(const wchar_t*& sourceStart, const wchar_t* sourceEnd,
+ Byte*& targetStart, Byte* targetEnd)
{
ConversionResult result = WstringHelper<sizeof(wchar_t)>::toUTF8(
sourceStart, sourceEnd, targetStart, targetEnd);
@@ -135,30 +124,20 @@ IceUtilInternal::convertUTFWstringToUTF8(
}
}
-
void
-IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd,
- std::wstring& target)
+IceUtilInternal::convertUTF8ToUTFWstring(const Byte*& sourceStart, const Byte* sourceEnd, std::wstring& target)
{
- //
- // Could be reimplemented without this temporary wchar_t buffer
- //
- size_t size = static_cast<size_t>(sourceEnd - sourceStart);
- wchar_t* outBuf = new wchar_t[size];
- wchar_t* targetStart = outBuf;
- wchar_t* targetEnd = targetStart + size;
-
- ConversionResult result =
- WstringHelper<sizeof(wchar_t)>::fromUTF8(
- sourceStart, sourceEnd, targetStart, targetEnd);
-
- if(result == conversionOK)
- {
- std::wstring s(outBuf, static_cast<size_t>(targetStart - outBuf));
- s.swap(target);
- }
- delete[] outBuf;
+ size_t sourceSize = static_cast<size_t>(sourceEnd - sourceStart); // number of UTF-8 bytes to decode
+
+ target.resize(sourceSize); // decode directly into target: one wchar_t per input byte, trimmed below
+ wchar_t* targetStart = const_cast<wchar_t*>(target.data());
+ wchar_t* targetEnd = targetStart + sourceSize;
+
+ ConversionResult result = WstringHelper<sizeof(wchar_t)>::fromUTF8(sourceStart, sourceEnd,
+ targetStart, targetEnd);
+
checkResult(result); // throws on failure; NOTE(review): target was already resized above, so it then holds intermediate contents
+ target.resize(targetStart - target.data()); // keep only the wchar_t written (relies on fromUTF8 advancing targetStart)
}
void