From 5e392cf34f8e062dd0533619921223656e32598a Mon Sep 17 00:00:00 2001 From: sfan5 Date: Fri, 29 Jan 2021 13:09:17 +0100 Subject: [PATCH] Refactor utf8_to_wide/wide_to_utf8 functions --- src/unittest/test_utilities.cpp | 15 +++++++-- src/util/string.cpp | 57 ++++++++++++++------------------- src/util/string.h | 6 ++-- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/src/unittest/test_utilities.cpp b/src/unittest/test_utilities.cpp index 447b591e1..5559cdbf2 100644 --- a/src/unittest/test_utilities.cpp +++ b/src/unittest/test_utilities.cpp @@ -302,9 +302,18 @@ void TestUtilities::testAsciiPrintableHelper() void TestUtilities::testUTF8() { - UASSERT(wide_to_utf8(utf8_to_wide("")) == ""); - UASSERT(wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!")) - == "the shovel dug a crumbly node!"); + UASSERT(utf8_to_wide("¤") == L"¤"); + + UASSERT(wide_to_utf8(L"¤") == "¤"); + + UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("")), ""); + UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!")), + "the shovel dug a crumbly node!"); + UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("-ä-")), + "-ä-"); + UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("-\xF0\xA0\x80\x8B-")), + "-\xF0\xA0\x80\x8B-"); + } void TestUtilities::testRemoveEscapes() diff --git a/src/util/string.cpp b/src/util/string.cpp index 3ac3b8cf0..7e6d6d3b3 100644 --- a/src/util/string.cpp +++ b/src/util/string.cpp @@ -50,8 +50,8 @@ static bool parseNamedColorString(const std::string &value, video::SColor &color #ifndef _WIN32 -bool convert(const char *to, const char *from, char *outbuf, - size_t outbuf_size, char *inbuf, size_t inbuf_size) +static bool convert(const char *to, const char *from, char *outbuf, + size_t *outbuf_size, char *inbuf, size_t inbuf_size) { iconv_t cd = iconv_open(to, from); @@ -60,15 +60,14 @@ bool convert(const char *to, const char *from, char *outbuf, #else char *inbuf_ptr = inbuf; #endif - char *outbuf_ptr = outbuf; size_t 
*inbuf_left_ptr = &inbuf_size; - size_t *outbuf_left_ptr = &outbuf_size; + const size_t old_outbuf_size = *outbuf_size; size_t old_size = inbuf_size; while (inbuf_size > 0) { - iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_left_ptr); + iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_size); if (inbuf_size == old_size) { iconv_close(cd); return false; @@ -77,11 +76,12 @@ bool convert(const char *to, const char *from, char *outbuf, } iconv_close(cd); + *outbuf_size = old_outbuf_size - *outbuf_size; return true; } #ifdef __ANDROID__ -// Android need manual caring to support the full character set possible with wchar_t +// On Android iconv disagrees how big a wchar_t is for whatever reason const char *DEFAULT_ENCODING = "UTF-32LE"; #else const char *DEFAULT_ENCODING = "WCHAR_T"; @@ -89,58 +89,52 @@ const char *DEFAULT_ENCODING = "WCHAR_T"; std::wstring utf8_to_wide(const std::string &input) { - size_t inbuf_size = input.length() + 1; + const size_t inbuf_size = input.length(); // maximum possible size, every character is sizeof(wchar_t) bytes - size_t outbuf_size = (input.length() + 1) * sizeof(wchar_t); + size_t outbuf_size = input.length() * sizeof(wchar_t); - char *inbuf = new char[inbuf_size]; + char *inbuf = new char[inbuf_size]; // intentionally NOT null-terminated memcpy(inbuf, input.c_str(), inbuf_size); - char *outbuf = new char[outbuf_size]; - memset(outbuf, 0, outbuf_size); + std::wstring out; + out.resize(outbuf_size / sizeof(wchar_t)); #ifdef __ANDROID__ - // Android need manual caring to support the full character set possible with wchar_t SANITY_CHECK(sizeof(wchar_t) == 4); #endif - if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, outbuf_size, inbuf, inbuf_size)) { + char *outbuf = reinterpret_cast<char *>(&out[0]); + if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, &outbuf_size, inbuf, inbuf_size)) { infostream << "Couldn't convert UTF-8 string 0x" << hex_encode(input) << " into wstring" << std::endl; delete[] inbuf; - delete[] outbuf; return 
L""; } - std::wstring out((wchar_t *)outbuf); - delete[] inbuf; - delete[] outbuf; + out.resize(outbuf_size / sizeof(wchar_t)); return out; } std::string wide_to_utf8(const std::wstring &input) { - size_t inbuf_size = (input.length() + 1) * sizeof(wchar_t); - // maximum possible size: utf-8 encodes codepoints using 1 up to 6 bytes - size_t outbuf_size = (input.length() + 1) * 6; + const size_t inbuf_size = input.length() * sizeof(wchar_t); + // maximum possible size: utf-8 encodes codepoints using 1 up to 4 bytes + size_t outbuf_size = input.length() * 4; - char *inbuf = new char[inbuf_size]; + char *inbuf = new char[inbuf_size]; // intentionally NOT null-terminated memcpy(inbuf, input.c_str(), inbuf_size); - char *outbuf = new char[outbuf_size]; - memset(outbuf, 0, outbuf_size); + std::string out; + out.resize(outbuf_size); - if (!convert("UTF-8", DEFAULT_ENCODING, outbuf, outbuf_size, inbuf, inbuf_size)) { + if (!convert("UTF-8", DEFAULT_ENCODING, &out[0], &outbuf_size, inbuf, inbuf_size)) { infostream << "Couldn't convert wstring 0x" << hex_encode(inbuf, inbuf_size) << " into UTF-8 string" << std::endl; delete[] inbuf; - delete[] outbuf; - return ""; + return ""; } - std::string out(outbuf); - delete[] inbuf; - delete[] outbuf; + out.resize(outbuf_size); return out; } @@ -172,15 +166,12 @@ std::string wide_to_utf8(const std::wstring &input) #endif // _WIN32 -// You must free the returned string! 
-// The returned string is allocated using new wchar_t *utf8_to_wide_c(const char *str) { std::wstring ret = utf8_to_wide(std::string(str)); size_t len = ret.length(); wchar_t *ret_c = new wchar_t[len + 1]; - memset(ret_c, 0, (len + 1) * sizeof(wchar_t)); - memcpy(ret_c, ret.c_str(), len * sizeof(wchar_t)); + memcpy(ret_c, ret.c_str(), (len + 1) * sizeof(wchar_t)); return ret_c; } diff --git a/src/util/string.h b/src/util/string.h index 6fd11fadc..ec14e9a2d 100644 --- a/src/util/string.h +++ b/src/util/string.h @@ -64,11 +64,13 @@ struct FlagDesc { u32 flag; }; -// try not to convert between wide/utf8 encodings; this can result in data loss -// try to only convert between them when you need to input/output stuff via Irrlicht +// Try to avoid converting between wide and UTF-8 unless you need to +// input/output stuff via Irrlicht std::wstring utf8_to_wide(const std::string &input); std::string wide_to_utf8(const std::wstring &input); +// You must free the returned string! +// The returned string is allocated using new[] wchar_t *utf8_to_wide_c(const char *str); // NEVER use those two functions unless you have a VERY GOOD reason to