Refactor utf8_to_wide/wide_to_utf8 functions

2025-03-14 06:12:36 +01:00 · 2021-01-29 13:09:17 +01:00
parent 7ebd5da9cd
commit 5e392cf34f
3 changed files with 40 additions and 38 deletions
--- a/src/unittest/test_utilities.cpp
+++ b/src/unittest/test_utilities.cpp
@@ -302,9 +302,18 @@ void TestUtilities::testAsciiPrintableHelper()

 void TestUtilities::testUTF8()
 {
-	UASSERT(wide_to_utf8(utf8_to_wide("")) == "");
-	UASSERT(wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!"))
-		== "the shovel dug a crumbly node!");
+	UASSERT(utf8_to_wide("¤") == L"¤");
+
+	UASSERT(wide_to_utf8(L"¤") == "¤");
+
+	UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("")), "");
+	UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!")),
+		"the shovel dug a crumbly node!");
+	UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("-ä-")),
+		"-ä-");
+	UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("-\xF0\xA0\x80\x8B-")),
+		"-\xF0\xA0\x80\x8B-");
+
 }

 void TestUtilities::testRemoveEscapes()
--- a/src/util/string.cpp
+++ b/src/util/string.cpp
@@ -50,8 +50,8 @@ static bool parseNamedColorString(const std::string &value, video::SColor &color

 #ifndef _WIN32

-bool convert(const char *to, const char *from, char *outbuf,
-		size_t outbuf_size, char *inbuf, size_t inbuf_size)
+static bool convert(const char *to, const char *from, char *outbuf,
+		size_t *outbuf_size, char *inbuf, size_t inbuf_size)
 {
 	iconv_t cd = iconv_open(to, from);

@@ -60,15 +60,14 @@ bool convert(const char *to, const char *from, char *outbuf,
 #else
 	char *inbuf_ptr = inbuf;
 #endif
-
 	char *outbuf_ptr = outbuf;

 	size_t *inbuf_left_ptr = &inbuf_size;
-	size_t *outbuf_left_ptr = &outbuf_size;

+	const size_t old_outbuf_size = *outbuf_size;
 	size_t old_size = inbuf_size;
 	while (inbuf_size > 0) {
-		iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_left_ptr);
+		iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_size);
 		if (inbuf_size == old_size) {
 			iconv_close(cd);
 			return false;
@@ -77,11 +76,12 @@ bool convert(const char *to, const char *from, char *outbuf,
 	}

 	iconv_close(cd);
+	*outbuf_size = old_outbuf_size - *outbuf_size;
 	return true;
 }

 #ifdef __ANDROID__
-// Android need manual caring to support the full character set possible with wchar_t
+// On Android, iconv disagrees about how big a wchar_t is, for whatever reason
 const char *DEFAULT_ENCODING = "UTF-32LE";
 #else
 const char *DEFAULT_ENCODING = "WCHAR_T";
@@ -89,58 +89,52 @@ const char *DEFAULT_ENCODING = "WCHAR_T";

 std::wstring utf8_to_wide(const std::string &input)
 {
-	size_t inbuf_size = input.length() + 1;
+	const size_t inbuf_size = input.length();
 	// maximum possible size, every character is sizeof(wchar_t) bytes
-	size_t outbuf_size = (input.length() + 1) * sizeof(wchar_t);
+	size_t outbuf_size = input.length() * sizeof(wchar_t);

-	char *inbuf = new char[inbuf_size];
+	char *inbuf = new char[inbuf_size]; // intentionally NOT null-terminated
 	memcpy(inbuf, input.c_str(), inbuf_size);
-	char *outbuf = new char[outbuf_size];
-	memset(outbuf, 0, outbuf_size);
+	std::wstring out;
+	out.resize(outbuf_size / sizeof(wchar_t));

 #ifdef __ANDROID__
-	// Android need manual caring to support the full character set possible with wchar_t
 	SANITY_CHECK(sizeof(wchar_t) == 4);
 #endif

-	if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, outbuf_size, inbuf, inbuf_size)) {
+	char *outbuf = reinterpret_cast<char*>(&out[0]);
+	if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, &outbuf_size, inbuf, inbuf_size)) {
 		infostream << "Couldn't convert UTF-8 string 0x" << hex_encode(input)
 			<< " into wstring" << std::endl;
 		delete[] inbuf;
-		delete[] outbuf;
 		return L"<invalid UTF-8 string>";
 	}
-	std::wstring out((wchar_t *)outbuf);
-
 	delete[] inbuf;
-	delete[] outbuf;

+	out.resize(outbuf_size / sizeof(wchar_t));
 	return out;
 }

 std::string wide_to_utf8(const std::wstring &input)
 {
-	size_t inbuf_size = (input.length() + 1) * sizeof(wchar_t);
-	// maximum possible size: utf-8 encodes codepoints using 1 up to 6 bytes
-	size_t outbuf_size = (input.length() + 1) * 6;
+	const size_t inbuf_size = input.length() * sizeof(wchar_t);
+	// maximum possible size: UTF-8 encodes each code point using 1 to 4 bytes
+	size_t outbuf_size = input.length() * 4;

-	char *inbuf = new char[inbuf_size];
+	char *inbuf = new char[inbuf_size]; // intentionally NOT null-terminated
 	memcpy(inbuf, input.c_str(), inbuf_size);
-	char *outbuf = new char[outbuf_size];
-	memset(outbuf, 0, outbuf_size);
+	std::string out;
+	out.resize(outbuf_size);

-	if (!convert("UTF-8", DEFAULT_ENCODING, outbuf, outbuf_size, inbuf, inbuf_size)) {
+	if (!convert("UTF-8", DEFAULT_ENCODING, &out[0], &outbuf_size, inbuf, inbuf_size)) {
 		infostream << "Couldn't convert wstring 0x" << hex_encode(inbuf, inbuf_size)
 			<< " into UTF-8 string" << std::endl;
 		delete[] inbuf;
-		delete[] outbuf;
-		return "<invalid wstring>";
+		return "<invalid wide string>";
 	}
-	std::string out(outbuf);
-
 	delete[] inbuf;
-	delete[] outbuf;

+	out.resize(outbuf_size);
 	return out;
 }

@@ -172,15 +166,12 @@ std::string wide_to_utf8(const std::wstring &input)

 #endif // _WIN32

-// You must free the returned string!
-// The returned string is allocated using new
 wchar_t *utf8_to_wide_c(const char *str)
 {
 	std::wstring ret = utf8_to_wide(std::string(str));
 	size_t len = ret.length();
 	wchar_t *ret_c = new wchar_t[len + 1];
-	memset(ret_c, 0, (len + 1) * sizeof(wchar_t));
-	memcpy(ret_c, ret.c_str(), len * sizeof(wchar_t));
+	memcpy(ret_c, ret.c_str(), (len + 1) * sizeof(wchar_t));
 	return ret_c;
 }

--- a/src/util/string.h
+++ b/src/util/string.h
@@ -64,11 +64,13 @@ struct FlagDesc {
 	u32 flag;
 };

-// try not to convert between wide/utf8 encodings; this can result in data loss
-// try to only convert between them when you need to input/output stuff via Irrlicht
+// Try to avoid converting between wide and UTF-8 unless you need to
+// input/output stuff via Irrlicht
 std::wstring utf8_to_wide(const std::string &input);
 std::string wide_to_utf8(const std::wstring &input);

+// You must free the returned string!
+// The returned string is allocated using new[], so release it with delete[]
 wchar_t *utf8_to_wide_c(const char *str);

 // NEVER use those two functions unless you have a VERY GOOD reason to