Fix character encoding conversion issues

2023-10-05 17:55:55 +02:00 · 2023-10-05 17:55:55 +02:00 · c766c3a023
commit c766c3a023
parent 93eebed8c9
6 changed files with 60 additions and 79 deletions
--- a/examples/AutomatedTest/test_string.cpp
+++ b/examples/AutomatedTest/test_string.cpp
@ -168,22 +168,32 @@ static void test_methods()

 static void test_conv()
 {
-	// assumes Unicode and UTF-8 locale
-	setlocale(LC_CTYPE, "");
+	// locale-independent: the UTF-8 helpers are exercised before any setlocale() call

 	stringw out;
-	multibyteToWString(out, "†††");
+	utf8ToWString(out, "†††");
 	UASSERTEQ(out.size(), 3);
 	for (int i = 0; i < 3; i++)
 		UASSERTEQ(static_cast<u16>(out[i]), 0x2020);
+
 	stringc out2;
-	wStringToMultibyte(out2, L"†††");
+	wStringToUTF8(out2, L"†††");
 	UASSERTEQ(out2.size(), 9);
 	for (int i = 0; i < 3; i++) {
 		UASSERTEQ(static_cast<u8>(out2[3*i]), 0xe2);
 		UASSERTEQ(static_cast<u8>(out2[3*i+1]), 0x80);
 		UASSERTEQ(static_cast<u8>(out2[3*i+2]), 0xa0);
 	}
+
+	// locale-dependent: multibyteToWString goes through mbstowcs, so select a UTF-8 LC_CTYPE first
+	if (!setlocale(LC_CTYPE, "C.UTF-8"))
+		setlocale(LC_CTYPE, "UTF-8"); // fallback name for macOS; NOTE(review): result is unchecked
+
+	stringw out3;
+	multibyteToWString(out3, "†††");
+	UASSERTEQ(out3.size(), 3);
+	for (int i = 0; i < 3; i++)
+		UASSERTEQ(static_cast<u16>(out3[i]), 0x2020);
 }

 void test_irr_string()
--- a/include/irrString.h
+++ b/include/irrString.h
@ -11,6 +11,8 @@
 #include <cstdio>
 #include <cstring>
 #include <cwchar>
+#include <codecvt>
+#include <locale>

 namespace irr
 {
@ -35,8 +37,13 @@ outside the string class for explicit use.
 // forward declarations
 template <typename T>
 class string;
-static size_t multibyteToWString(string<wchar_t>& destination, const char* source, u32 sourceSize);
-static size_t wStringToMultibyte(string<c8>& destination, const wchar_t* source, u32 sourceSize);
+
+//! Typedef for character strings
+typedef string<c8> stringc;
+
+//! Typedef for wide character strings
+typedef string<wchar_t> stringw;
+

 //! Returns a character converted to lower case
 static inline u32 locale_lower ( u32 x )
@ -859,8 +866,11 @@ public:
 		return ret.size()-oldSize;
 	}

-	friend size_t multibyteToWString(string<wchar_t>& destination, const char* source, u32 sourceSize);
-	friend size_t wStringToMultibyte(string<c8>& destination, const wchar_t* source, u32 sourceSize);
+	// This function should not be used and is only kept for "CGUIFileOpenDialog::pathToStringW".
+	friend size_t multibyteToWString(stringw& destination, const stringc &source);
+
+	friend size_t utf8ToWString(stringw &destination, const char *source);
+	friend size_t wStringToUTF8(stringc &destination, const wchar_t *source);

 private:

@ -913,38 +923,18 @@ private:
 };


-//! Typedef for character strings
-typedef string<c8> stringc;
-
-//! Typedef for wide character strings
-typedef string<wchar_t> stringw;
-
 //! Convert multibyte string to wide-character string
 /** Wrapper around mbstowcs from standard library, but directly using Irrlicht string class.
 What the function does exactly depends on the LC_CTYPE of the current c locale.
 \param destination Wide-character string receiving the converted source
 \param source multibyte string
-\return The number of wide characters written to destination, not including the eventual terminating null character or -1 when conversion failed */
-static inline size_t multibyteToWString(string<wchar_t>& destination, const core::string<c8>& source)
-{
-	return multibyteToWString(destination, source.c_str(), (u32)source.size());
-}
+\return The number of wide characters written to destination, not including the eventual terminating null character or -1 when conversion failed

-//! Convert multibyte string to wide-character string
-/** Wrapper around mbstowcs from standard library, but directly writing to Irrlicht string class.
-What the function does exactly depends on the LC_CTYPE of the current c locale.
-\param destination Wide-character string receiving the converted source
-\param source multibyte string
-\return The number of wide characters written to destination, not including the eventual terminating null character  or -1 when conversion failed. */
-static inline size_t multibyteToWString(string<wchar_t>& destination, const char* source)
+This function should not be used and is only kept for "CGUIFileOpenDialog::pathToStringW". */
+inline size_t multibyteToWString(stringw& destination, const core::stringc& source)
 {
-	const u32 s = source ? (u32)strlen(source) : 0;
-	return multibyteToWString(destination, source, s);
-}
+	u32 sourceSize = source.size();

-//! Internally used by the other multibyteToWString functions
-static size_t multibyteToWString(string<wchar_t>& destination, const char* source, u32 sourceSize)
-{
 	if ( sourceSize )
 	{
 		destination.str.resize(sourceSize+1);
@ -952,7 +942,7 @@ static size_t multibyteToWString(string<wchar_t>& destination, const char* sourc
 #pragma warning(push)
 #pragma warning(disable: 4996)	// 'mbstowcs': This function or variable may be unsafe. Consider using mbstowcs_s instead.
 #endif
-		const size_t written = mbstowcs(&destination[0], source, (size_t)sourceSize);
+		const size_t written = mbstowcs(&destination[0], source.c_str(), (size_t)sourceSize);
 #if defined(_MSC_VER)
 #pragma warning(pop)
 #endif
@ -975,50 +965,29 @@ static size_t multibyteToWString(string<wchar_t>& destination, const char* sourc
 	}
 }

-//! Same as multibyteToWString, but the other way around
-static inline size_t wStringToMultibyte(string<c8>& destination, const core::string<wchar_t>& source)
+//! Convert UTF-8 to wide characters and return the resulting length; a null source yields an empty string, invalid input makes std::wstring_convert throw std::range_error.
+inline size_t utf8ToWString(stringw &destination, const char *source)
 {
-	return wStringToMultibyte(destination, source.c_str(), (u32)source.size());
+	std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+	destination = conv.from_bytes(source ? source : ""); // treat null as empty; from_bytes(nullptr) is undefined behaviour
+	return destination.size();
 }

-//! Same as multibyteToWString, but the other way around
-static inline size_t wStringToMultibyte(string<c8>& destination, const wchar_t* source)
+inline size_t utf8ToWString(stringw &destination, const stringc &source)
 {
-	const u32 s = source ? (u32)wcslen(source) : 0;
-	return wStringToMultibyte(destination, source, s);
+	return utf8ToWString(destination, source.c_str()); // forwards to the const char* overload
 }

-//! Same as multibyteToWString, but the other way around
-static size_t wStringToMultibyte(string<c8>& destination, const wchar_t* source, u32 sourceSize)
+inline size_t wStringToUTF8(stringc &destination, const wchar_t *source)
 {
-	if ( sourceSize )
-	{
-		destination.str.resize(sizeof(wchar_t)*sourceSize+1);
-#if defined(_MSC_VER)
-#pragma warning(push)
-#pragma warning(disable: 4996)	// 'wcstombs': This function or variable may be unsafe. Consider using wcstombs_s instead.
-#endif
-		const size_t written = wcstombs(&destination[0], source, destination.size());
-#if defined(_MSC_VER)
-#pragma warning(pop)
-#endif
-		if ( written != (size_t)-1 )
-		{
-			destination.str.resize(written);
-		}
-		else
-		{
-			// Likely character which got converted until the invalid character was encountered are in destination now.
-			// And it seems even 0-terminated, but I found no documentation anywhere that this (the 0-termination) is guaranteed :-(
-			destination.clear();
-		}
-		return written;
-	}
-	else
-	{
-		destination.clear();
-		return 0;
-	}
+	std::wstring_convert<std::codecvt_utf8<wchar_t>> conv; // wide -> UTF-8, independent of the C locale
+	destination = conv.to_bytes(source ? source : L""); // treat null as empty; to_bytes(nullptr) is undefined behaviour
+	return destination.size(); // number of UTF-8 bytes stored in destination
+}
+
+inline size_t wStringToUTF8(stringc &destination, const stringw &source)
+{
+	return wStringToUTF8(destination, source.c_str()); // forwards to the const wchar_t* overload
 }


--- a/source/Irrlicht/CGUIEditBox.cpp
+++ b/source/Irrlicht/CGUIEditBox.cpp
@ -299,7 +299,7 @@ bool CGUIEditBox::processKey(const SEvent& event)
 				const s32 realmend = MarkBegin < MarkEnd ? MarkEnd : MarkBegin;

 				core::stringc s;
-				wStringToMultibyte(s, Text.subString(realmbgn, realmend - realmbgn));
+				wStringToUTF8(s, Text.subString(realmbgn, realmend - realmbgn));
 				Operator->copyToClipboard(s.c_str());
 			}
 			break;
@ -312,7 +312,7 @@ bool CGUIEditBox::processKey(const SEvent& event)

 				// copy
 				core::stringc sc;
-				wStringToMultibyte(sc, Text.subString(realmbgn, realmend - realmbgn));
+				wStringToUTF8(sc, Text.subString(realmbgn, realmend - realmbgn));
 				Operator->copyToClipboard(sc.c_str());

 				if (isEnabled())
@ -345,7 +345,7 @@ bool CGUIEditBox::processKey(const SEvent& event)
 				if (p)
 				{
 					irr::core::stringw widep;
-					core::multibyteToWString(widep, p);
+					core::utf8ToWString(widep, p);

 					if (MarkBegin == MarkEnd)
 					{
@ -1157,7 +1157,7 @@ bool CGUIEditBox::processMouse(const SEvent& event)
 			const c8 *inserted_text_utf8 = Operator->getTextFromPrimarySelection();
 			if (!inserted_text_utf8)
 				return inserted_text;
-			core::multibyteToWString(inserted_text, inserted_text_utf8);
+			core::utf8ToWString(inserted_text, inserted_text_utf8);
 			return inserted_text;
 		}());

@ -1659,7 +1659,7 @@ void CGUIEditBox::setTextMarkers(s32 begin, s32 end)
 			const s32 realmend = MarkBegin < MarkEnd ? MarkEnd : MarkBegin;

 			core::stringc s;
-			wStringToMultibyte(s, Text.subString(realmbgn, realmend - realmbgn));
+			wStringToUTF8(s, Text.subString(realmbgn, realmend - realmbgn));
 			Operator->copyToPrimarySelection(s.c_str());
 		}

--- a/source/Irrlicht/CIrrDeviceSDL.cpp
+++ b/source/Irrlicht/CIrrDeviceSDL.cpp
@ -673,7 +673,7 @@ bool CIrrDeviceSDL::run()
 			{
 				irrevent.EventType = irr::EET_STRING_INPUT_EVENT;
 				irrevent.StringInput.Str = new core::stringw();
-				irr::core::multibyteToWString(*irrevent.StringInput.Str, SDL_event.text.text);
+				irr::core::utf8ToWString(*irrevent.StringInput.Str, SDL_event.text.text);
 				postEventFromUser(irrevent);
 				delete irrevent.StringInput.Str;
 				irrevent.StringInput.Str = NULL;
@ -928,7 +928,7 @@ void CIrrDeviceSDL::sleep(u32 timeMs, bool pauseTimer)
 void CIrrDeviceSDL::setWindowCaption(const wchar_t* text)
 {
 	core::stringc textc;
-	core::wStringToMultibyte(textc, text);
+	core::wStringToUTF8(textc, text);
 	SDL_SetWindowTitle(Window, textc.c_str());
 }

--- a/source/Irrlicht/CMakeLists.txt
+++ b/source/Irrlicht/CMakeLists.txt
@ -50,6 +50,8 @@ elseif(MSVC)
 	if(CMAKE_SIZEOF_VOID_P EQUAL 4)
 		add_compile_options(/arch:SSE)
 	endif()
+
+	add_compile_options(/D_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
 endif()

 # Sanity-check version
--- a/source/Irrlicht/COSOperator.cpp
+++ b/source/Irrlicht/COSOperator.cpp
@ -94,7 +94,7 @@ void COSOperator::copyToClipboard(const c8 *text) const
 	EmptyClipboard();

 	core::stringw tempbuffer;
-	core::multibyteToWString(tempbuffer, text);
+	core::utf8ToWString(tempbuffer, text);
 	const u32 size = (tempbuffer.size() + 1) * sizeof(wchar_t);

 	HGLOBAL clipbuffer;
@ -164,7 +164,7 @@ const c8* COSOperator::getTextFromClipboard() const
 	HANDLE hData = GetClipboardData( CF_UNICODETEXT );
 	buffer = (wchar_t*) GlobalLock( hData );

-	core::wStringToMultibyte(ClipboardBuf, buffer);
+	core::wStringToUTF8(ClipboardBuf, buffer);

 	GlobalUnlock( hData );
 	CloseClipboard();