Fix character encoding conversion issues

This commit is contained in:
Gregor Parzefall 2023-10-05 17:55:55 +02:00 committed by sfan5
parent 93eebed8c9
commit c766c3a023
6 changed files with 60 additions and 79 deletions

@ -168,22 +168,32 @@ static void test_methods()
// Conversion test: three U+2020 characters must come out as three wide characters after a
// narrow -> wide conversion, and as nine bytes after a wide -> narrow conversion.
// The updated version additionally repeats the narrow -> wide check through the
// locale-dependent multibyteToWString after selecting a UTF-8 LC_CTYPE.
static void test_conv() static void test_conv()
{ {
// assumes Unicode and UTF-8 locale // locale-independent
setlocale(LC_CTYPE, "");
stringw out; stringw out;
multibyteToWString(out, "†††"); utf8ToWString(out, "†††");
UASSERTEQ(out.size(), 3); UASSERTEQ(out.size(), 3);
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
UASSERTEQ(static_cast<u16>(out[i]), 0x2020); UASSERTEQ(static_cast<u16>(out[i]), 0x2020);
stringc out2; stringc out2;
wStringToMultibyte(out2, L"†††"); wStringToUTF8(out2, L"†††");
// each of the three characters is expected as the byte sequence e2 80 a0, hence 9 bytes in total
UASSERTEQ(out2.size(), 9); UASSERTEQ(out2.size(), 9);
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
UASSERTEQ(static_cast<u8>(out2[3*i]), 0xe2); UASSERTEQ(static_cast<u8>(out2[3*i]), 0xe2);
UASSERTEQ(static_cast<u8>(out2[3*i+1]), 0x80); UASSERTEQ(static_cast<u8>(out2[3*i+1]), 0x80);
UASSERTEQ(static_cast<u8>(out2[3*i+2]), 0xa0); UASSERTEQ(static_cast<u8>(out2[3*i+2]), 0xa0);
} }
// locale-dependent
// NOTE(review): the result of the fallback setlocale() call is not checked; if neither
// locale name is accepted, the assertions below run under whatever LC_CTYPE is current.
if (!setlocale(LC_CTYPE, "C.UTF-8"))
setlocale(LC_CTYPE, "UTF-8"); // macOS
stringw out3;
multibyteToWString(out3, "†††");
UASSERTEQ(out3.size(), 3);
for (int i = 0; i < 3; i++)
UASSERTEQ(static_cast<u16>(out3[i]), 0x2020);
} }
void test_irr_string() void test_irr_string()

@ -11,6 +11,8 @@
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <cwchar> #include <cwchar>
#include <codecvt>
#include <locale>
namespace irr namespace irr
{ {
@ -35,8 +37,13 @@ outside the string class for explicit use.
// forward declarations // forward declarations
template <typename T> template <typename T>
class string; class string;
// In the updated version the stringc/stringw typedefs are declared here, right after the
// forward declaration of string<T>, instead of after the class definition; presumably so
// that the friend declarations inside the class can be written in terms of them.
static size_t multibyteToWString(string<wchar_t>& destination, const char* source, u32 sourceSize);
static size_t wStringToMultibyte(string<c8>& destination, const wchar_t* source, u32 sourceSize); //! Typedef for character strings
typedef string<c8> stringc;
//! Typedef for wide character strings
typedef string<wchar_t> stringw;
//! Returns a character converted to lower case //! Returns a character converted to lower case
static inline u32 locale_lower ( u32 x ) static inline u32 locale_lower ( u32 x )
@ -859,8 +866,11 @@ public:
return ret.size()-oldSize; return ret.size()-oldSize;
} }
friend size_t multibyteToWString(string<wchar_t>& destination, const char* source, u32 sourceSize); // This function should not be used and is only kept for "CGUIFileOpenDialog::pathToStringW".
friend size_t wStringToMultibyte(string<c8>& destination, const wchar_t* source, u32 sourceSize); friend size_t multibyteToWString(stringw& destination, const stringc &source);
friend size_t utf8ToWString(stringw &destination, const char *source);
friend size_t wStringToUTF8(stringc &destination, const wchar_t *source);
private: private:
@ -913,38 +923,18 @@ private:
}; };
//! Typedef for character strings
typedef string<c8> stringc;
//! Typedef for wide character strings
typedef string<wchar_t> stringw;
//! Convert multibyte string to wide-character string //! Convert multibyte string to wide-character string
/** Wrapper around mbstowcs from standard library, but directly using Irrlicht string class. /** Wrapper around mbstowcs from standard library, but directly using Irrlicht string class.
What the function does exactly depends on the LC_CTYPE of the current c locale. What the function does exactly depends on the LC_CTYPE of the current c locale.
\param destination Wide-character string receiving the converted source \param destination Wide-character string receiving the converted source
\param source multibyte string \param source multibyte string
\return The number of wide characters written to destination, not including the eventual terminating null character or -1 when conversion failed */ \return The number of wide characters written to destination, not including the eventual terminating null character or -1 when conversion failed
static inline size_t multibyteToWString(string<wchar_t>& destination, const core::string<c8>& source)
{
return multibyteToWString(destination, source.c_str(), (u32)source.size());
}
//! Convert multibyte string to wide-character string This function should not be used and is only kept for "CGUIFileOpenDialog::pathToStringW". */
/** Wrapper around mbstowcs from standard library, but directly writing to Irrlicht string class. inline size_t multibyteToWString(stringw& destination, const core::stringc& source)
What the function does exactly depends on the LC_CTYPE of the current c locale.
\param destination Wide-character string receiving the converted source
\param source multibyte string
\return The number of wide characters written to destination, not including the eventual terminating null character or -1 when conversion failed. */
static inline size_t multibyteToWString(string<wchar_t>& destination, const char* source)
{ {
const u32 s = source ? (u32)strlen(source) : 0; u32 sourceSize = source.size();
return multibyteToWString(destination, source, s);
}
//! Internally used by the other multibyteToWString functions
static size_t multibyteToWString(string<wchar_t>& destination, const char* source, u32 sourceSize)
{
if ( sourceSize ) if ( sourceSize )
{ {
destination.str.resize(sourceSize+1); destination.str.resize(sourceSize+1);
@ -952,7 +942,7 @@ static size_t multibyteToWString(string<wchar_t>& destination, const char* sourc
#pragma warning(push) #pragma warning(push)
#pragma warning(disable: 4996) // 'mbstowcs': This function or variable may be unsafe. Consider using mbstowcs_s instead. #pragma warning(disable: 4996) // 'mbstowcs': This function or variable may be unsafe. Consider using mbstowcs_s instead.
#endif #endif
const size_t written = mbstowcs(&destination[0], source, (size_t)sourceSize); const size_t written = mbstowcs(&destination[0], source.c_str(), (size_t)sourceSize);
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(pop) #pragma warning(pop)
#endif #endif
@ -975,50 +965,29 @@ static size_t multibyteToWString(string<wchar_t>& destination, const char* sourc
} }
} }
//! Same as multibyteToWString, but the other way around
//! utf8ToWString (added): convert a UTF-8 encoded C string to a wide-character string
/** Performs the conversion with std::wstring_convert<std::codecvt_utf8<wchar_t>> and assigns the result to destination.
\param destination Wide-character string receiving the converted source
\param source UTF-8 encoded, null-terminated string
\return The size of destination after the conversion
NOTE(review): source is handed to from_bytes() without a null check, whereas the removed
const char* overloads tested the pointer before measuring it; std::wstring_convert is also
documented to throw std::range_error on input it cannot convert. Confirm every caller passes
valid, non-null UTF-8 or is prepared for the exception.
NOTE(review): where wchar_t is 16 bits wide, std::codecvt_utf8<wchar_t> presumably covers only
UCS-2 (no surrogate pairs) - verify against the standard library documentation. */
static inline size_t wStringToMultibyte(string<c8>& destination, const core::string<wchar_t>& source) inline size_t utf8ToWString(stringw &destination, const char *source)
{ {
return wStringToMultibyte(destination, source.c_str(), (u32)source.size()); std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
destination = conv.from_bytes(source);
return destination.size();
} }
//! utf8ToWString (added, stringc overload): forwards source.c_str() to the const char* overload and returns its result
//! Same as multibyteToWString, but the other way around inline size_t utf8ToWString(stringw &destination, const stringc &source)
static inline size_t wStringToMultibyte(string<c8>& destination, const wchar_t* source)
{
const u32 s = source ? (u32)wcslen(source) : 0; return utf8ToWString(destination, source.c_str());
return wStringToMultibyte(destination, source, s);
} }
//! wStringToUTF8 (added): convert a wide-character string to a UTF-8 encoded string
/** The const wchar_t* overload converts with std::wstring_convert<std::codecvt_utf8<wchar_t>>::to_bytes
and assigns the result to destination; the stringw overload forwards source.c_str() to it.
\param destination Character string receiving the converted source
\param source Null-terminated wide-character string (or a stringw for the second overload)
\return The size of destination after the conversion
NOTE(review): the removed wStringToMultibyte returned 0 for an empty source, and on a failed
wcstombs call cleared destination and returned (size_t)-1. The replacement has no such
branches: source is passed to to_bytes() unchecked, and std::wstring_convert is documented to
report conversion failure by throwing std::range_error - confirm callers can cope with that. */
//! Same as multibyteToWString, but the other way around inline size_t wStringToUTF8(stringc &destination, const wchar_t *source)
static size_t wStringToMultibyte(string<c8>& destination, const wchar_t* source, u32 sourceSize)
{ {
if ( sourceSize ) std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
{ destination = conv.to_bytes(source);
destination.str.resize(sizeof(wchar_t)*sourceSize+1); return destination.size();
#if defined(_MSC_VER) }
#pragma warning(push)
#pragma warning(disable: 4996) // 'wcstombs': This function or variable may be unsafe. Consider using wcstombs_s instead. inline size_t wStringToUTF8(stringc &destination, const stringw &source)
#endif {
const size_t written = wcstombs(&destination[0], source, destination.size()); return wStringToUTF8(destination, source.c_str());
#if defined(_MSC_VER) }
#pragma warning(pop)
#endif
if ( written != (size_t)-1 )
{
destination.str.resize(written);
}
else
{
// Likely character which got converted until the invalid character was encountered are in destination now.
// And it seems even 0-terminated, but I found no documentation anywhere that this (the 0-termination) is guaranteed :-(
destination.clear();
}
return written;
}
else
{
destination.clear();
return 0;
}
} }

@ -299,7 +299,7 @@ bool CGUIEditBox::processKey(const SEvent& event)
const s32 realmend = MarkBegin < MarkEnd ? MarkEnd : MarkBegin; const s32 realmend = MarkBegin < MarkEnd ? MarkEnd : MarkBegin;
core::stringc s; core::stringc s;
wStringToMultibyte(s, Text.subString(realmbgn, realmend - realmbgn)); wStringToUTF8(s, Text.subString(realmbgn, realmend - realmbgn));
Operator->copyToClipboard(s.c_str()); Operator->copyToClipboard(s.c_str());
} }
break; break;
@ -312,7 +312,7 @@ bool CGUIEditBox::processKey(const SEvent& event)
// copy // copy
core::stringc sc; core::stringc sc;
wStringToMultibyte(sc, Text.subString(realmbgn, realmend - realmbgn)); wStringToUTF8(sc, Text.subString(realmbgn, realmend - realmbgn));
Operator->copyToClipboard(sc.c_str()); Operator->copyToClipboard(sc.c_str());
if (isEnabled()) if (isEnabled())
@ -345,7 +345,7 @@ bool CGUIEditBox::processKey(const SEvent& event)
if (p) if (p)
{ {
irr::core::stringw widep; irr::core::stringw widep;
core::multibyteToWString(widep, p); core::utf8ToWString(widep, p);
if (MarkBegin == MarkEnd) if (MarkBegin == MarkEnd)
{ {
@ -1157,7 +1157,7 @@ bool CGUIEditBox::processMouse(const SEvent& event)
const c8 *inserted_text_utf8 = Operator->getTextFromPrimarySelection(); const c8 *inserted_text_utf8 = Operator->getTextFromPrimarySelection();
if (!inserted_text_utf8) if (!inserted_text_utf8)
return inserted_text; return inserted_text;
core::multibyteToWString(inserted_text, inserted_text_utf8); core::utf8ToWString(inserted_text, inserted_text_utf8);
return inserted_text; return inserted_text;
}()); }());
@ -1659,7 +1659,7 @@ void CGUIEditBox::setTextMarkers(s32 begin, s32 end)
const s32 realmend = MarkBegin < MarkEnd ? MarkEnd : MarkBegin; const s32 realmend = MarkBegin < MarkEnd ? MarkEnd : MarkBegin;
core::stringc s; core::stringc s;
wStringToMultibyte(s, Text.subString(realmbgn, realmend - realmbgn)); wStringToUTF8(s, Text.subString(realmbgn, realmend - realmbgn));
Operator->copyToPrimarySelection(s.c_str()); Operator->copyToPrimarySelection(s.c_str());
} }

@ -673,7 +673,7 @@ bool CIrrDeviceSDL::run()
{ {
irrevent.EventType = irr::EET_STRING_INPUT_EVENT; irrevent.EventType = irr::EET_STRING_INPUT_EVENT;
irrevent.StringInput.Str = new core::stringw(); irrevent.StringInput.Str = new core::stringw();
irr::core::multibyteToWString(*irrevent.StringInput.Str, SDL_event.text.text); irr::core::utf8ToWString(*irrevent.StringInput.Str, SDL_event.text.text);
postEventFromUser(irrevent); postEventFromUser(irrevent);
delete irrevent.StringInput.Str; delete irrevent.StringInput.Str;
irrevent.StringInput.Str = NULL; irrevent.StringInput.Str = NULL;
@ -928,7 +928,7 @@ void CIrrDeviceSDL::sleep(u32 timeMs, bool pauseTimer)
//! Sets the window title from a wide-character caption.
// The caption is first converted into a stringc, whose c_str() is then passed to
// SDL_SetWindowTitle together with the Window member.
// NOTE(review): presumably SDL expects the title as UTF-8, which is what the updated call
// produces - confirm against the SDL documentation. text is forwarded to the conversion
// helper without a null check.
void CIrrDeviceSDL::setWindowCaption(const wchar_t* text) void CIrrDeviceSDL::setWindowCaption(const wchar_t* text)
{ {
core::stringc textc; core::stringc textc;
core::wStringToMultibyte(textc, text); core::wStringToUTF8(textc, text);
SDL_SetWindowTitle(Window, textc.c_str()); SDL_SetWindowTitle(Window, textc.c_str());
} }

@ -50,6 +50,8 @@ elseif(MSVC)
if(CMAKE_SIZEOF_VOID_P EQUAL 4) if(CMAKE_SIZEOF_VOID_P EQUAL 4)
add_compile_options(/arch:SSE) add_compile_options(/arch:SSE)
endif() endif()
add_compile_options(/D_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
endif() endif()
# Sanity-check version # Sanity-check version

@ -94,7 +94,7 @@ void COSOperator::copyToClipboard(const c8 *text) const
EmptyClipboard(); EmptyClipboard();
core::stringw tempbuffer; core::stringw tempbuffer;
core::multibyteToWString(tempbuffer, text); core::utf8ToWString(tempbuffer, text);
const u32 size = (tempbuffer.size() + 1) * sizeof(wchar_t); const u32 size = (tempbuffer.size() + 1) * sizeof(wchar_t);
HGLOBAL clipbuffer; HGLOBAL clipbuffer;
@ -164,7 +164,7 @@ const c8* COSOperator::getTextFromClipboard() const
HANDLE hData = GetClipboardData( CF_UNICODETEXT ); HANDLE hData = GetClipboardData( CF_UNICODETEXT );
buffer = (wchar_t*) GlobalLock( hData ); buffer = (wchar_t*) GlobalLock( hData );
core::wStringToMultibyte(ClipboardBuf, buffer); core::wStringToUTF8(ClipboardBuf, buffer);
GlobalUnlock( hData ); GlobalUnlock( hData );
CloseClipboard(); CloseClipboard();