diff --git a/include/irrUString.h b/include/irrUString.h new file mode 100644 index 0000000..09172ee --- /dev/null +++ b/include/irrUString.h @@ -0,0 +1,3891 @@ +/* + Basic Unicode string class for Irrlicht. + Copyright (c) 2009-2011 John Norman + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any + damages arising from the use of this software. + + Permission is granted to anyone to use this software for any + purpose, including commercial applications, and to alter it and + redistribute it freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you + must not claim that you wrote the original software. If you use + this software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and + must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. + + The original version of this class can be located at: + http://irrlicht.suckerfreegames.com/ + + John Norman + john@suckerfreegames.com +*/ + +#pragma once + +#if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__) +# define USTRING_CPP0X +# if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))) +# define USTRING_CPP0X_NEWLITERALS +# endif +#endif + +#include +#include +#include +#include + +#ifdef _WIN32 +#define __BYTE_ORDER 0 +#define __LITTLE_ENDIAN 0 +#define __BIG_ENDIAN 1 +#elif defined(__MACH__) && defined(__APPLE__) +#include +#elif defined(__FreeBSD__) || defined(__DragonFly__) +#include +#else +#include +#endif + +#ifdef USTRING_CPP0X +# include +#endif + +#ifndef USTRING_NO_STL +# include +# include +# include +#endif + +#include "irrTypes.h" +#include "irrAllocator.h" +#include "irrArray.h" +#include "irrMath.h" +#include "irrString.h" +#include "path.h" + +//! UTF-16 surrogate start values. +static const irr::u16 UTF16_HI_SURROGATE = 0xD800; +static const irr::u16 UTF16_LO_SURROGATE = 0xDC00; + +//! Is a UTF-16 code point a surrogate? +#define UTF16_IS_SURROGATE(c) (((c) & 0xF800) == 0xD800) +#define UTF16_IS_SURROGATE_HI(c) (((c) & 0xFC00) == 0xD800) +#define UTF16_IS_SURROGATE_LO(c) (((c) & 0xFC00) == 0xDC00) + + +namespace irr +{ + + // Define our character types. +#ifdef USTRING_CPP0X_NEWLITERALS // C++0x + typedef char32_t uchar32_t; + typedef char16_t uchar16_t; + typedef char uchar8_t; +#else + typedef u32 uchar32_t; + typedef u16 uchar16_t; + typedef u8 uchar8_t; +#endif + +namespace core +{ + +namespace unicode +{ + +//! The unicode replacement character. Used to replace invalid characters. +const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD; + +//! Convert a UTF-16 surrogate pair into a UTF-32 character. +//! \param high The high value of the pair. +//! \param low The low value of the pair. +//! \return The UTF-32 character expressed by the surrogate pair. +inline uchar32_t toUTF32(uchar16_t high, uchar16_t low) +{ + // Convert the surrogate pair into a single UTF-32 character. + uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1)); + uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1; + return (wu << 16) | x; +} + +//! Swaps the endianness of a 16-bit value. +//! \return The new value. +inline uchar16_t swapEndian16(const uchar16_t& c) +{ + return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00); +} + +//! Swaps the endianness of a 32-bit value. +//! \return The new value. +inline uchar32_t swapEndian32(const uchar32_t& c) +{ + return ((c >> 24) & 0x000000FF) | + ((c >> 8) & 0x0000FF00) | + ((c << 8) & 0x00FF0000) | + ((c << 24) & 0xFF000000); +} + +//! The Unicode byte order mark. +const u16 BOM = 0xFEFF; + +//! The size of the Unicode byte order mark in terms of the Unicode character size. +const u8 BOM_UTF8_LEN = 3; +const u8 BOM_UTF16_LEN = 1; +const u8 BOM_UTF32_LEN = 1; + +//! Unicode byte order marks for file operations. +const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF }; +const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF }; +const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE }; +const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF }; +const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 }; + +//! The size in bytes of the Unicode byte marks for file operations. +const u8 BOM_ENCODE_UTF8_LEN = 3; +const u8 BOM_ENCODE_UTF16_LEN = 2; +const u8 BOM_ENCODE_UTF32_LEN = 4; + +//! Unicode encoding type. +enum EUTF_ENCODE +{ + EUTFE_NONE = 0, + EUTFE_UTF8, + EUTFE_UTF16, + EUTFE_UTF16_LE, + EUTFE_UTF16_BE, + EUTFE_UTF32, + EUTFE_UTF32_LE, + EUTFE_UTF32_BE +}; + +//! Unicode endianness. +enum EUTF_ENDIAN +{ + EUTFEE_NATIVE = 0, + EUTFEE_LITTLE, + EUTFEE_BIG +}; + +//! Returns the specified unicode byte order mark in a byte array. +//! The byte order mark is the first few bytes in a text file that signifies its encoding. +/** \param mode The Unicode encoding method that we want to get the byte order mark for. + If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/ +//! \return An array that contains a byte order mark. +inline core::array getUnicodeBOM(EUTF_ENCODE mode) +{ +#define COPY_ARRAY(source, size) \ + memcpy(ret.pointer(), source, size); \ + ret.set_used(size) + + core::array ret(4); + switch (mode) + { + case EUTFE_UTF8: + COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN); + break; + case EUTFE_UTF16: + #ifdef __BIG_ENDIAN__ + COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN); + #else + COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN); + #endif + break; + case EUTFE_UTF16_BE: + COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN); + break; + case EUTFE_UTF16_LE: + COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN); + break; + case EUTFE_UTF32: + #ifdef __BIG_ENDIAN__ + COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN); + #else + COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN); + #endif + break; + case EUTFE_UTF32_BE: + COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN); + break; + case EUTFE_UTF32_LE: + COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN); + break; + case EUTFE_NONE: + // TODO sapier: fixed warning only, + // don't know if something needs to be done here + break; + } + return ret; + +#undef COPY_ARRAY +} + +//! Detects if the given data stream starts with a unicode BOM. +//! \param data The data stream to check. +//! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found. +inline EUTF_ENCODE determineUnicodeBOM(const char* data) +{ + if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8; + if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE; + if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE; + if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE; + if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE; + return EUTFE_NONE; +} + +} // end namespace unicode + + +//! UTF-16 string class. +template > +class ustring16 +{ +public: + + ///------------------/// + /// iterator classes /// + ///------------------/// + + //! Access an element in a unicode string, allowing one to change it. + class _ustring16_iterator_access + { + public: + _ustring16_iterator_access(const ustring16* s, u32 p) : ref(s), pos(p) {} + + //! Allow the class to be interpreted as a single UTF-32 character. + operator uchar32_t() const + { + return _get(); + } + + //! Allow one to change the character in the unicode string. + //! \param c The new character to use. + //! \return Myself. + _ustring16_iterator_access& operator=(const uchar32_t c) + { + _set(c); + return *this; + } + + //! Increments the value by 1. + //! \return Myself. + _ustring16_iterator_access& operator++() + { + _set(_get() + 1); + return *this; + } + + //! Increments the value by 1, returning the old value. + //! \return A unicode character. + uchar32_t operator++(int) + { + uchar32_t old = _get(); + _set(old + 1); + return old; + } + + //! Decrements the value by 1. + //! \return Myself. + _ustring16_iterator_access& operator--() + { + _set(_get() - 1); + return *this; + } + + //! Decrements the value by 1, returning the old value. + //! \return A unicode character. + uchar32_t operator--(int) + { + uchar32_t old = _get(); + _set(old - 1); + return old; + } + + //! Adds to the value by a specified amount. + //! \param val The amount to add to this character. + //! \return Myself. + _ustring16_iterator_access& operator+=(int val) + { + _set(_get() + val); + return *this; + } + + //! Subtracts from the value by a specified amount. + //! \param val The amount to subtract from this character. + //! \return Myself. + _ustring16_iterator_access& operator-=(int val) + { + _set(_get() - val); + return *this; + } + + //! Multiples the value by a specified amount. + //! \param val The amount to multiply this character by. + //! \return Myself. + _ustring16_iterator_access& operator*=(int val) + { + _set(_get() * val); + return *this; + } + + //! Divides the value by a specified amount. + //! \param val The amount to divide this character by. + //! \return Myself. + _ustring16_iterator_access& operator/=(int val) + { + _set(_get() / val); + return *this; + } + + //! Modulos the value by a specified amount. + //! \param val The amount to modulo this character by. + //! \return Myself. + _ustring16_iterator_access& operator%=(int val) + { + _set(_get() % val); + return *this; + } + + //! Adds to the value by a specified amount. + //! \param val The amount to add to this character. + //! \return A unicode character. + uchar32_t operator+(int val) const + { + return _get() + val; + } + + //! Subtracts from the value by a specified amount. + //! \param val The amount to subtract from this character. + //! \return A unicode character. + uchar32_t operator-(int val) const + { + return _get() - val; + } + + //! Multiplies the value by a specified amount. + //! \param val The amount to multiply this character by. + //! \return A unicode character. + uchar32_t operator*(int val) const + { + return _get() * val; + } + + //! Divides the value by a specified amount. + //! \param val The amount to divide this character by. + //! \return A unicode character. + uchar32_t operator/(int val) const + { + return _get() / val; + } + + //! Modulos the value by a specified amount. + //! \param val The amount to modulo this character by. + //! \return A unicode character. + uchar32_t operator%(int val) const + { + return _get() % val; + } + + private: + //! Gets a uchar32_t from our current position. + uchar32_t _get() const + { + const uchar16_t* a = ref->c_str(); + if (!UTF16_IS_SURROGATE(a[pos])) + return static_cast(a[pos]); + else + { + if (pos + 1 >= ref->size_raw()) + return 0; + + return unicode::toUTF32(a[pos], a[pos + 1]); + } + } + + //! Sets a uchar32_t at our current position. + void _set(uchar32_t c) + { + ustring16* ref2 = const_cast*>(ref); + const uchar16_t* a = ref2->c_str(); + if (c > 0xFFFF) + { + // c will be multibyte, so split it up into the high and low surrogate pairs. + uchar16_t x = static_cast(c); + uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10); + uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1)); + + // If the previous position was a surrogate pair, just replace them. Else, insert the low pair. + if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw()) + ref2->replace_raw(vl, static_cast(pos) + 1); + else ref2->insert_raw(vl, static_cast(pos) + 1); + + ref2->replace_raw(vh, static_cast(pos)); + } + else + { + // c will be a single byte. + uchar16_t vh = static_cast(c); + + // If the previous position was a surrogate pair, remove the extra byte. + if (UTF16_IS_SURROGATE_HI(a[pos])) + ref2->erase_raw(static_cast(pos) + 1); + + ref2->replace_raw(vh, static_cast(pos)); + } + } + + const ustring16* ref; + u32 pos; + }; + typedef typename ustring16::_ustring16_iterator_access access; + + + //! Iterator to iterate through a UTF-16 string. +#ifndef USTRING_NO_STL + class _ustring16_const_iterator : public std::iterator< + std::bidirectional_iterator_tag, // iterator_category + access, // value_type + ptrdiff_t, // difference_type + const access, // pointer + const access // reference + > +#else + class _ustring16_const_iterator +#endif + { + public: + typedef _ustring16_const_iterator _Iter; + typedef std::iterator _Base; + typedef const access const_pointer; + typedef const access const_reference; + +#ifndef USTRING_NO_STL + typedef typename _Base::value_type value_type; + typedef typename _Base::difference_type difference_type; + typedef typename _Base::difference_type distance_type; + typedef typename _Base::pointer pointer; + typedef const_reference reference; +#else + typedef access value_type; + typedef u32 difference_type; + typedef u32 distance_type; + typedef const_pointer pointer; + typedef const_reference reference; +#endif + + //! Constructors. + _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {} + _ustring16_const_iterator(const ustring16& s) : ref(&s), pos(0) {} + _ustring16_const_iterator(const ustring16& s, const u32 p) : ref(&s), pos(0) + { + if (ref->size_raw() == 0 || p == 0) + return; + + // Go to the appropriate position. + u32 i = p; + u32 sr = ref->size_raw(); + const uchar16_t* a = ref->c_str(); + while (i != 0 && pos < sr) + { + if (UTF16_IS_SURROGATE_HI(a[pos])) + pos += 2; + else ++pos; + --i; + } + } + + //! Test for equalness. + bool operator==(const _Iter& iter) const + { + if (ref == iter.ref && pos == iter.pos) + return true; + return false; + } + + //! Test for unequalness. + bool operator!=(const _Iter& iter) const + { + if (ref != iter.ref || pos != iter.pos) + return true; + return false; + } + + //! Switch to the next full character in the string. + _Iter& operator++() + { // ++iterator + if (pos == ref->size_raw()) return *this; + const uchar16_t* a = ref->c_str(); + if (UTF16_IS_SURROGATE_HI(a[pos])) + pos += 2; // TODO: check for valid low surrogate? + else ++pos; + if (pos > ref->size_raw()) pos = ref->size_raw(); + return *this; + } + + //! Switch to the next full character in the string, returning the previous position. + _Iter operator++(int) + { // iterator++ + _Iter _tmp(*this); + ++*this; + return _tmp; + } + + //! Switch to the previous full character in the string. + _Iter& operator--() + { // --iterator + if (pos == 0) return *this; + const uchar16_t* a = ref->c_str(); + --pos; + if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0) // low surrogate, go back one more. + --pos; + return *this; + } + + //! Switch to the previous full character in the string, returning the previous position. + _Iter operator--(int) + { // iterator-- + _Iter _tmp(*this); + --*this; + return _tmp; + } + + //! Advance a specified number of full characters in the string. + //! \return Myself. + _Iter& operator+=(const difference_type v) + { + if (v == 0) return *this; + if (v < 0) return operator-=(v * -1); + + if (pos >= ref->size_raw()) + return *this; + + // Go to the appropriate position. + // TODO: Don't force u32 on an x64 OS. Make it agnostic. + u32 i = (u32)v; + u32 sr = ref->size_raw(); + const uchar16_t* a = ref->c_str(); + while (i != 0 && pos < sr) + { + if (UTF16_IS_SURROGATE_HI(a[pos])) + pos += 2; + else ++pos; + --i; + } + if (pos > sr) + pos = sr; + + return *this; + } + + //! Go back a specified number of full characters in the string. + //! \return Myself. + _Iter& operator-=(const difference_type v) + { + if (v == 0) return *this; + if (v > 0) return operator+=(v * -1); + + if (pos == 0) + return *this; + + // Go to the appropriate position. + // TODO: Don't force u32 on an x64 OS. Make it agnostic. + u32 i = (u32)v; + const uchar16_t* a = ref->c_str(); + while (i != 0 && pos != 0) + { + --pos; + if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0) + --pos; + --i; + } + + return *this; + } + + //! Return a new iterator that is a variable number of full characters forward from the current position. + _Iter operator+(const difference_type v) const + { + _Iter ret(*this); + ret += v; + return ret; + } + + //! Return a new iterator that is a variable number of full characters backward from the current position. + _Iter operator-(const difference_type v) const + { + _Iter ret(*this); + ret -= v; + return ret; + } + + //! Returns the distance between two iterators. + difference_type operator-(const _Iter& iter) const + { + // Make sure we reference the same object! + if (ref != iter.ref) + return difference_type(); + + _Iter i = iter; + difference_type ret; + + // Walk up. + if (pos > i.pos) + { + while (pos > i.pos) + { + ++i; + ++ret; + } + return ret; + } + + // Walk down. + while (pos < i.pos) + { + --i; + --ret; + } + return ret; + } + + //! Accesses the full character at the iterator's position. + const_reference operator*() const + { + if (pos >= ref->size_raw()) + { + const uchar16_t* a = ref->c_str(); + u32 p = ref->size_raw(); + if (UTF16_IS_SURROGATE_LO(a[p])) + --p; + reference ret(ref, p); + return ret; + } + const_reference ret(ref, pos); + return ret; + } + + //! Accesses the full character at the iterator's position. + reference operator*() + { + if (pos >= ref->size_raw()) + { + const uchar16_t* a = ref->c_str(); + u32 p = ref->size_raw(); + if (UTF16_IS_SURROGATE_LO(a[p])) + --p; + reference ret(ref, p); + return ret; + } + reference ret(ref, pos); + return ret; + } + + //! Accesses the full character at the iterator's position. + const_pointer operator->() const + { + return operator*(); + } + + //! Accesses the full character at the iterator's position. + pointer operator->() + { + return operator*(); + } + + //! Is the iterator at the start of the string? + bool atStart() const + { + return pos == 0; + } + + //! Is the iterator at the end of the string? + bool atEnd() const + { + const uchar16_t* a = ref->c_str(); + if (UTF16_IS_SURROGATE(a[pos])) + return (pos + 1) >= ref->size_raw(); + else return pos >= ref->size_raw(); + } + + //! Moves the iterator to the start of the string. + void toStart() + { + pos = 0; + } + + //! Moves the iterator to the end of the string. + void toEnd() + { + pos = ref->size_raw(); + } + + //! Returns the iterator's position. + //! \return The iterator's position. + u32 getPos() const + { + return pos; + } + + protected: + const ustring16* ref; + u32 pos; + }; + + //! Iterator to iterate through a UTF-16 string. + class _ustring16_iterator : public _ustring16_const_iterator + { + public: + typedef _ustring16_iterator _Iter; + typedef _ustring16_const_iterator _Base; + typedef typename _Base::const_pointer const_pointer; + typedef typename _Base::const_reference const_reference; + + + typedef typename _Base::value_type value_type; + typedef typename _Base::difference_type difference_type; + typedef typename _Base::distance_type distance_type; + typedef access pointer; + typedef access reference; + + using _Base::pos; + using _Base::ref; + + //! Constructors. + _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {} + _ustring16_iterator(const ustring16& s) : _ustring16_const_iterator(s) {} + _ustring16_iterator(const ustring16& s, const u32 p) : _ustring16_const_iterator(s, p) {} + + //! Accesses the full character at the iterator's position. + reference operator*() const + { + if (pos >= ref->size_raw()) + { + const uchar16_t* a = ref->c_str(); + u32 p = ref->size_raw(); + if (UTF16_IS_SURROGATE_LO(a[p])) + --p; + reference ret(ref, p); + return ret; + } + reference ret(ref, pos); + return ret; + } + + //! Accesses the full character at the iterator's position. + reference operator*() + { + if (pos >= ref->size_raw()) + { + const uchar16_t* a = ref->c_str(); + u32 p = ref->size_raw(); + if (UTF16_IS_SURROGATE_LO(a[p])) + --p; + reference ret(ref, p); + return ret; + } + reference ret(ref, pos); + return ret; + } + + //! Accesses the full character at the iterator's position. + pointer operator->() const + { + return operator*(); + } + + //! Accesses the full character at the iterator's position. + pointer operator->() + { + return operator*(); + } + }; + + typedef typename ustring16::_ustring16_iterator iterator; + typedef typename ustring16::_ustring16_const_iterator const_iterator; + + ///----------------------/// + /// end iterator classes /// + ///----------------------/// + + //! Default constructor + ustring16() + : array(0), allocated(1), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + array = allocator.allocate(1); // new u16[1]; + array[0] = 0x0; + } + + + //! Constructor + ustring16(const ustring16& other) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + *this = other; + } + + + //! Constructor from other string types + template + ustring16(const string& other) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + *this = other; + } + + +#ifndef USTRING_NO_STL + //! Constructor from std::string + template + ustring16(const std::basic_string& other) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + *this = other.c_str(); + } + + + //! Constructor from iterator. + template + ustring16(Itr first, Itr last) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + reserve(std::distance(first, last)); + array[used] = 0; + + for (; first != last; ++first) + append((uchar32_t)*first); + } +#endif + + +#ifndef USTRING_CPP0X_NEWLITERALS + //! Constructor for copying a character string from a pointer. + ustring16(const char* const c) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + loadDataStream(c, strlen(c)); + //append((uchar8_t*)c); + } + + + //! Constructor for copying a character string from a pointer with a given length. + ustring16(const char* const c, u32 length) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + loadDataStream(c, length); + } +#endif + + + //! Constructor for copying a UTF-8 string from a pointer. + ustring16(const uchar8_t* const c) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + append(c); + } + + + //! Constructor for copying a UTF-8 string from a single char. + ustring16(const char c) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + append((uchar32_t)c); + } + + + //! Constructor for copying a UTF-8 string from a pointer with a given length. + ustring16(const uchar8_t* const c, u32 length) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + append(c, length); + } + + + //! Constructor for copying a UTF-16 string from a pointer. + ustring16(const uchar16_t* const c) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + append(c); + } + + + //! Constructor for copying a UTF-16 string from a pointer with a given length + ustring16(const uchar16_t* const c, u32 length) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + append(c, length); + } + + + //! Constructor for copying a UTF-32 string from a pointer. + ustring16(const uchar32_t* const c) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + append(c); + } + + + //! Constructor for copying a UTF-32 from a pointer with a given length. + ustring16(const uchar32_t* const c, u32 length) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + append(c, length); + } + + + //! Constructor for copying a wchar_t string from a pointer. + ustring16(const wchar_t* const c) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + if (sizeof(wchar_t) == 4) + append(reinterpret_cast(c)); + else if (sizeof(wchar_t) == 2) + append(reinterpret_cast(c)); + else if (sizeof(wchar_t) == 1) + append(reinterpret_cast(c)); + } + + + //! Constructor for copying a wchar_t string from a pointer with a given length. + ustring16(const wchar_t* const c, u32 length) + : array(0), allocated(0), used(0) + { +#if __BYTE_ORDER == __BIG_ENDIAN + encoding = unicode::EUTFE_UTF16_BE; +#else + encoding = unicode::EUTFE_UTF16_LE; +#endif + + if (sizeof(wchar_t) == 4) + append(reinterpret_cast(c), length); + else if (sizeof(wchar_t) == 2) + append(reinterpret_cast(c), length); + else if (sizeof(wchar_t) == 1) + append(reinterpret_cast(c), length); + } + + +#ifdef USTRING_CPP0X + //! Constructor for moving a ustring16 + ustring16(ustring16&& other) + : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used) + { + //std::cout << "MOVE constructor" << std::endl; + other.array = 0; + other.allocated = 0; + other.used = 0; + } +#endif + + + //! Destructor + ~ustring16() + { + allocator.deallocate(array); // delete [] array; + } + + + //! Assignment operator + ustring16& operator=(const ustring16& other) + { + if (this == &other) + return *this; + + used = other.size_raw(); + if (used >= allocated) + { + allocator.deallocate(array); // delete [] array; + allocated = used + 1; + array = allocator.allocate(used + 1); //new u16[used]; + } + + const uchar16_t* p = other.c_str(); + for (u32 i=0; i<=used; ++i, ++p) + array[i] = *p; + + array[used] = 0; + + // Validate our new UTF-16 string. + validate(); + + return *this; + } + + +#ifdef USTRING_CPP0X + //! Move assignment operator + ustring16& operator=(ustring16&& other) + { + if (this != &other) + { + //std::cout << "MOVE operator=" << std::endl; + allocator.deallocate(array); + + array = other.array; + allocated = other.allocated; + encoding = other.encoding; + used = other.used; + other.array = 0; + other.used = 0; + } + return *this; + } +#endif + + + //! Assignment operator for other string types + template + ustring16& operator=(const string& other) + { + *this = other.c_str(); + return *this; + } + + + //! Assignment operator for UTF-8 strings + ustring16& operator=(const uchar8_t* const c) + { + if (!array) + { + array = allocator.allocate(1); //new u16[1]; + allocated = 1; + } + used = 0; + array[used] = 0x0; + if (!c) return *this; + + //! Append our string now. + append(c); + return *this; + } + + + //! Assignment operator for UTF-16 strings + ustring16& operator=(const uchar16_t* const c) + { + if (!array) + { + array = allocator.allocate(1); //new u16[1]; + allocated = 1; + } + used = 0; + array[used] = 0x0; + if (!c) return *this; + + //! Append our string now. + append(c); + return *this; + } + + + //! Assignment operator for UTF-32 strings + ustring16& operator=(const uchar32_t* const c) + { + if (!array) + { + array = allocator.allocate(1); //new u16[1]; + allocated = 1; + } + used = 0; + array[used] = 0x0; + if (!c) return *this; + + //! Append our string now. + append(c); + return *this; + } + + + //! Assignment operator for wchar_t strings. + /** Note that this assumes that a correct unicode string is stored in the wchar_t string. + Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string. + This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/ + ustring16& operator=(const wchar_t* const c) + { + if (sizeof(wchar_t) == 4) + *this = reinterpret_cast(c); + else if (sizeof(wchar_t) == 2) + *this = reinterpret_cast(c); + else if (sizeof(wchar_t) == 1) + *this = reinterpret_cast(c); + + return *this; + } + + + //! Assignment operator for other strings. + /** Note that this assumes that a correct unicode string is stored in the string. **/ + template + ustring16& operator=(const B* const c) + { + if (sizeof(B) == 4) + *this = reinterpret_cast(c); + else if (sizeof(B) == 2) + *this = reinterpret_cast(c); + else if (sizeof(B) == 1) + *this = reinterpret_cast(c); + + return *this; + } + + + //! Direct access operator + access operator [](const u32 index) + { + _IRR_DEBUG_BREAK_IF(index>=size()) // bad index + iterator iter(*this, index); + return iter.operator*(); + } + + + //! Direct access operator + const access operator [](const u32 index) const + { + _IRR_DEBUG_BREAK_IF(index>=size()) // bad index + const_iterator iter(*this, index); + return iter.operator*(); + } + + + //! Equality operator + bool operator ==(const uchar16_t* const str) const + { + if (!str) + return false; + + u32 i; + for(i=0; array[i] && str[i]; ++i) + if (array[i] != str[i]) + return false; + + return !array[i] && !str[i]; + } + + + //! Equality operator + bool operator ==(const ustring16& other) const + { + for(u32 i=0; array[i] && other.array[i]; ++i) + if (array[i] != other.array[i]) + return false; + + return used == other.used; + } + + + //! Is smaller comparator + bool operator <(const ustring16& other) const + { + for(u32 i=0; array[i] && other.array[i]; ++i) + { + s32 diff = array[i] - other.array[i]; + if ( diff ) + return diff < 0; + } + + return used < other.used; + } + + + //! Inequality operator + bool operator !=(const uchar16_t* const str) const + { + return !(*this == str); + } + + + //! Inequality operator + bool operator !=(const ustring16& other) const + { + return !(*this == other); + } + + + //! Returns the length of a ustring16 in full characters. + //! \return Length of a ustring16 in full characters. + u32 size() const + { + const_iterator i(*this, 0); + u32 pos = 0; + while (!i.atEnd()) + { + ++i; + ++pos; + } + return pos; + } + + + //! Informs if the ustring is empty or not. + //! \return True if the ustring is empty, false if not. + bool empty() const + { + return (size_raw() == 0); + } + + + //! Returns a pointer to the raw UTF-16 string data. + //! \return pointer to C-style NUL terminated array of UTF-16 code points. + const uchar16_t* c_str() const + { + return array; + } + + + //! Compares the first n characters of this string with another. + //! \param other Other string to compare to. + //! \param n Number of characters to compare. + //! \return True if the n first characters of both strings are equal. + bool equalsn(const ustring16& other, u32 n) const + { + u32 i; + const uchar16_t* oa = other.c_str(); + for(i=0; i < n && array[i] && oa[i]; ++i) + if (array[i] != oa[i]) + return false; + + // if one (or both) of the strings was smaller then they + // are only equal if they have the same length + return (i == n) || (used == other.used); + } + + + //! Compares the first n characters of this string with another. + //! \param str Other string to compare to. + //! \param n Number of characters to compare. + //! \return True if the n first characters of both strings are equal. + bool equalsn(const uchar16_t* const str, u32 n) const + { + if (!str) + return false; + u32 i; + for(i=0; i < n && array[i] && str[i]; ++i) + if (array[i] != str[i]) + return false; + + // if one (or both) of the strings was smaller then they + // are only equal if they have the same length + return (i == n) || (array[i] == 0 && str[i] == 0); + } + + + //! Appends a character to this ustring16 + //! \param character The character to append. + //! \return A reference to our current string. + ustring16& append(uchar32_t character) + { + if (used + 2 >= allocated) + reallocate(used + 2); + + if (character > 0xFFFF) + { + used += 2; + + // character will be multibyte, so split it up into a surrogate pair. + uchar16_t x = static_cast(character); + uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10); + uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1)); + array[used-2] = vh; + array[used-1] = vl; + } + else + { + ++used; + array[used-1] = character; + } + array[used] = 0; + + return *this; + } + + + //! Appends a UTF-8 string to this ustring16 + //! \param other The UTF-8 string to append. + //! \param length The length of the string to append. + //! \return A reference to our current string. + ustring16& append(const uchar8_t* const other, u32 length=0xffffffff) + { + if (!other) + return *this; + + // Determine if the string is long enough for a BOM. + u32 len = 0; + const uchar8_t* p = other; + do + { + ++len; + } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN); + + // Check for BOM. + unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE; + if (len == unicode::BOM_ENCODE_UTF8_LEN) + { + if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0) + c_bom = unicode::EUTFE_UTF8; + } + + // If a BOM was found, don't include it in the string. + const uchar8_t* c2 = other; + if (c_bom != unicode::EUTFE_NONE) + { + c2 = other + unicode::BOM_UTF8_LEN; + length -= unicode::BOM_UTF8_LEN; + } + + // Calculate the size of the string to read in. + len = 0; + p = c2; + do + { + ++len; + } while(*p++ && len < length); + if (len > length) + len = length; + + // If we need to grow the array, do it now. + if (used + len >= allocated) + reallocate(used + (len * 2)); + u32 start = used; + + // Convert UTF-8 to UTF-16. + u32 pos = start; + for (u32 l = 0; l> 6) & 0x03) == 0x02) + { // Invalid continuation byte. + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + ++l; + } + else if (c2[l] == 0xC0 || c2[l] == 0xC1) + { // Invalid byte - overlong encoding. + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + ++l; + } + else if ((c2[l] & 0xF8) == 0xF0) + { // 4 bytes UTF-8, 2 bytes UTF-16. + // Check for a full string. + if ((l + 3) >= len) + { + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + l += 3; + break; + } + + // Validate. + bool valid = true; + u8 l2 = 0; + if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false; + if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false; + if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false; + if (!valid) + { + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + l += l2; + continue; + } + + // Decode. + uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3); + uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF); + uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F); + uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16); + + // Split v up into a surrogate pair. + uchar16_t x = static_cast(v); + uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10); + uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1)); + + array[pos++] = vh; + array[pos++] = vl; + l += 4; + ++used; // Using two shorts this time, so increase used by 1. + } + else if ((c2[l] & 0xF0) == 0xE0) + { // 3 bytes UTF-8, 1 byte UTF-16. + // Check for a full string. + if ((l + 2) >= len) + { + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + l += 2; + break; + } + + // Validate. + bool valid = true; + u8 l2 = 0; + if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false; + if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false; + if (!valid) + { + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + l += l2; + continue; + } + + // Decode. + uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF); + uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F); + uchar16_t ch = b2 | ((uchar16_t)b1 << 8); + array[pos++] = ch; + l += 3; + } + else if ((c2[l] & 0xE0) == 0xC0) + { // 2 bytes UTF-8, 1 byte UTF-16. + // Check for a full string. + if ((l + 1) >= len) + { + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + l += 1; + break; + } + + // Validate. + if (((c2[l+1] >> 6) & 0x03) != 0x02) + { + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + ++l; + continue; + } + + // Decode. + uchar8_t b1 = (c2[l] >> 2) & 0x7; + uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F); + uchar16_t ch = b2 | ((uchar16_t)b1 << 8); + array[pos++] = ch; + l += 2; + } + else + { // 1 byte UTF-8, 1 byte UTF-16. + // Validate. + if (c2[l] > 0x7F) + { // Values above 0xF4 are restricted and aren't used. By now, anything above 0x7F is invalid. + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + } + else array[pos++] = static_cast(c2[l]); + ++l; + } + } + array[used] = 0; + + // Validate our new UTF-16 string. + validate(); + + return *this; + } + + + //! Appends a UTF-16 string to this ustring16 + //! \param other The UTF-16 string to append. + //! \param length The length of the string to append. + //! \return A reference to our current string. + ustring16& append(const uchar16_t* const other, u32 length=0xffffffff) + { + if (!other) + return *this; + + // Determine if the string is long enough for a BOM. + u32 len = 0; + const uchar16_t* p = other; + do + { + ++len; + } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN); + + // Check for the BOM to determine the string's endianness. + unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE; + if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0) + c_end = unicode::EUTFEE_LITTLE; + else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0) + c_end = unicode::EUTFEE_BIG; + + // If a BOM was found, don't include it in the string. + const uchar16_t* c2 = other; + if (c_end != unicode::EUTFEE_NATIVE) + { + c2 = other + unicode::BOM_UTF16_LEN; + length -= unicode::BOM_UTF16_LEN; + } + + // Calculate the size of the string to read in. + len = 0; + p = c2; + do + { + ++len; + } while(*p++ && len < length); + if (len > length) + len = length; + + // If we need to grow the size of the array, do it now. + if (used + len >= allocated) + reallocate(used + (len * 2)); + u32 start = used; + used += len; + + // Copy the string now. + unicode::EUTF_ENDIAN m_end = getEndianness(); + for (u32 l = start; l < start + len; ++l) + { + array[l] = (uchar16_t)c2[l]; + if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end) + array[l] = unicode::swapEndian16(array[l]); + } + + array[used] = 0; + + // Validate our new UTF-16 string. + validate(); + return *this; + } + + + //! Appends a UTF-32 string to this ustring16 + //! \param other The UTF-32 string to append. + //! \param length The length of the string to append. + //! \return A reference to our current string. + ustring16& append(const uchar32_t* const other, u32 length=0xffffffff) + { + if (!other) + return *this; + + // Check for the BOM to determine the string's endianness. + unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE; + if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0) + c_end = unicode::EUTFEE_LITTLE; + else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0) + c_end = unicode::EUTFEE_BIG; + + // If a BOM was found, don't include it in the string. + const uchar32_t* c2 = other; + if (c_end != unicode::EUTFEE_NATIVE) + { + c2 = other + unicode::BOM_UTF32_LEN; + length -= unicode::BOM_UTF32_LEN; + } + + // Calculate the size of the string to read in. + u32 len = 0; + const uchar32_t* p = c2; + do + { + ++len; + } while(*p++ && len < length); + if (len > length) + len = length; + + // If we need to grow the size of the array, do it now. + // In case all of the UTF-32 string is split into surrogate pairs, do len * 2. + if (used + (len * 2) >= allocated) + reallocate(used + ((len * 2) * 2)); + u32 start = used; + + // Convert UTF-32 to UTF-16. + unicode::EUTF_ENDIAN m_end = getEndianness(); + u32 pos = start; + for (u32 l = 0; l 0xFFFF) + { + // Split ch up into a surrogate pair as it is over 16 bits long. + uchar16_t x = static_cast(ch); + uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10); + uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1)); + array[pos++] = vh; + array[pos++] = vl; + ++used; // Using two shorts, so increased used again. + } + else if (ch >= 0xD800 && ch <= 0xDFFF) + { + // Between possible UTF-16 surrogates (invalid!) + array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER; + } + else array[pos++] = static_cast(ch); + } + array[used] = 0; + + // Validate our new UTF-16 string. + validate(); + + return *this; + } + + + //! Appends a ustring16 to this ustring16 + //! \param other The string to append to this one. + //! \return A reference to our current string. + ustring16& append(const ustring16& other) + { + const uchar16_t* oa = other.c_str(); + + u32 len = other.size_raw(); + + if (used + len >= allocated) + reallocate(used + len); + + for (u32 l=0; l& append(const ustring16& other, u32 length) + { + if (other.size() == 0) + return *this; + + if (other.size() < length) + { + append(other); + return *this; + } + + if (used + length * 2 >= allocated) + reallocate(used + length * 2); + + const_iterator iter(other, 0); + u32 l = length; + while (!iter.atEnd() && l) + { + uchar32_t c = *iter; + append(c); + ++iter; + --l; + } + + return *this; + } + + + //! Reserves some memory. + //! \param count The amount of characters to reserve. + void reserve(u32 count) + { + if (count < allocated) + return; + + reallocate(count); + } + + + //! Finds first occurrence of character. + //! \param c The character to search for. + //! \return Position where the character has been found, or -1 if not found. + s32 findFirst(uchar32_t c) const + { + const_iterator i(*this, 0); + + s32 pos = 0; + while (!i.atEnd()) + { + uchar32_t t = *i; + if (c == t) + return pos; + ++pos; + ++i; + } + + return -1; + } + + //! Finds first occurrence of a character of a list. + //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab". + //! \param count The amount of characters in the list. Usually, this should be strlen(c). + //! \return Position where one of the characters has been found, or -1 if not found. + s32 findFirstChar(const uchar32_t* const c, u32 count=1) const + { + if (!c || !count) + return -1; + + const_iterator i(*this, 0); + + s32 pos = 0; + while (!i.atEnd()) + { + uchar32_t t = *i; + for (u32 j=0; j& str, const u32 start = 0) const + { + u32 my_size = size(); + u32 their_size = str.size(); + + if (their_size == 0 || my_size - start < their_size) + return -1; + + const_iterator i(*this, start); + + s32 pos = start; + while (!i.atEnd()) + { + const_iterator i2(i); + const_iterator j(str, 0); + uchar32_t t1 = (uchar32_t)*i2; + uchar32_t t2 = (uchar32_t)*j; + while (t1 == t2) + { + ++i2; + ++j; + if (j.atEnd()) + return pos; + t1 = (uchar32_t)*i2; + t2 = (uchar32_t)*j; + } + ++i; + ++pos; + } + + return -1; + } + + + //! Finds another ustring16 in this ustring16. + //! \param str The string to find. + //! \param start The start position of the search. + //! \return Positions where the string has been found, or -1 if not found. + s32 find_raw(const ustring16& str, const u32 start = 0) const + { + const uchar16_t* data = str.c_str(); + if (data && *data) + { + u32 len = 0; + + while (data[len]) + ++len; + + if (len > used) + return -1; + + for (u32 i=start; i<=used-len; ++i) + { + u32 j=0; + + while(data[j] && array[i+j] == data[j]) + ++j; + + if (!data[j]) + return i; + } + } + + return -1; + } + + + //! Returns a substring. + //! \param begin: Start of substring. + //! \param length: Length of substring. + //! \return A reference to our current string. + ustring16 subString(u32 begin, s32 length) const + { + u32 len = size(); + // if start after ustring16 + // or no proper substring length + if ((length <= 0) || (begin>=len)) + return ustring16(""); + // clamp length to maximal value + if ((length+begin) > len) + length = len-begin; + + ustring16 o; + o.reserve((length+1) * 2); + + const_iterator i(*this, begin); + while (!i.atEnd() && length) + { + o.append(*i); + ++i; + --length; + } + + return o; + } + + + //! Appends a character to this ustring16. + //! \param c Character to append. + //! \return A reference to our current string. + ustring16& operator += (char c) + { + append((uchar32_t)c); + return *this; + } + + + //! Appends a character to this ustring16. + //! \param c Character to append. + //! \return A reference to our current string. + ustring16& operator += (uchar32_t c) + { + append(c); + return *this; + } + + + //! Appends a number to this ustring16. + //! \param c Number to append. + //! \return A reference to our current string. + ustring16& operator += (short c) + { + append(core::stringc(c)); + return *this; + } + + + //! Appends a number to this ustring16. + //! \param c Number to append. + //! \return A reference to our current string. + ustring16& operator += (unsigned short c) + { + append(core::stringc(c)); + return *this; + } + + +#ifdef USTRING_CPP0X_NEWLITERALS + //! Appends a number to this ustring16. + //! \param c Number to append. + //! \return A reference to our current string. + ustring16& operator += (int c) + { + append(core::stringc(c)); + return *this; + } + + + //! Appends a number to this ustring16. + //! \param c Number to append. + //! \return A reference to our current string. + ustring16& operator += (unsigned int c) + { + append(core::stringc(c)); + return *this; + } +#endif + + + //! Appends a number to this ustring16. + //! \param c Number to append. + //! \return A reference to our current string. + ustring16& operator += (long c) + { + append(core::stringc(c)); + return *this; + } + + + //! Appends a number to this ustring16. + //! \param c Number to append. + //! \return A reference to our current string. + ustring16& operator += (unsigned long c) + { + append(core::stringc(c)); + return *this; + } + + + //! Appends a number to this ustring16. + //! \param c Number to append. + //! \return A reference to our current string. + ustring16& operator += (double c) + { + append(core::stringc(c)); + return *this; + } + + + //! Appends a char ustring16 to this ustring16. + //! \param c Char ustring16 to append. + //! \return A reference to our current string. + ustring16& operator += (const uchar16_t* const c) + { + append(c); + return *this; + } + + + //! Appends a ustring16 to this ustring16. + //! \param other ustring16 to append. + //! \return A reference to our current string. + ustring16& operator += (const ustring16& other) + { + append(other); + return *this; + } + + + //! Replaces all characters of a given type with another one. + //! \param toReplace Character to replace. + //! \param replaceWith Character replacing the old one. + //! \return A reference to our current string. + ustring16& replace(uchar32_t toReplace, uchar32_t replaceWith) + { + iterator i(*this, 0); + while (!i.atEnd()) + { + typename ustring16::access a = *i; + if ((uchar32_t)a == toReplace) + a = replaceWith; + ++i; + } + return *this; + } + + + //! Replaces all instances of a string with another one. + //! \param toReplace The string to replace. + //! \param replaceWith The string replacing the old one. + //! \return A reference to our current string. + ustring16& replace(const ustring16& toReplace, const ustring16& replaceWith) + { + if (toReplace.size() == 0) + return *this; + + const uchar16_t* other = toReplace.c_str(); + const uchar16_t* replace = replaceWith.c_str(); + const u32 other_size = toReplace.size_raw(); + const u32 replace_size = replaceWith.size_raw(); + + // Determine the delta. The algorithm will change depending on the delta. + s32 delta = replace_size - other_size; + + // A character for character replace. The string will not shrink or grow. + if (delta == 0) + { + s32 pos = 0; + while ((pos = find_raw(other, pos)) != -1) + { + for (u32 i = 0; i < replace_size; ++i) + array[pos + i] = replace[i]; + ++pos; + } + return *this; + } + + // We are going to be removing some characters. The string will shrink. + if (delta < 0) + { + u32 i = 0; + for (u32 pos = 0; pos <= used; ++i, ++pos) + { + // Is this potentially a match? + if (array[pos] == *other) + { + // Check to see if we have a match. + u32 j; + for (j = 0; j < other_size; ++j) + { + if (array[pos + j] != other[j]) + break; + } + + // If we have a match, replace characters. + if (j == other_size) + { + for (j = 0; j < replace_size; ++j) + array[i + j] = replace[j]; + i += replace_size - 1; + pos += other_size - 1; + continue; + } + } + + // No match found, just copy characters. + array[i - 1] = array[pos]; + } + array[i] = 0; + used = i; + + return *this; + } + + // We are going to be adding characters, so the string size will increase. + // Count the number of times toReplace exists in the string so we can allocate the new size. + u32 find_count = 0; + s32 pos = 0; + while ((pos = find_raw(other, pos)) != -1) + { + ++find_count; + ++pos; + } + + // Re-allocate the string now, if needed. + u32 len = delta * find_count; + if (used + len >= allocated) + reallocate(used + len); + + // Start replacing. + pos = 0; + while ((pos = find_raw(other, pos)) != -1) + { + uchar16_t* start = array + pos + other_size - 1; + uchar16_t* ptr = array + used; + uchar16_t* end = array + used + delta; + + // Shift characters to make room for the string. + while (ptr != start) + { + *end = *ptr; + --ptr; + --end; + } + + // Add the new string now. + for (u32 i = 0; i < replace_size; ++i) + array[pos + i] = replace[i]; + + pos += replace_size; + used += delta; + } + + // Terminate the string and return ourself. + array[used] = 0; + return *this; + } + + + //! Removes characters from a ustring16.. + //! \param c The character to remove. + //! \return A reference to our current string. + ustring16& remove(uchar32_t c) + { + u32 pos = 0; + u32 found = 0; + u32 len = (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character. + for (u32 i=0; i<=used; ++i) + { + uchar32_t uc32 = 0; + if (!UTF16_IS_SURROGATE_HI(array[i])) + uc32 |= array[i]; + else if (i + 1 <= used) + { + // Convert the surrogate pair into a single UTF-32 character. + uc32 = unicode::toUTF32(array[i], array[i + 1]); + } + u32 len2 = (uc32 > 0xFFFF ? 2 : 1); + + if (uc32 == c) + { + found += len; + continue; + } + + array[pos++] = array[i]; + if (len2 == 2) + array[pos++] = array[++i]; + } + used -= found; + array[used] = 0; + return *this; + } + + + //! Removes a ustring16 from the ustring16. + //! \param toRemove The string to remove. + //! \return A reference to our current string. + ustring16& remove(const ustring16& toRemove) + { + u32 size = toRemove.size_raw(); + if (size == 0) return *this; + + const uchar16_t* tra = toRemove.c_str(); + u32 pos = 0; + u32 found = 0; + for (u32 i=0; i<=used; ++i) + { + u32 j = 0; + while (j < size) + { + if (array[i + j] != tra[j]) + break; + ++j; + } + if (j == size) + { + found += size; + i += size - 1; + continue; + } + + array[pos++] = array[i]; + } + used -= found; + array[used] = 0; + return *this; + } + + + //! Removes characters from the ustring16. + //! \param characters The characters to remove. + //! \return A reference to our current string. + ustring16& removeChars(const ustring16& characters) + { + if (characters.size_raw() == 0) + return *this; + + u32 pos = 0; + u32 found = 0; + const_iterator iter(characters); + for (u32 i=0; i<=used; ++i) + { + uchar32_t uc32 = 0; + if (!UTF16_IS_SURROGATE_HI(array[i])) + uc32 |= array[i]; + else if (i + 1 <= used) + { + // Convert the surrogate pair into a single UTF-32 character. + uc32 = unicode::toUTF32(array[i], array[i+1]); + } + u32 len2 = (uc32 > 0xFFFF ? 2 : 1); + + bool cont = false; + iter.toStart(); + while (!iter.atEnd()) + { + uchar32_t c = *iter; + if (uc32 == c) + { + found += (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character. + ++i; + cont = true; + break; + } + ++iter; + } + if (cont) continue; + + array[pos++] = array[i]; + if (len2 == 2) + array[pos++] = array[++i]; + } + used -= found; + array[used] = 0; + return *this; + } + + + //! Trims the ustring16. + //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16. + //! \param whitespace The characters that are to be considered as whitespace. + //! \return A reference to our current string. + ustring16& trim(const ustring16& whitespace = " \t\n\r") + { + core::array utf32white = whitespace.toUTF32(); + + // find start and end of the substring without the specified characters + const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1); + if (begin == -1) + return (*this=""); + + const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1); + + return (*this = subString(begin, (end +1) - begin)); + } + + + //! Erases a character from the ustring16. + //! May be slow, because all elements following after the erased element have to be copied. + //! \param index Index of element to be erased. + //! \return A reference to our current string. + ustring16& erase(u32 index) + { + _IRR_DEBUG_BREAK_IF(index>used) // access violation + + iterator i(*this, index); + + uchar32_t t = *i; + u32 len = (t > 0xFFFF ? 2 : 1); + + for (u32 j = static_cast(i.getPos()) + len; j <= used; ++j) + array[j - len] = array[j]; + + used -= len; + array[used] = 0; + + return *this; + } + + + //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination. + //! \return A reference to our current string. + ustring16& validate() + { + // Validate all unicode characters. + for (u32 i=0; i= allocated) || UTF16_IS_SURROGATE_LO(array[i])) + array[i] = unicode::UTF_REPLACEMENT_CHARACTER; + else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1])) + array[i] = unicode::UTF_REPLACEMENT_CHARACTER; + ++i; + } + if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF) + array[i] = unicode::UTF_REPLACEMENT_CHARACTER; + } + + // terminate + used = 0; + if (allocated > 0) + { + used = allocated - 1; + array[used] = 0; + } + return *this; + } + + + //! Gets the last char of the ustring16, or 0. + //! \return The last char of the ustring16, or 0. + uchar32_t lastChar() const + { + if (used < 1) + return 0; + + if (UTF16_IS_SURROGATE_LO(array[used-1])) + { + // Make sure we have a paired surrogate. + if (used < 2) + return 0; + + // Check for an invalid surrogate. + if (!UTF16_IS_SURROGATE_HI(array[used-2])) + return 0; + + // Convert the surrogate pair into a single UTF-32 character. + return unicode::toUTF32(array[used-2], array[used-1]); + } + else + { + return array[used-1]; + } + } + + + //! Split the ustring16 into parts. + /** This method will split a ustring16 at certain delimiter characters + into the container passed in as reference. The type of the container + has to be given as template parameter. It must provide a push_back and + a size method. + \param ret The result container + \param c C-style ustring16 of delimiter characters + \param count Number of delimiter characters + \param ignoreEmptyTokens Flag to avoid empty substrings in the result + container. If two delimiters occur without a character in between, an + empty substring would be placed in the result. If this flag is set, + only non-empty strings are stored. + \param keepSeparators Flag which allows to add the separator to the + result ustring16. If this flag is true, the concatenation of the + substrings results in the original ustring16. Otherwise, only the + characters between the delimiters are returned. + \return The number of resulting substrings + */ + template + u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const + { + if (!c) + return 0; + + const_iterator i(*this); + const u32 oldSize=ret.size(); + u32 pos = 0; + u32 lastpos = 0; + u32 lastpospos = 0; + bool lastWasSeparator = false; + while (!i.atEnd()) + { + uchar32_t ch = *i; + bool foundSeparator = false; + for (u32 j=0; j(&array[lastpospos], pos - lastpos)); + foundSeparator = true; + lastpos = (keepSeparators ? pos : pos + 1); + lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1); + break; + } + } + lastWasSeparator = foundSeparator; + ++pos; + ++i; + } + u32 s = size() + 1; + if (s > lastpos) + ret.push_back(ustring16(&array[lastpospos], s - lastpos)); + return ret.size()-oldSize; + } + + + //! Split the ustring16 into parts. + /** This method will split a ustring16 at certain delimiter characters + into the container passed in as reference. The type of the container + has to be given as template parameter. It must provide a push_back and + a size method. + \param ret The result container + \param c A unicode string of delimiter characters + \param ignoreEmptyTokens Flag to avoid empty substrings in the result + container. If two delimiters occur without a character in between, an + empty substring would be placed in the result. If this flag is set, + only non-empty strings are stored. + \param keepSeparators Flag which allows to add the separator to the + result ustring16. If this flag is true, the concatenation of the + substrings results in the original ustring16. Otherwise, only the + characters between the delimiters are returned. + \return The number of resulting substrings + */ + template + u32 split(container& ret, const ustring16& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const + { + core::array v = c.toUTF32(); + return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators); + } + + + //! Gets the size of the allocated memory buffer for the string. + //! \return The size of the allocated memory buffer. + u32 capacity() const + { + return allocated; + } + + + //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates. + //! \return The raw number of UTF-16 code points, excluding the trialing NUL. + u32 size_raw() const + { + return used; + } + + + //! Inserts a character into the string. + //! \param c The character to insert. + //! \param pos The position to insert the character. + //! \return A reference to our current string. + ustring16& insert(uchar32_t c, u32 pos) + { + u8 len = (c > 0xFFFF ? 2 : 1); + + if (used + len >= allocated) + reallocate(used + len); + + used += len; + + iterator iter(*this, pos); + for (u32 i = used - 2; i > iter.getPos(); --i) + array[i] = array[i - len]; + + if (c > 0xFFFF) + { + // c will be multibyte, so split it up into a surrogate pair. + uchar16_t x = static_cast(c); + uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10); + uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1)); + array[iter.getPos()] = vh; + array[iter.getPos()+1] = vl; + } + else + { + array[iter.getPos()] = static_cast(c); + } + array[used] = 0; + return *this; + } + + + //! Inserts a string into the string. + //! \param c The string to insert. + //! \param pos The position to insert the string. + //! \return A reference to our current string. + ustring16& insert(const ustring16& c, u32 pos) + { + u32 len = c.size_raw(); + if (len == 0) return *this; + + if (used + len >= allocated) + reallocate(used + len); + + used += len; + + iterator iter(*this, pos); + for (u32 i = used - 2; i > iter.getPos() + len; --i) + array[i] = array[i - len]; + + const uchar16_t* s = c.c_str(); + for (u32 i = 0; i < len; ++i) + { + array[pos++] = *s; + ++s; + } + + array[used] = 0; + return *this; + } + + + //! Inserts a character into the string. + //! \param c The character to insert. + //! \param pos The position to insert the character. + //! \return A reference to our current string. + ustring16& insert_raw(uchar16_t c, u32 pos) + { + if (used + 1 >= allocated) + reallocate(used + 1); + + ++used; + + for (u32 i = used - 1; i > pos; --i) + array[i] = array[i - 1]; + + array[pos] = c; + array[used] = 0; + return *this; + } + + + //! Removes a character from string. + //! \param pos Position of the character to remove. + //! \return A reference to our current string. + ustring16& erase_raw(u32 pos) + { + for (u32 i=pos; i<=used; ++i) + { + array[i] = array[i + 1]; + } + --used; + array[used] = 0; + return *this; + } + + + //! Replaces a character in the string. + //! \param c The new character. + //! \param pos The position of the character to replace. + //! \return A reference to our current string. + ustring16& replace_raw(uchar16_t c, u32 pos) + { + array[pos] = c; + return *this; + } + + + //! Returns an iterator to the beginning of the string. + //! \return An iterator to the beginning of the string. + iterator begin() + { + iterator i(*this, 0); + return i; + } + + + //! Returns an iterator to the beginning of the string. + //! \return An iterator to the beginning of the string. + const_iterator begin() const + { + const_iterator i(*this, 0); + return i; + } + + + //! Returns an iterator to the beginning of the string. + //! \return An iterator to the beginning of the string. + const_iterator cbegin() const + { + const_iterator i(*this, 0); + return i; + } + + + //! Returns an iterator to the end of the string. + //! \return An iterator to the end of the string. + iterator end() + { + iterator i(*this, 0); + i.toEnd(); + return i; + } + + + //! Returns an iterator to the end of the string. + //! \return An iterator to the end of the string. + const_iterator end() const + { + const_iterator i(*this, 0); + i.toEnd(); + return i; + } + + + //! Returns an iterator to the end of the string. + //! \return An iterator to the end of the string. + const_iterator cend() const + { + const_iterator i(*this, 0); + i.toEnd(); + return i; + } + + + //! Converts the string to a UTF-8 encoded string. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return A string containing the UTF-8 encoded string. + core::string toUTF8_s(const bool addBOM = false) const + { + core::string ret; + ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1); + const_iterator iter(*this, 0); + + // Add the byte order mark if the user wants it. + if (addBOM) + { + ret.append(unicode::BOM_ENCODE_UTF8[0]); + ret.append(unicode::BOM_ENCODE_UTF8[1]); + ret.append(unicode::BOM_ENCODE_UTF8[2]); + } + + while (!iter.atEnd()) + { + uchar32_t c = *iter; + if (c > 0xFFFF) + { // 4 bytes + uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7); + uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F); + uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F); + uchar8_t b4 = (0x2 << 6) | (c & 0x3F); + ret.append(b1); + ret.append(b2); + ret.append(b3); + ret.append(b4); + } + else if (c > 0x7FF) + { // 3 bytes + uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF); + uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F); + uchar8_t b3 = (0x2 << 6) | (c & 0x3F); + ret.append(b1); + ret.append(b2); + ret.append(b3); + } + else if (c > 0x7F) + { // 2 bytes + uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F); + uchar8_t b2 = (0x2 << 6) | (c & 0x3F); + ret.append(b1); + ret.append(b2); + } + else + { // 1 byte + ret.append(static_cast(c)); + } + ++iter; + } + return ret; + } + + + //! Converts the string to a UTF-8 encoded string array. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return An array containing the UTF-8 encoded string. + core::array toUTF8(const bool addBOM = false) const + { + core::array ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1); + const_iterator iter(*this, 0); + + // Add the byte order mark if the user wants it. + if (addBOM) + { + ret.push_back(unicode::BOM_ENCODE_UTF8[0]); + ret.push_back(unicode::BOM_ENCODE_UTF8[1]); + ret.push_back(unicode::BOM_ENCODE_UTF8[2]); + } + + while (!iter.atEnd()) + { + uchar32_t c = *iter; + if (c > 0xFFFF) + { // 4 bytes + uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7); + uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F); + uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F); + uchar8_t b4 = (0x2 << 6) | (c & 0x3F); + ret.push_back(b1); + ret.push_back(b2); + ret.push_back(b3); + ret.push_back(b4); + } + else if (c > 0x7FF) + { // 3 bytes + uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF); + uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F); + uchar8_t b3 = (0x2 << 6) | (c & 0x3F); + ret.push_back(b1); + ret.push_back(b2); + ret.push_back(b3); + } + else if (c > 0x7F) + { // 2 bytes + uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F); + uchar8_t b2 = (0x2 << 6) | (c & 0x3F); + ret.push_back(b1); + ret.push_back(b2); + } + else + { // 1 byte + ret.push_back(static_cast(c)); + } + ++iter; + } + ret.push_back(0); + return ret; + } + + +#ifdef USTRING_CPP0X_NEWLITERALS // C++0x + //! Converts the string to a UTF-16 encoded string. + //! \param endian The desired endianness of the string. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return A string containing the UTF-16 encoded string. + core::string toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const + { + core::string ret; + ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1); + + // Add the BOM if specified. + if (addBOM) + { + if (endian == unicode::EUTFEE_NATIVE) + ret[0] = unicode::BOM; + else if (endian == unicode::EUTFEE_LITTLE) + { + uchar8_t* ptr8 = reinterpret_cast(&ret[0]); + *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0]; + *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1]; + } + else + { + uchar8_t* ptr8 = reinterpret_cast(&ret[0]); + *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0]; + *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1]; + } + } + + ret.append(array); + if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian) + { + char16_t* ptr = ret.c_str(); + for (u32 i = 0; i < ret.size(); ++i) + *ptr++ = unicode::swapEndian16(*ptr); + } + return ret; + } +#endif + + + //! Converts the string to a UTF-16 encoded string array. + //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class. + //! \param endian The desired endianness of the string. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return An array containing the UTF-16 encoded string. + core::array toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const + { + core::array ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1); + uchar16_t* ptr = ret.pointer(); + + // Add the BOM if specified. + if (addBOM) + { + if (endian == unicode::EUTFEE_NATIVE) + *ptr = unicode::BOM; + else if (endian == unicode::EUTFEE_LITTLE) + { + uchar8_t* ptr8 = reinterpret_cast(ptr); + *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0]; + *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1]; + } + else + { + uchar8_t* ptr8 = reinterpret_cast(ptr); + *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0]; + *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1]; + } + ++ptr; + } + + memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t)); + if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian) + { + for (u32 i = 0; i <= used; ++i) + ptr[i] = unicode::swapEndian16(ptr[i]); + } + ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0)); + ret.push_back(0); + return ret; + } + + +#ifdef USTRING_CPP0X_NEWLITERALS // C++0x + //! Converts the string to a UTF-32 encoded string. + //! \param endian The desired endianness of the string. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return A string containing the UTF-32 encoded string. + core::string toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const + { + core::string ret; + ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0)); + const_iterator iter(*this, 0); + + // Add the BOM if specified. + if (addBOM) + { + if (endian == unicode::EUTFEE_NATIVE) + ret.append(unicode::BOM); + else + { + union + { + uchar32_t full; + u8 chunk[4]; + } t; + + if (endian == unicode::EUTFEE_LITTLE) + { + t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0]; + t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1]; + t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2]; + t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3]; + } + else + { + t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0]; + t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1]; + t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2]; + t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3]; + } + ret.append(t.full); + } + } + + while (!iter.atEnd()) + { + uchar32_t c = *iter; + if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian) + c = unicode::swapEndian32(c); + ret.append(c); + ++iter; + } + return ret; + } +#endif + + + //! Converts the string to a UTF-32 encoded string array. + //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class. + //! \param endian The desired endianness of the string. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return An array containing the UTF-32 encoded string. + core::array toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const + { + core::array ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1); + const_iterator iter(*this, 0); + + // Add the BOM if specified. + if (addBOM) + { + if (endian == unicode::EUTFEE_NATIVE) + ret.push_back(unicode::BOM); + else + { + union + { + uchar32_t full; + u8 chunk[4]; + } t; + + if (endian == unicode::EUTFEE_LITTLE) + { + t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0]; + t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1]; + t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2]; + t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3]; + } + else + { + t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0]; + t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1]; + t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2]; + t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3]; + } + ret.push_back(t.full); + } + } + ret.push_back(0); + + while (!iter.atEnd()) + { + uchar32_t c = *iter; + if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian) + c = unicode::swapEndian32(c); + ret.push_back(c); + ++iter; + } + return ret; + } + + + //! Converts the string to a wchar_t encoded string. + /** The size of a wchar_t changes depending on the platform. This function will store a + correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/ + //! \param endian The desired endianness of the string. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return A string containing the wchar_t encoded string. + core::string toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const + { + if (sizeof(wchar_t) == 4) + { + core::array a(toUTF32(endian, addBOM)); + core::stringw ret(a.pointer()); + return ret; + } + else if (sizeof(wchar_t) == 2) + { + if (endian == unicode::EUTFEE_NATIVE && addBOM == false) + { + core::stringw ret(array); + return ret; + } + else + { + core::array a(toUTF16(endian, addBOM)); + core::stringw ret(a.pointer()); + return ret; + } + } + else if (sizeof(wchar_t) == 1) + { + core::array a(toUTF8(addBOM)); + core::stringw ret(a.pointer()); + return ret; + } + + // Shouldn't happen. + return core::stringw(); + } + + + //! Converts the string to a wchar_t encoded string array. + /** The size of a wchar_t changes depending on the platform. This function will store a + correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/ + //! \param endian The desired endianness of the string. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return An array containing the wchar_t encoded string. + core::array toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const + { + if (sizeof(wchar_t) == 4) + { + core::array a(toUTF32(endian, addBOM)); + core::array ret(a.size()); + ret.set_used(a.size()); + memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t)); + return ret; + } + if (sizeof(wchar_t) == 2) + { + if (endian == unicode::EUTFEE_NATIVE && addBOM == false) + { + core::array ret(used); + ret.set_used(used); + memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t)); + return ret; + } + else + { + core::array a(toUTF16(endian, addBOM)); + core::array ret(a.size()); + ret.set_used(a.size()); + memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t)); + return ret; + } + } + if (sizeof(wchar_t) == 1) + { + core::array a(toUTF8(addBOM)); + core::array ret(a.size()); + ret.set_used(a.size()); + memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t)); + return ret; + } + + // Shouldn't happen. + return core::array(); + } + + //! Converts the string to a properly encoded io::path string. + //! \param endian The desired endianness of the string. + //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string. + //! \return An io::path string containing the properly encoded string. + io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const + { +#if defined(_IRR_WCHAR_FILESYSTEM) + return toWCHAR_s(endian, addBOM); +#else + return toUTF8_s(addBOM); +#endif + } + + //! Loads an unknown stream of data. + //! Will attempt to determine if the stream is unicode data. Useful for loading from files. + //! \param data The data stream to load from. + //! \param data_size The length of the data string. + //! \return A reference to our current string. + ustring16& loadDataStream(const char* data, size_t data_size) + { + // Clear our string. + *this = ""; + if (!data) + return *this; + + unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data); + switch (e) + { + default: + case unicode::EUTFE_UTF8: + append((uchar8_t*)data, data_size); + break; + + case unicode::EUTFE_UTF16: + case unicode::EUTFE_UTF16_BE: + case unicode::EUTFE_UTF16_LE: + append((uchar16_t*)data, data_size / 2); + break; + + case unicode::EUTFE_UTF32: + case unicode::EUTFE_UTF32_BE: + case unicode::EUTFE_UTF32_LE: + append((uchar32_t*)data, data_size / 4); + break; + } + + return *this; + } + + //! Gets the encoding of the Unicode string this class contains. + //! \return An enum describing the current encoding of this string. + const unicode::EUTF_ENCODE getEncoding() const + { + return encoding; + } + + //! Gets the endianness of the Unicode string this class contains. + //! \return An enum describing the endianness of this string. + const unicode::EUTF_ENDIAN getEndianness() const + { + if (encoding == unicode::EUTFE_UTF16_LE || + encoding == unicode::EUTFE_UTF32_LE) + return unicode::EUTFEE_LITTLE; + else return unicode::EUTFEE_BIG; + } + +private: + + //! Reallocate the string, making it bigger or smaller. + //! \param new_size The new size of the string. + void reallocate(u32 new_size) + { + uchar16_t* old_array = array; + + array = allocator.allocate(new_size + 1); //new u16[new_size]; + allocated = new_size + 1; + if (old_array == 0) return; + + u32 amount = used < new_size ? used : new_size; + for (u32 i=0; i<=amount; ++i) + array[i] = old_array[i]; + + if (allocated <= used) + used = allocated - 1; + + array[used] = 0; + + allocator.deallocate(old_array); // delete [] old_array; + } + + //--- member variables + + uchar16_t* array; + unicode::EUTF_ENCODE encoding; + u32 allocated; + u32 used; + TAlloc allocator; + //irrAllocator allocator; +}; + +typedef ustring16 > ustring; + + +//! Appends two ustring16s. +template +inline ustring16 operator+(const ustring16& left, const ustring16& right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a null-terminated unicode string. +template +inline ustring16 operator+(const ustring16& left, const B* const right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a null-terminated unicode string. +template +inline ustring16 operator+(const B* const left, const ustring16& right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and an Irrlicht string. +template +inline ustring16 operator+(const ustring16& left, const string& right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and an Irrlicht string. +template +inline ustring16 operator+(const string& left, const ustring16& right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a std::basic_string. +template +inline ustring16 operator+(const ustring16& left, const std::basic_string& right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a std::basic_string. +template +inline ustring16 operator+(const std::basic_string& left, const ustring16& right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a char. +template +inline ustring16 operator+(const ustring16& left, const char right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a char. +template +inline ustring16 operator+(const char left, const ustring16& right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +#ifdef USTRING_CPP0X_NEWLITERALS +//! Appends a ustring16 and a uchar32_t. +template +inline ustring16 operator+(const ustring16& left, const uchar32_t right) +{ + ustring16 ret(left); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a uchar32_t. +template +inline ustring16 operator+(const uchar32_t left, const ustring16& right) +{ + ustring16 ret(left); + ret += right; + return ret; +} +#endif + + +//! Appends a ustring16 and a short. +template +inline ustring16 operator+(const ustring16& left, const short right) +{ + ustring16 ret(left); + ret += core::stringc(right); + return ret; +} + + +//! Appends a ustring16 and a short. +template +inline ustring16 operator+(const short left, const ustring16& right) +{ + ustring16 ret((core::stringc(left))); + ret += right; + return ret; +} + + +//! Appends a ustring16 and an unsigned short. +template +inline ustring16 operator+(const ustring16& left, const unsigned short right) +{ + ustring16 ret(left); + ret += core::stringc(right); + return ret; +} + + +//! Appends a ustring16 and an unsigned short. +template +inline ustring16 operator+(const unsigned short left, const ustring16& right) +{ + ustring16 ret((core::stringc(left))); + ret += right; + return ret; +} + + +//! Appends a ustring16 and an int. +template +inline ustring16 operator+(const ustring16& left, const int right) +{ + ustring16 ret(left); + ret += core::stringc(right); + return ret; +} + + +//! Appends a ustring16 and an int. +template +inline ustring16 operator+(const int left, const ustring16& right) +{ + ustring16 ret((core::stringc(left))); + ret += right; + return ret; +} + + +//! Appends a ustring16 and an unsigned int. +template +inline ustring16 operator+(const ustring16& left, const unsigned int right) +{ + ustring16 ret(left); + ret += core::stringc(right); + return ret; +} + + +//! Appends a ustring16 and an unsigned int. +template +inline ustring16 operator+(const unsigned int left, const ustring16& right) +{ + ustring16 ret((core::stringc(left))); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a long. +template +inline ustring16 operator+(const ustring16& left, const long right) +{ + ustring16 ret(left); + ret += core::stringc(right); + return ret; +} + + +//! Appends a ustring16 and a long. +template +inline ustring16 operator+(const long left, const ustring16& right) +{ + ustring16 ret((core::stringc(left))); + ret += right; + return ret; +} + + +//! Appends a ustring16 and an unsigned long. +template +inline ustring16 operator+(const ustring16& left, const unsigned long right) +{ + ustring16 ret(left); + ret += core::stringc(right); + return ret; +} + + +//! Appends a ustring16 and an unsigned long. +template +inline ustring16 operator+(const unsigned long left, const ustring16& right) +{ + ustring16 ret((core::stringc(left))); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a float. +template +inline ustring16 operator+(const ustring16& left, const float right) +{ + ustring16 ret(left); + ret += core::stringc(right); + return ret; +} + + +//! Appends a ustring16 and a float. +template +inline ustring16 operator+(const float left, const ustring16& right) +{ + ustring16 ret((core::stringc(left))); + ret += right; + return ret; +} + + +//! Appends a ustring16 and a double. +template +inline ustring16 operator+(const ustring16& left, const double right) +{ + ustring16 ret(left); + ret += core::stringc(right); + return ret; +} + + +//! Appends a ustring16 and a double. +template +inline ustring16 operator+(const double left, const ustring16& right) +{ + ustring16 ret((core::stringc(left))); + ret += right; + return ret; +} + + +#ifdef USTRING_CPP0X +//! Appends two ustring16s. +template +inline ustring16&& operator+(const ustring16& left, ustring16&& right) +{ + //std::cout << "MOVE operator+(&, &&)" << std::endl; + right.insert(left, 0); + return std::move(right); +} + + +//! Appends two ustring16s. +template +inline ustring16&& operator+(ustring16&& left, const ustring16& right) +{ + //std::cout << "MOVE operator+(&&, &)" << std::endl; + left.append(right); + return std::move(left); +} + + +//! Appends two ustring16s. +template +inline ustring16&& operator+(ustring16&& left, ustring16&& right) +{ + //std::cout << "MOVE operator+(&&, &&)" << std::endl; + if ((right.size_raw() <= left.capacity() - left.size_raw()) || + (right.capacity() - right.size_raw() < left.size_raw())) + { + left.append(right); + return std::move(left); + } + else + { + right.insert(left, 0); + return std::move(right); + } +} + + +//! Appends a ustring16 and a null-terminated unicode string. +template +inline ustring16&& operator+(ustring16&& left, const B* const right) +{ + //std::cout << "MOVE operator+(&&, B*)" << std::endl; + left.append(right); + return std::move(left); +} + + +//! Appends a ustring16 and a null-terminated unicode string. +template +inline ustring16&& operator+(const B* const left, ustring16&& right) +{ + //std::cout << "MOVE operator+(B*, &&)" << std::endl; + right.insert(left, 0); + return std::move(right); +} + + +//! Appends a ustring16 and an Irrlicht string. +template +inline ustring16&& operator+(const string& left, ustring16&& right) +{ + //std::cout << "MOVE operator+(&, &&)" << std::endl; + right.insert(left, 0); + return std::move(right); +} + + +//! Appends a ustring16 and an Irrlicht string. +template +inline ustring16&& operator+(ustring16&& left, const string& right) +{ + //std::cout << "MOVE operator+(&&, &)" << std::endl; + left.append(right); + return std::move(left); +} + + +//! Appends a ustring16 and a std::basic_string. +template +inline ustring16&& operator+(const std::basic_string& left, ustring16&& right) +{ + //std::cout << "MOVE operator+(&, &&)" << std::endl; + right.insert(core::ustring16(left), 0); + return std::move(right); +} + + +//! Appends a ustring16 and a std::basic_string. +template +inline ustring16&& operator+(ustring16&& left, const std::basic_string& right) +{ + //std::cout << "MOVE operator+(&&, &)" << std::endl; + left.append(right); + return std::move(left); +} + + +//! Appends a ustring16 and a char. +template +inline ustring16 operator+(ustring16&& left, const char right) +{ + left.append((uchar32_t)right); + return std::move(left); +} + + +//! Appends a ustring16 and a char. +template +inline ustring16 operator+(const char left, ustring16&& right) +{ + right.insert((uchar32_t)left, 0); + return std::move(right); +} + + +#ifdef USTRING_CPP0X_NEWLITERALS +//! Appends a ustring16 and a uchar32_t. +template +inline ustring16 operator+(ustring16&& left, const uchar32_t right) +{ + left.append(right); + return std::move(left); +} + + +//! Appends a ustring16 and a uchar32_t. +template +inline ustring16 operator+(const uchar32_t left, ustring16&& right) +{ + right.insert(left, 0); + return std::move(right); +} +#endif + + +//! Appends a ustring16 and a short. +template +inline ustring16 operator+(ustring16&& left, const short right) +{ + left.append(core::stringc(right)); + return std::move(left); +} + + +//! Appends a ustring16 and a short. +template +inline ustring16 operator+(const short left, ustring16&& right) +{ + right.insert(core::stringc(left), 0); + return std::move(right); +} + + +//! Appends a ustring16 and an unsigned short. +template +inline ustring16 operator+(ustring16&& left, const unsigned short right) +{ + left.append(core::stringc(right)); + return std::move(left); +} + + +//! Appends a ustring16 and an unsigned short. +template +inline ustring16 operator+(const unsigned short left, ustring16&& right) +{ + right.insert(core::stringc(left), 0); + return std::move(right); +} + + +//! Appends a ustring16 and an int. +template +inline ustring16 operator+(ustring16&& left, const int right) +{ + left.append(core::stringc(right)); + return std::move(left); +} + + +//! Appends a ustring16 and an int. +template +inline ustring16 operator+(const int left, ustring16&& right) +{ + right.insert(core::stringc(left), 0); + return std::move(right); +} + + +//! Appends a ustring16 and an unsigned int. +template +inline ustring16 operator+(ustring16&& left, const unsigned int right) +{ + left.append(core::stringc(right)); + return std::move(left); +} + + +//! Appends a ustring16 and an unsigned int. +template +inline ustring16 operator+(const unsigned int left, ustring16&& right) +{ + right.insert(core::stringc(left), 0); + return std::move(right); +} + + +//! Appends a ustring16 and a long. +template +inline ustring16 operator+(ustring16&& left, const long right) +{ + left.append(core::stringc(right)); + return std::move(left); +} + + +//! Appends a ustring16 and a long. +template +inline ustring16 operator+(const long left, ustring16&& right) +{ + right.insert(core::stringc(left), 0); + return std::move(right); +} + + +//! Appends a ustring16 and an unsigned long. +template +inline ustring16 operator+(ustring16&& left, const unsigned long right) +{ + left.append(core::stringc(right)); + return std::move(left); +} + + +//! Appends a ustring16 and an unsigned long. +template +inline ustring16 operator+(const unsigned long left, ustring16&& right) +{ + right.insert(core::stringc(left), 0); + return std::move(right); +} + + +//! Appends a ustring16 and a float. +template +inline ustring16 operator+(ustring16&& left, const float right) +{ + left.append(core::stringc(right)); + return std::move(left); +} + + +//! Appends a ustring16 and a float. +template +inline ustring16 operator+(const float left, ustring16&& right) +{ + right.insert(core::stringc(left), 0); + return std::move(right); +} + + +//! Appends a ustring16 and a double. +template +inline ustring16 operator+(ustring16&& left, const double right) +{ + left.append(core::stringc(right)); + return std::move(left); +} + + +//! Appends a ustring16 and a double. +template +inline ustring16 operator+(const double left, ustring16&& right) +{ + right.insert(core::stringc(left), 0); + return std::move(right); +} +#endif + + +#ifndef USTRING_NO_STL +//! Writes a ustring16 to an ostream. +template +inline std::ostream& operator<<(std::ostream& out, const ustring16& in) +{ + out << in.toUTF8_s().c_str(); + return out; +} + +//! Writes a ustring16 to a wostream. +template +inline std::wostream& operator<<(std::wostream& out, const ustring16& in) +{ + out << in.toWCHAR_s().c_str(); + return out; +} +#endif + + +#ifndef USTRING_NO_STL + +namespace unicode +{ + +//! Hashing algorithm for hashing a ustring. Used for things like unordered_maps. +//! Algorithm taken from std::hash. +class hash : public std::unary_function +{ + public: + size_t operator()(const core::ustring& s) const + { + size_t ret = 2166136261U; + size_t index = 0; + size_t stride = 1 + s.size_raw() / 10; + + core::ustring::const_iterator i = s.begin(); + while (i != s.end()) + { + // TODO: Don't force u32 on an x64 OS. Make it agnostic. + ret = 16777619U * ret ^ (size_t)s[(u32)index]; + index += stride; + i += stride; + } + return (ret); + } +}; + +} // end namespace unicode + +#endif + +} // end namespace core +} // end namespace irr