diff --git a/dCommon/GeneralUtils.cpp b/dCommon/GeneralUtils.cpp
index 01306226..eafa6a3b 100644
--- a/dCommon/GeneralUtils.cpp
+++ b/dCommon/GeneralUtils.cpp
@@ -6,7 +6,7 @@
 #include <algorithm>
 
 template <typename T>
-inline size_t MinSize(size_t size, const std::basic_string<T>& string) {
+inline size_t MinSize(size_t size, const std::basic_string_view<T>& string) {
 	if (size == size_t(-1) || size > string.size()) {
 		return string.size();
 	} else {
@@ -24,7 +24,7 @@ inline bool IsTrailSurrogate(char16_t c) {
 
 inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
 	if (cp <= 0x007F) {
-		ret.push_back(cp);
+		ret.push_back(static_cast<char>(cp));
 	} else if (cp <= 0x07FF) {
 		ret.push_back(0xC0 | (cp >> 6));
 		ret.push_back(0x80 | (cp & 0x3F));
@@ -42,16 +42,131 @@ inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
 	}
 }
 
+constexpr char16_t REPLACEMENT_CHARACTER = 0xFFFD;
+
+//! True when c is a UTF-8 continuation byte (bit pattern 10xxxxxx)
+inline bool IsSuffixChar(uint8_t c) {
+	return (c & 0xC0) == 0x80;
+}
+
+//! Decodes one code point from the front of slice and consumes it
+//! A malformed or truncated sequence yields REPLACEMENT_CHARACTER and consumes one byte
+//! Returns false, leaving out untouched, only when slice is empty
+bool GeneralUtils::_NextUTF8Char(std::string_view& slice, uint32_t& out) {
+	size_t rem = slice.length();
+	// data() is valid on an empty view, whereas front() is undefined there
+	const uint8_t* bytes = reinterpret_cast<const uint8_t*>(slice.data());
+	if (rem > 0) {
+		uint8_t first = bytes[0];
+		if (first < 0x80) { // 1 byte character
+			out = static_cast<uint32_t>(first & 0x7F);
+			slice.remove_prefix(1);
+			return true;
+		} else if (first < 0xC0) {
+			// middle byte, not valid at start, fall through
+		} else if (first < 0xE0) { // two byte character
+			if (rem > 1) {
+				uint8_t second = bytes[1];
+				if (IsSuffixChar(second)) {
+					out = (static_cast<uint32_t>(first & 0x1F) << 6)
+						+ static_cast<uint32_t>(second & 0x3F);
+					slice.remove_prefix(2);
+					return true;
+				}
+			}
+		} else if (first < 0xF0) { // three byte character
+			if (rem > 2) {
+				uint8_t second = bytes[1];
+				uint8_t third = bytes[2];
+				if (IsSuffixChar(second) && IsSuffixChar(third)) {
+					out = (static_cast<uint32_t>(first & 0x0F) << 12)
+						+ (static_cast<uint32_t>(second & 0x3F) << 6)
+						+ static_cast<uint32_t>(third & 0x3F);
+					slice.remove_prefix(3);
+					return true;
+				}
+			}
+		} else if (first < 0xF8) { // four byte character
+			if (rem > 3) {
+				uint8_t second = bytes[1];
+				uint8_t third = bytes[2];
+				uint8_t fourth = bytes[3];
+				if (IsSuffixChar(second) && IsSuffixChar(third) && IsSuffixChar(fourth)) {
+					out = (static_cast<uint32_t>(first & 0x07) << 18)
+						+ (static_cast<uint32_t>(second & 0x3F) << 12)
+						+ (static_cast<uint32_t>(third & 0x3F) << 6)
+						+ static_cast<uint32_t>(fourth & 0x3F);
+					slice.remove_prefix(4);
+					return true;
+				}
+			}
+		}
+		out = static_cast<uint32_t>(REPLACEMENT_CHARACTER);
+		slice.remove_prefix(1);
+		return true;
+	}
+	return false;
+}
+
+//! Appends U to output as UTF-16, never growing output beyond size code units
+//! Returns false when there is no room left for the encoded code point
+/// See <https://www.ietf.org/rfc/rfc2781.html#section-2.1>
+bool PushUTF16CodePoint(std::u16string& output, uint32_t U, size_t size) {
+	if (output.length() >= size) return false;
+	if (U < 0x10000) {
+		// If U < 0x10000, encode U as a 16-bit unsigned integer and terminate.
+		output.push_back(static_cast<char16_t>(U));
+		return true;
+	} else if (U > 0x10FFFF) {
+		output.push_back(REPLACEMENT_CHARACTER);
+		return true;
+	} else if (output.length() + 1 < size) {
+		// Let U' = U - 0x10000. Because U is less than or equal to 0x10FFFF,
+		// U' must be less than or equal to 0xFFFFF. That is, U' can be
+		// represented in 20 bits.
+		uint32_t Ut = U - 0x10000;
+
+		// Initialize two 16-bit unsigned integers, W1 and W2, to 0xD800 and
+		// 0xDC00, respectively. These integers each have 10 bits free to
+		// encode the character value, for a total of 20 bits.
+		uint16_t W1 = 0xD800;
+		uint16_t W2 = 0xDC00;
+
+		// Assign the 10 high-order bits of the 20-bit U' to the 10 low-order
+		// bits of W1 and the 10 low-order bits of U' to the 10 low-order
+		// bits of W2.
+		// 0xFFC00 selects all 10 high-order bits (bits 10-19) of U'
+		W1 += static_cast<uint16_t>((Ut & 0xFFC00) >> 10);
+		W2 += static_cast<uint16_t>((Ut & 0x3FF) >> 0);
+
+		// Terminate.
+		output.push_back(W1); // high surrogate
+		output.push_back(W2); // low surrogate
+		return true;
+	} else return false;
+}
+
+std::u16string GeneralUtils::UTF8ToUTF16(const std::string_view& string, size_t size) {
+	size_t newSize = MinSize(size, string);
+	std::u16string output;
+	output.reserve(newSize);
+	std::string_view iterator = string;
+
+	uint32_t c;
+	while (_NextUTF8Char(iterator, c) && PushUTF16CodePoint(output, c, size)) {}
+	return output;
+}
+
 //! Converts an std::string (ASCII) to UCS-2 / UTF-16
-std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size) {
+std::u16string GeneralUtils::ASCIIToUTF16(const std::string_view& string, size_t size) {
 	size_t newSize = MinSize(size, string);
 	std::u16string ret;
 	ret.reserve(newSize);
 
 	for (size_t i = 0; i < newSize; i++) {
 		char c = string[i];
-		assert(c > 0 && c <= 127);
-		ret.push_back(static_cast<char16_t>(c));
+		// Note: both 7-bit ascii characters and REPLACEMENT_CHARACTER fit in one char16_t
+		ret.push_back((c > 0 && c <= 127) ? static_cast<char16_t>(c) : REPLACEMENT_CHARACTER);
 	}
 
 	return ret;
@@ -59,7 +174,7 @@ std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size
 
 //! Converts a (potentially-ill-formed) UTF-16 string to UTF-8
 //! See: <https://simonsapin.github.io/wtf-8/#decoding-ill-formed-utf-16>
-std::string GeneralUtils::UTF16ToWTF8(const std::u16string& string, size_t size) {
+std::string GeneralUtils::UTF16ToWTF8(const std::u16string_view& string, size_t size) {
 	size_t newSize = MinSize(size, string);
 	std::string ret;
 	ret.reserve(newSize);
diff --git a/dCommon/GeneralUtils.h b/dCommon/GeneralUtils.h
index 4973201e..f796839c 100644
--- a/dCommon/GeneralUtils.h
+++ b/dCommon/GeneralUtils.h
@@ -26,7 +26,18 @@ namespace GeneralUtils {
 	  \param size A size to trim the string to. Default is -1 (No trimming)
 	  \return An UTF-16 representation of the string
 	*/
-	std::u16string ASCIIToUTF16(const std::string& string, size_t size = -1);
+	std::u16string ASCIIToUTF16(const std::string_view& string, size_t size = -1);
+
+	//! Converts a UTF-8 String to a UTF-16 string
+	/*!
+	  \param string The string to convert
+	  \param size A size to trim the string to. Default is -1 (No trimming)
+	  \return An UTF-16 representation of the string
+	*/
+	std::u16string UTF8ToUTF16(const std::string_view& string, size_t size = -1);
+
+	//! Internal, do not use
+	bool _NextUTF8Char(std::string_view& slice, uint32_t& out);
 
 	//! Converts a UTF-16 string to a UTF-8 string
 	/*!
@@ -34,7 +45,7 @@ namespace GeneralUtils {
 	  \param size A size to trim the string to. Default is -1 (No trimming)
 	  \return An UTF-8 representation of the string
 	*/
-	std::string UTF16ToWTF8(const std::u16string& string, size_t size = -1);
+	std::string UTF16ToWTF8(const std::u16string_view& string, size_t size = -1);
 
 	/**
 	 * Compares two basic strings but does so ignoring case sensitivity
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9c6e57d3..fb1ed5ac 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ create_test_sourcelist (Tests
 	AMFDeserializeTests.cpp
 	TestNiPoint3.cpp
 	TestLDFFormat.cpp
+	TestEncoding.cpp
 )
 
 # add the executable
diff --git a/tests/TestEncoding.cpp b/tests/TestEncoding.cpp
new file mode 100644
index 00000000..1e676ec3
--- /dev/null
+++ b/tests/TestEncoding.cpp
@@ -0,0 +1,59 @@
+#include <string>
+#include <string_view>
+
+#include "GeneralUtils.h"
+#include "CommonCxxTests.h"
+
+int TestEncoding(int argc, char* *const argv) {
+	std::string x = "Hello World!";
+	std::string_view v(x);
+
+	uint32_t out;
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'H');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'e');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'l');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'l');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 'o');
+	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), true);
+
+	x = u8"Frühling";
+	v = x;
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'F');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'r');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'ü');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'h');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'l');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'i');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'n');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'g');
+	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
+
+	x = "中文字";
+	v = x;
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'中');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'文');
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, U'字');
+	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
+
+	x = "👨‍⚖️";
+	v = x;
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x1F468);
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x200D);
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0x2696);
+	GeneralUtils::_NextUTF8Char(v, out); ASSERT_EQ(out, 0xFE0F);
+	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
+
+	ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Hello World!"), u"Hello World!");
+	ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Frühling"), u"Frühling");
+	ASSERT_EQ(GeneralUtils::UTF8ToUTF16("中文字"), u"中文字");
+	ASSERT_EQ(GeneralUtils::UTF8ToUTF16("👨‍⚖️"), u"👨‍⚖️");
+
+	// U+10FFFF uses all 20 payload bits of a surrogate pair
+	ASSERT_EQ(GeneralUtils::UTF8ToUTF16("\xF4\x8F\xBF\xBF"), u"\U0010FFFF");
+
+	// An empty (default-constructed) view has nothing to decode
+	v = std::string_view();
+	ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);
+
+	return 0;
+}