mirror of
https://github.com/DarkflameUniverse/DarkflameServer
synced 2024-08-30 18:43:58 +00:00
Better Unicode support in GeneralUtils (#658)
* ASCIIToUTF16: output replacement character instead of failing assert * Add GeneralUtils::_NextUTF8Char * Implement GeneralUtils::UTF8ToUTF16 * use string_view everywhere * use string_view::front instead of begin * Add PushUTF16CodePoint
This commit is contained in:
parent
e97ae92624
commit
9813c3ed2c
@ -6,7 +6,7 @@
|
||||
#include <algorithm>
|
||||
|
||||
template <typename T>
|
||||
inline size_t MinSize(size_t size, const std::basic_string<T>& string) {
|
||||
inline size_t MinSize(size_t size, const std::basic_string_view<T>& string) {
|
||||
if (size == size_t(-1) || size > string.size()) {
|
||||
return string.size();
|
||||
} else {
|
||||
@ -24,7 +24,7 @@ inline bool IsTrailSurrogate(char16_t c) {
|
||||
|
||||
inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
|
||||
if (cp <= 0x007F) {
|
||||
ret.push_back(cp);
|
||||
ret.push_back(static_cast<uint8_t>(cp));
|
||||
} else if (cp <= 0x07FF) {
|
||||
ret.push_back(0xC0 | (cp >> 6));
|
||||
ret.push_back(0x80 | (cp & 0x3F));
|
||||
@ -42,16 +42,123 @@ inline void PushUTF8CodePoint(std::string& ret, char32_t cp) {
|
||||
}
|
||||
}
|
||||
|
||||
constexpr const char16_t REPLACEMENT_CHARACTER = 0xFFFD;
|
||||
|
||||
//! Tells whether a byte is a UTF-8 continuation ("suffix") byte
/*!
  \param c The byte to inspect
  \return true if the two most significant bits of c are 1 and 0 (pattern 10xxxxxx)
 */
bool _IsSuffixChar(uint8_t c) {
    // Shifting out the six payload bits leaves only the two marker bits.
    const unsigned markerBits = static_cast<unsigned>(c) >> 6;
    return markerBits == 0x2;
}
|
||||
|
||||
bool GeneralUtils::_NextUTF8Char(std::string_view& slice, uint32_t& out) {
|
||||
size_t rem = slice.length();
|
||||
const uint8_t* bytes = (const uint8_t*) &slice.front();
|
||||
if (rem > 0) {
|
||||
uint8_t first = bytes[0];
|
||||
if (first < 0x80) { // 1 byte character
|
||||
out = static_cast<uint32_t>(first & 0x7F);
|
||||
slice.remove_prefix(1);
|
||||
return true;
|
||||
} else if (first < 0xC0) {
|
||||
// middle byte, not valid at start, fall through
|
||||
} else if (first < 0xE0) { // two byte character
|
||||
if (rem > 1) {
|
||||
uint8_t second = bytes[1];
|
||||
if (_IsSuffixChar(second)) {
|
||||
out = (static_cast<uint32_t>(first & 0x1F) << 6)
|
||||
+ static_cast<uint32_t>(second & 0x3F);
|
||||
slice.remove_prefix(2);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} else if (first < 0xF0) { // three byte character
|
||||
if (rem > 2) {
|
||||
uint8_t second = bytes[1];
|
||||
uint8_t third = bytes[2];
|
||||
if (_IsSuffixChar(second) && _IsSuffixChar(third)) {
|
||||
out = (static_cast<uint32_t>(first & 0x0F) << 12)
|
||||
+ (static_cast<uint32_t>(second & 0x3F) << 6)
|
||||
+ static_cast<uint32_t>(third & 0x3F);
|
||||
slice.remove_prefix(3);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} else if (first < 0xF8) { // four byte character
|
||||
if (rem > 3) {
|
||||
uint8_t second = bytes[1];
|
||||
uint8_t third = bytes[2];
|
||||
uint8_t fourth = bytes[3];
|
||||
if (_IsSuffixChar(second) && _IsSuffixChar(third) && _IsSuffixChar(fourth)) {
|
||||
out = (static_cast<uint32_t>(first & 0x07) << 18)
|
||||
+ (static_cast<uint32_t>(second & 0x3F) << 12)
|
||||
+ (static_cast<uint32_t>(third & 0x3F) << 6)
|
||||
+ static_cast<uint32_t>(fourth & 0x3F);
|
||||
slice.remove_prefix(4);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
out = static_cast<uint32_t>(REPLACEMENT_CHARACTER);
|
||||
slice.remove_prefix(1);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// See <https://www.ietf.org/rfc/rfc2781.html#section-2.1>
|
||||
bool PushUTF16CodePoint(std::u16string& output, uint32_t U, size_t size) {
|
||||
if (output.length() >= size) return false;
|
||||
if (U < 0x10000) {
|
||||
// If U < 0x10000, encode U as a 16-bit unsigned integer and terminate.
|
||||
output.push_back(static_cast<uint16_t>(U));
|
||||
return true;
|
||||
} else if (U > 0x10FFFF) {
|
||||
output.push_back(REPLACEMENT_CHARACTER);
|
||||
return true;
|
||||
} else if (output.length() + 1 < size) {
|
||||
// Let U' = U - 0x10000. Because U is less than or equal to 0x10FFFF,
|
||||
// U' must be less than or equal to 0xFFFFF. That is, U' can be
|
||||
// represented in 20 bits.
|
||||
uint32_t Ut = U - 0x10000;
|
||||
|
||||
// Initialize two 16-bit unsigned integers, W1 and W2, to 0xD800 and
|
||||
// 0xDC00, respectively. These integers each have 10 bits free to
|
||||
// encode the character value, for a total of 20 bits.
|
||||
uint16_t W1 = 0xD800;
|
||||
uint16_t W2 = 0xDC00;
|
||||
|
||||
// Assign the 10 high-order bits of the 20-bit U' to the 10 low-order
|
||||
// bits of W1 and the 10 low-order bits of U' to the 10 low-order
|
||||
// bits of W2.
|
||||
W1 += static_cast<uint16_t>((Ut & 0x3FC00) >> 10);
|
||||
W2 += static_cast<uint16_t>((Ut & 0x3FF) >> 0);
|
||||
|
||||
// Terminate.
|
||||
output.push_back(W1); // high surrogate
|
||||
output.push_back(W2); // low surrogate
|
||||
return true;
|
||||
} else return false;
|
||||
}
|
||||
|
||||
std::u16string GeneralUtils::UTF8ToUTF16(const std::string_view& string, size_t size) {
|
||||
size_t newSize = MinSize(size, string);
|
||||
std::u16string output;
|
||||
output.reserve(newSize);
|
||||
std::string_view iterator = string;
|
||||
|
||||
uint32_t c;
|
||||
while (_NextUTF8Char(iterator, c) && PushUTF16CodePoint(output, c, size)) {}
|
||||
return output;
|
||||
}
|
||||
|
||||
//! Converts an std::string (ASCII) to UCS-2 / UTF-16
|
||||
std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size) {
|
||||
std::u16string GeneralUtils::ASCIIToUTF16(const std::string_view& string, size_t size) {
|
||||
size_t newSize = MinSize(size, string);
|
||||
std::u16string ret;
|
||||
ret.reserve(newSize);
|
||||
|
||||
for (size_t i = 0; i < newSize; i++) {
|
||||
char c = string[i];
|
||||
assert(c > 0 && c <= 127);
|
||||
ret.push_back(static_cast<char16_t>(c));
|
||||
// Note: both 7-bit ascii characters and REPLACEMENT_CHARACTER fit in one char16_t
|
||||
ret.push_back((c > 0 && c <= 127) ? static_cast<char16_t>(c) : REPLACEMENT_CHARACTER);
|
||||
}
|
||||
|
||||
return ret;
|
||||
@ -59,7 +166,7 @@ std::u16string GeneralUtils::ASCIIToUTF16(const std::string& string, size_t size
|
||||
|
||||
//! Converts a (potentially-ill-formed) UTF-16 string to UTF-8
|
||||
//! See: <http://simonsapin.github.io/wtf-8/#decoding-ill-formed-utf-16>
|
||||
std::string GeneralUtils::UTF16ToWTF8(const std::u16string& string, size_t size) {
|
||||
std::string GeneralUtils::UTF16ToWTF8(const std::u16string_view& string, size_t size) {
|
||||
size_t newSize = MinSize(size, string);
|
||||
std::string ret;
|
||||
ret.reserve(newSize);
|
||||
|
@ -26,7 +26,18 @@ namespace GeneralUtils {
|
||||
\param size A size to trim the string to. Default is -1 (No trimming)
|
||||
\return An UTF-16 representation of the string
|
||||
*/
|
||||
std::u16string ASCIIToUTF16(const std::string& string, size_t size = -1);
|
||||
std::u16string ASCIIToUTF16(const std::string_view& string, size_t size = -1);
|
||||
|
||||
//! Converts a UTF-8 String to a UTF-16 string
|
||||
/*!
|
||||
\param string The string to convert
|
||||
\param size A size to trim the string to. Default is -1 (No trimming)
|
||||
\return An UTF-16 representation of the string
|
||||
*/
|
||||
std::u16string UTF8ToUTF16(const std::string_view& string, size_t size = -1);
|
||||
|
||||
//! Internal, do not use
|
||||
bool _NextUTF8Char(std::string_view& slice, uint32_t& out);
|
||||
|
||||
//! Converts a UTF-16 string to a UTF-8 string
|
||||
/*!
|
||||
@ -34,7 +45,7 @@ namespace GeneralUtils {
|
||||
\param size A size to trim the string to. Default is -1 (No trimming)
|
||||
\return An UTF-8 representation of the string
|
||||
*/
|
||||
std::string UTF16ToWTF8(const std::u16string& string, size_t size = -1);
|
||||
std::string UTF16ToWTF8(const std::u16string_view& string, size_t size = -1);
|
||||
|
||||
/**
|
||||
* Compares two basic strings but does so ignoring case sensitivity
|
||||
|
@ -4,6 +4,7 @@ create_test_sourcelist (Tests
|
||||
AMFDeserializeTests.cpp
|
||||
TestNiPoint3.cpp
|
||||
TestLDFFormat.cpp
|
||||
TestEncoding.cpp
|
||||
)
|
||||
|
||||
# add the executable
|
||||
|
52
tests/TestEncoding.cpp
Normal file
52
tests/TestEncoding.cpp
Normal file
@ -0,0 +1,52 @@
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include "GeneralUtils.h"
|
||||
#include "CommonCxxTests.h"
|
||||
|
||||
//! Exercises GeneralUtils::_NextUTF8Char and GeneralUtils::UTF8ToUTF16
/*!
  All non-ASCII test data is written with escape sequences so that the bytes under test
  do not depend on the compiler's source/execution character set and so that invisible
  characters (such as ZERO WIDTH JOINER) cannot get lost in an editor.
 */
int TestEncoding(int argc, char* *const argv) {
    std::string x = "Hello World!";
    std::string_view v(x);

    uint32_t out;

    // Plain ASCII: one byte per code point; input remains after the first five characters.
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 'H');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 'e');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 'l');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 'l');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 'o');
    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), true);

    // "Fruehling" with U+00FC (u-umlaut): contains one two-byte sequence.
    x = "Fr\xC3\xBC" "hling";
    v = x;
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'F');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'r');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'\u00FC');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'h');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'l');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'i');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'n');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'g');
    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);

    // U+4E2D U+6587 U+5B57: three three-byte sequences.
    x = "\xE4\xB8\xAD" "\xE6\x96\x87" "\xE5\xAD\x97";
    v = x;
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'\u4E2D');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'\u6587');
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, U'\u5B57');
    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);

    // "Man judge" emoji: U+1F468 MAN, U+200D ZERO WIDTH JOINER, U+2696 SCALES,
    // U+FE0F VARIATION SELECTOR-16 (one four-byte and three three-byte sequences).
    x = "\xF0\x9F\x91\xA8" "\xE2\x80\x8D" "\xE2\x9A\x96" "\xEF\xB8\x8F";
    v = x;
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 0x1F468);
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 0x200D);
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 0x2696);
    GeneralUtils::_NextUTF8Char(v, out);
    ASSERT_EQ(out, 0xFE0F);
    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);

    // A byte that can never start a sequence yields U+FFFD and consumes exactly one byte.
    x = "\xFF";
    v = x;
    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), true);
    ASSERT_EQ(out, 0xFFFD);
    ASSERT_EQ(GeneralUtils::_NextUTF8Char(v, out), false);

    // Whole-string conversion, including a code point that needs a surrogate pair.
    ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Hello World!"), u"Hello World!");
    ASSERT_EQ(GeneralUtils::UTF8ToUTF16("Fr\xC3\xBC" "hling"), u"Fr\u00FChling");
    ASSERT_EQ(GeneralUtils::UTF8ToUTF16("\xE4\xB8\xAD" "\xE6\x96\x87" "\xE5\xAD\x97"), u"\u4E2D\u6587\u5B57");
    ASSERT_EQ(GeneralUtils::UTF8ToUTF16("\xF0\x9F\x91\xA8" "\xE2\x80\x8D" "\xE2\x9A\x96" "\xEF\xB8\x8F"), u"\U0001F468\u200D\u2696\uFE0F");

    return 0;
}
|
Loading…
Reference in New Issue
Block a user