diff options
author | Andrey Lihatskiy <alihatskiy@productengine.com> | 2024-03-04 18:01:05 +0200 |
---|---|---|
committer | Andrey Lihatskiy <alihatskiy@productengine.com> | 2024-03-04 18:01:05 +0200 |
commit | b68a05e7c30930976ed1273b4c7a0fec01d2a84f (patch) | |
tree | 03ababecc49376c0e607b3548d9beb596f79c3a7 /indra/llcommon/llstring.cpp | |
parent | d47c6536820d1ed6e373147678dd0fab90e80ab8 (diff) | |
parent | 701d1a33bb8227aa55a71f48caeb30a453e77ee0 (diff) |
Merge branch 'main' into marchcat/x-merge
# Conflicts:
# indra/llcommon/llstring.cpp
# indra/llcommon/llstring.h
Diffstat (limited to 'indra/llcommon/llstring.cpp')
-rw-r--r-- | indra/llcommon/llstring.cpp | 224 |
1 files changed, 167 insertions, 57 deletions
diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp index 9a2251e0a7..4aa54bb12d 100644 --- a/indra/llcommon/llstring.cpp +++ b/indra/llcommon/llstring.cpp @@ -30,6 +30,7 @@ #include "llerror.h" #include "llfasttimer.h" #include "llsd.h" +#include <unicode/uchar.h> #include <vector> #if LL_WINDOWS @@ -338,8 +339,6 @@ S32 wchar_utf8_length(const llwchar wc) { if (wc < 0x80) { - // This case will also catch negative values which are - // technically invalid. return 1; } else if (wc < 0x800) @@ -364,6 +363,30 @@ S32 wchar_utf8_length(const llwchar wc) } } +std::string wchar_utf8_preview(const llwchar wc) +{ + std::ostringstream oss; + oss << std::hex << std::uppercase << (U32)wc; + + U8 out_bytes[8]; + U32 size = (U32)wchar_to_utf8chars(wc, (char*)out_bytes); + + if (size > 1) + { + oss << " ["; + for (U32 i = 0; i < size; ++i) + { + if (i) + { + oss << ", "; + } + oss << (int)out_bytes[i]; + } + oss << "]"; + } + + return oss.str(); +} S32 wstring_utf8_length(const LLWString& wstr) { @@ -622,6 +645,119 @@ std::string utf8str_removeCRLF(const std::string& utf8str) return out; } +llwchar utf8str_to_wchar(const std::string& utf8str, size_t offset, size_t length) +{ + switch (length) + { + case 2: + return ((utf8str[offset] & 0x1F) << 6) + + (utf8str[offset + 1] & 0x3F); + case 3: + return ((utf8str[offset] & 0x0F) << 12) + + ((utf8str[offset + 1] & 0x3F) << 6) + + (utf8str[offset + 2] & 0x3F); + case 4: + return ((utf8str[offset] & 0x07) << 18) + + ((utf8str[offset + 1] & 0x3F) << 12) + + ((utf8str[offset + 2] & 0x3F) << 6) + + (utf8str[offset + 3] & 0x3F); + case 5: + return ((utf8str[offset] & 0x03) << 24) + + ((utf8str[offset + 1] & 0x3F) << 18) + + ((utf8str[offset + 2] & 0x3F) << 12) + + ((utf8str[offset + 3] & 0x3F) << 6) + + (utf8str[offset + 4] & 0x3F); + case 6: + return ((utf8str[offset] & 0x01) << 30) + + ((utf8str[offset + 1] & 0x3F) << 24) + + ((utf8str[offset + 2] & 0x3F) << 18) + + ((utf8str[offset + 3] & 0x3F) << 12) + + ((utf8str[offset + 4] & 0x3F) << 6) + + (utf8str[offset + 5] & 0x3F); + case 7: + return ((utf8str[offset + 1] & 0x03) << 30) + + ((utf8str[offset + 2] & 0x3F) << 24) + + ((utf8str[offset + 3] & 0x3F) << 18) + + ((utf8str[offset + 4] & 0x3F) << 12) + + ((utf8str[offset + 5] & 0x3F) << 6) + + (utf8str[offset + 6] & 0x3F); + } + return LL_UNKNOWN_CHAR; +} + +std::string utf8str_showBytesUTF8(const std::string& utf8str) +{ + std::string result; + + bool in_sequence = false; + size_t sequence_size = 0; + size_t byte_index = 0; + size_t source_length = utf8str.size(); + + auto open_sequence = [&]() + { + if (!result.empty() && result.back() != '\n') + result += '\n'; // Use LF as a separator before new UTF-8 sequence + result += '['; + in_sequence = true; + }; + + auto close_sequence = [&]() + { + llwchar unicode = utf8str_to_wchar(utf8str, byte_index - sequence_size, sequence_size); + if (unicode != LL_UNKNOWN_CHAR) + { + result += llformat("+%04X", unicode); + } + result += ']'; + in_sequence = false; + sequence_size = 0; + }; + + while (byte_index < source_length) + { + U8 byte = utf8str[byte_index]; + if (byte >= 0x80) // Part of an UTF-8 sequence + { + if (!in_sequence) // Start new UTF-8 sequence + { + open_sequence(); + } + else if (byte >= 0xC0) // Start another UTF-8 sequence + { + close_sequence(); + open_sequence(); + } + else // Continue the same UTF-8 sequence + { + result += '.'; + } + result += llformat("%02X", byte); // The byte is represented in hexadecimal form + ++sequence_size; + } + else // ASCII symbol is represented as a character + { + if (in_sequence) // End of UTF-8 sequence + { + close_sequence(); + if (byte != '\n') + { + result += '\n'; // Use LF as a separator between UTF-8 and ASCII + } + } + result += byte; + } + ++byte_index; + } + + if (in_sequence) // End of UTF-8 sequence + { + close_sequence(); + } + + return result; +} + // Search for any emoji symbol, return true if found bool wstring_has_emoji(const LLWString& wstr) { @@ -874,61 +1010,35 @@ std::string LLStringOps::sPM; // static bool LLStringOps::isEmoji(llwchar wch) { - // Most of the following symbols are not actually emoticons, but rather small pictures - - // 0x1F000 .. 0x1F02F - mahjong tiles - // https://symbl.cc/en/unicode/table/#mahjong-tiles - - // 0x1F030 .. 0x1F09F - domino tiles - // https://symbl.cc/en/unicode/table/#domino-tiles - - // 0x1F0A0 .. 0x1F0FF - playing cards - // https://symbl.cc/en/unicode/table/#playing-cards - - // 0x1F100 .. 0x1F1FF - enclosed alphanumeric supplement - // https://symbl.cc/en/unicode/table/#enclosed-alphanumeric-supplement - - // 0x1F200 .. 0x1F2FF - enclosed ideographic supplement - // https://symbl.cc/en/unicode/table/#enclosed-ideographic-supplement - - // 0x1F300 .. 0x1F5FF - miscellaneous symbols and pictographs - // https://symbl.cc/en/unicode/table/#miscellaneous-symbols-and-pictographs - - // 0x1F600 .. 0x1F64F - emoticons - // https://symbl.cc/en/unicode/table/#emoticons - - // 0x1F650 .. 0x1F67F - ornamental dingbats - // https://symbl.cc/en/unicode/table/#ornamental-dingbats - - // 0x1F680 .. 0x1F6FF - transport and map symbols - // https://symbl.cc/en/unicode/table/#transport-and-map-symbols - - // 0x1F700 .. 0x1F77F - alchemical symbols - // https://symbl.cc/en/unicode/table/#alchemical-symbols - - // 0x1F780 .. 0x1F7FF - geometric shapes extended - // https://symbl.cc/en/unicode/table/#geometric-shapes-extended - - // 0x1F800 .. 0x1F8FF - supplemental arrows c - // https://symbl.cc/en/unicode/table/#supplemental-arrows-c - - // 0x1F900 .. 0x1F9FF - supplemental symbols and pictographs - // https://symbl.cc/en/unicode/table/#supplemental-symbols-and-pictographs - - // 0x1FA00 .. 0x1FA6F - chess symbols - // https://symbl.cc/en/unicode/table/#chess-symbols - - // 0x1FA70 .. 0x1FAFF - symbols and pictographs extended a - // https://symbl.cc/en/unicode/table/#symbols-and-pictographs-extended-a - - // 0x1FB00 .. 0x1FBFF - symbols for legacy computing - // https://symbl.cc/en/unicode/table/#symbols-for-legacy-computing - - // 0x1FC00 .. 0x1FFFF - undefined block 44 - // These symbols aren't defined yet - // https://symbl.cc/en/unicode/table/#undefined-block-44 - - return wch >= 0x1F000 && wch < 0x1FC00; + int ublock = ublock_getCode(wch); + switch (ublock) + { + case UBLOCK_GENERAL_PUNCTUATION: + case UBLOCK_LETTERLIKE_SYMBOLS: + case UBLOCK_ARROWS: + case UBLOCK_MISCELLANEOUS_TECHNICAL: + case UBLOCK_ENCLOSED_ALPHANUMERICS: + case UBLOCK_GEOMETRIC_SHAPES: + case UBLOCK_MISCELLANEOUS_SYMBOLS: + case UBLOCK_DINGBATS: + case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION: + case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS: + case UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS: + case UBLOCK_EMOTICONS: + case UBLOCK_TRANSPORT_AND_MAP_SYMBOLS: +#if U_ICU_VERSION_MAJOR_NUM > 56 + // Boost uses ICU so we can't update it independently + case UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS: +#endif // U_ICU_VERSION_MAJOR_NUM > 56 + return true; + default: +#if U_ICU_VERSION_MAJOR_NUM > 56 + return false; +#else + // See https://en.wikipedia.org/wiki/Supplemental_Symbols_and_Pictographs + return wch >= 0x1F900 && wch <= 0x1F9FF; +#endif // U_ICU_VERSION_MAJOR_NUM > 56 + } } |