diff options
author | Alexander Gavriliuk <alexandrgproductengine@lindenlab.com> | 2024-02-08 21:55:59 +0100 |
---|---|---|
committer | Guru <alexandrgproductengine@lindenlab.com> | 2024-02-09 09:16:17 +0300 |
commit | afc9252372b2b511bb3f7caaaa0856989bbd3f46 (patch) | |
tree | a1ae5fd9ae642a70441dc2b3d2214850bc22d1cb /indra/llcommon | |
parent | 7075717b7c4a57d6bef60697ee506096a7c1b1ab (diff) |
SL-20363 Option 'Debug Unicode' - show unicode values
Diffstat (limited to 'indra/llcommon')
-rw-r--r-- | indra/llcommon/llstring.cpp | 87 | ||||
-rw-r--r-- | indra/llcommon/llstring.h | 2 |
2 files changed, 78 insertions, 11 deletions
// ---------------------------------------------------------------------------
// Reviewer annotations for the unified diff that follows (commit afc92523,
// SL-20363). The diff text itself is left untouched; its original line
// breaks were lost, so it appears as two very long physical lines.
//
// indra/llcommon/llstring.cpp
//
//   utf8str_to_wchar(utf8str, offset, length)   [added]
//     Combines `length` bytes of `utf8str`, starting at `offset`, into one
//     llwchar value. It switches on `length` only:
//       2..6  - masks the first byte with 0x1F / 0x0F / 0x07 / 0x03 / 0x01
//               and every following byte with 0x3F, shifting each group
//               left by a multiple of 6 bits before adding them up;
//       7     - ignores the byte at `offset`, keeps 2 bits (0x03) of the
//               byte at offset + 1 shifted left by 30, then 6 bits of each
//               remaining byte;
//       other - returns LL_UNKNOWN_CHAR (this includes lengths 0 and 1).
//     NOTE(review): the first byte is never checked against `length`, and
//       the following bytes are never checked to be continuation bytes, so
//       a truncated or over-long run is decoded with whichever formula
//       matches its byte count.
//     NOTE(review): no bounds check on offset + length is visible here;
//       the function appears to rely on the caller passing a valid range.
//     NOTE(review): the masked operands are promoted to int, so in case 7
//       `(x & 0x03) << 30` can exceed INT_MAX - confirm this is well
//       defined for the language standard the project builds with.
//
//   utf8str_showBytesUTF8(utf8str)   [rewritten]
//     Walks `utf8str` by index and builds a printable string:
//       - a byte below 0x80 is appended unchanged;
//       - a byte at or above 0x80 is appended as llformat("%02X", byte)
//         inside a bracketed group; further bytes below 0xC0 in the same
//         group are preceded by '.'; a byte at or above 0xC0 closes the
//         current group and opens a new one;
//       - open_sequence appends '\n' first when the output is non-empty
//         and does not already end in '\n', then appends '[';
//       - close_sequence calls utf8str_to_wchar on the bytes of the group
//         (byte_index - sequence_size, sequence_size) and, unless the
//         result is LL_UNKNOWN_CHAR, appends llformat("+%04X", unicode)
//         before the closing ']'; it then resets in_sequence and
//         sequence_size;
//       - when a byte below 0x80 ends a group, '\n' is appended after ']'
//         unless that byte is itself '\n';
//       - a group still open at the end of input is closed.
//     Compared with the removed lines, the separator changes from ' ' to
//     '\n' and the decoded value is appended to each group.
//     Presumably (assuming llformat is printf-style - confirm) the bytes
//     C3 A9 are rendered as "[C3.A9+00E9]".
//     NOTE(review): a byte in 0x80..0xBF seen outside a group also opens
//       a group (the `!in_sequence` branch does not test for >= 0xC0).
//
// indra/llcommon/llstring.h
//   Adds the LL_COMMON_API declaration of utf8str_to_wchar between the
//   declarations of utf8str_removeCRLF and utf8str_showBytesUTF8.
// ---------------------------------------------------------------------------
diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp index ab34262515..82dc7c9f80 100644 --- a/indra/llcommon/llstring.cpp +++ b/indra/llcommon/llstring.cpp @@ -645,49 +645,114 @@ std::string utf8str_removeCRLF(const std::string& utf8str) return out; } +llwchar utf8str_to_wchar(const std::string& utf8str, size_t offset, size_t length) +{ + switch (length) + { + case 2: + return ((utf8str[offset] & 0x1F) << 6) + + (utf8str[offset + 1] & 0x3F); + case 3: + return ((utf8str[offset] & 0x0F) << 12) + + ((utf8str[offset + 1] & 0x3F) << 6) + + (utf8str[offset + 2] & 0x3F); + case 4: + return ((utf8str[offset] & 0x07) << 18) + + ((utf8str[offset + 1] & 0x3F) << 12) + + ((utf8str[offset + 2] & 0x3F) << 6) + + (utf8str[offset + 3] & 0x3F); + case 5: + return ((utf8str[offset] & 0x03) << 24) + + ((utf8str[offset + 1] & 0x3F) << 18) + + ((utf8str[offset + 2] & 0x3F) << 12) + + ((utf8str[offset + 3] & 0x3F) << 6) + + (utf8str[offset + 4] & 0x3F); + case 6: + return ((utf8str[offset] & 0x01) << 30) + + ((utf8str[offset + 1] & 0x3F) << 24) + + ((utf8str[offset + 2] & 0x3F) << 18) + + ((utf8str[offset + 3] & 0x3F) << 12) + + ((utf8str[offset + 4] & 0x3F) << 6) + + (utf8str[offset + 5] & 0x3F); + case 7: + return ((utf8str[offset + 1] & 0x03) << 30) + + ((utf8str[offset + 2] & 0x3F) << 24) + + ((utf8str[offset + 3] & 0x3F) << 18) + + ((utf8str[offset + 4] & 0x3F) << 12) + + ((utf8str[offset + 5] & 0x3F) << 6) + + (utf8str[offset + 6] & 0x3F); + } + return LL_UNKNOWN_CHAR; +} + std::string utf8str_showBytesUTF8(const std::string& utf8str) { std::string result; bool in_sequence = false; - for (U8 byte : utf8str) + size_t sequence_size = 0; + size_t byte_index = 0; + size_t source_length = utf8str.size(); + + auto open_sequence = [&]() + { + if (!result.empty() && result.back() != '\n') + result += '\n'; // Use LF as a separator before new UTF-8 sequence + result += '['; + in_sequence = true; + }; + + auto close_sequence = [&]() + { + llwchar unicode = 
utf8str_to_wchar(utf8str, byte_index - sequence_size, sequence_size); + if (unicode != LL_UNKNOWN_CHAR) + { + result += llformat("+%04X", unicode); + } + result += ']'; + in_sequence = false; + sequence_size = 0; + }; + + while (byte_index < source_length) { + U8 byte = utf8str[byte_index]; if (byte >= 0x80) // Part of an UTF-8 sequence { if (!in_sequence) // Start new UTF-8 sequence { - if (!result.empty() && result.back() != ' ') - result += ' '; // Use space as separator between ASCII and UTF-8 - result += '['; + open_sequence(); } else if (byte >= 0xC0) // Start another UTF-8 sequence { - result += "] ["; // Use space as separator between UTF-8 and UTF-8 + close_sequence(); + open_sequence(); } else // Continue the same UTF-8 sequence { result += '.'; } result += llformat("%02X", byte); // The byte is represented in hexadecimal form - in_sequence = true; + ++sequence_size; } else // ASCII symbol is represented as a character { if (in_sequence) // End of UTF-8 sequence { - result += ']'; - if (byte != ' ') + close_sequence(); + if (byte != '\n') { - result += ' '; // Use space as separator between UTF-8 and ASCII + result += '\n'; // Use LF as a separator between UTF-8 and ASCII } } result += byte; - in_sequence = false; } + ++byte_index; } + if (in_sequence) // End of UTF-8 sequence { - result += ']'; + close_sequence(); } return result; diff --git a/indra/llcommon/llstring.h b/indra/llcommon/llstring.h index 38b9c3e23c..bfbf25d9ab 100644 --- a/indra/llcommon/llstring.h +++ b/indra/llcommon/llstring.h @@ -743,6 +743,8 @@ LL_COMMON_API std::string mbcsstring_makeASCII(const std::string& str); LL_COMMON_API std::string utf8str_removeCRLF(const std::string& utf8str); +LL_COMMON_API llwchar utf8str_to_wchar(const std::string& utf8str, size_t offset, size_t length); + LL_COMMON_API std::string utf8str_showBytesUTF8(const std::string& utf8str); #if LL_WINDOWS |