summaryrefslogtreecommitdiff
path: root/indra/llcommon
diff options
context:
space:
mode:
authorAlexander Gavriliuk <alexandrgproductengine@lindenlab.com>2024-02-08 21:55:59 +0100
committerGuru <alexandrgproductengine@lindenlab.com>2024-02-09 09:16:17 +0300
commitafc9252372b2b511bb3f7caaaa0856989bbd3f46 (patch)
treea1ae5fd9ae642a70441dc2b3d2214850bc22d1cb /indra/llcommon
parent7075717b7c4a57d6bef60697ee506096a7c1b1ab (diff)
SL-20363 Option 'Debug Unicode' - show unicode values
Diffstat (limited to 'indra/llcommon')
-rw-r--r--indra/llcommon/llstring.cpp87
-rw-r--r--indra/llcommon/llstring.h2
2 files changed, 78 insertions, 11 deletions
diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index ab34262515..82dc7c9f80 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -645,49 +645,114 @@ std::string utf8str_removeCRLF(const std::string& utf8str)
return out;
}
+llwchar utf8str_to_wchar(const std::string& utf8str, size_t offset, size_t length)
+{
+ switch (length)
+ {
+ case 2:
+ return ((utf8str[offset] & 0x1F) << 6) +
+ (utf8str[offset + 1] & 0x3F);
+ case 3:
+ return ((utf8str[offset] & 0x0F) << 12) +
+ ((utf8str[offset + 1] & 0x3F) << 6) +
+ (utf8str[offset + 2] & 0x3F);
+ case 4:
+ return ((utf8str[offset] & 0x07) << 18) +
+ ((utf8str[offset + 1] & 0x3F) << 12) +
+ ((utf8str[offset + 2] & 0x3F) << 6) +
+ (utf8str[offset + 3] & 0x3F);
+ case 5:
+ return ((utf8str[offset] & 0x03) << 24) +
+ ((utf8str[offset + 1] & 0x3F) << 18) +
+ ((utf8str[offset + 2] & 0x3F) << 12) +
+ ((utf8str[offset + 3] & 0x3F) << 6) +
+ (utf8str[offset + 4] & 0x3F);
+ case 6:
+ return ((utf8str[offset] & 0x01) << 30) +
+ ((utf8str[offset + 1] & 0x3F) << 24) +
+ ((utf8str[offset + 2] & 0x3F) << 18) +
+ ((utf8str[offset + 3] & 0x3F) << 12) +
+ ((utf8str[offset + 4] & 0x3F) << 6) +
+ (utf8str[offset + 5] & 0x3F);
+ case 7:
+ return ((utf8str[offset + 1] & 0x03) << 30) +
+ ((utf8str[offset + 2] & 0x3F) << 24) +
+ ((utf8str[offset + 3] & 0x3F) << 18) +
+ ((utf8str[offset + 4] & 0x3F) << 12) +
+ ((utf8str[offset + 5] & 0x3F) << 6) +
+ (utf8str[offset + 6] & 0x3F);
+ }
+ return LL_UNKNOWN_CHAR;
+}
+
std::string utf8str_showBytesUTF8(const std::string& utf8str)
{
std::string result;
bool in_sequence = false;
- for (U8 byte : utf8str)
+ size_t sequence_size = 0;
+ size_t byte_index = 0;
+ size_t source_length = utf8str.size();
+
+ auto open_sequence = [&]()
+ {
+ if (!result.empty() && result.back() != '\n')
+ result += '\n'; // Use LF as a separator before new UTF-8 sequence
+ result += '[';
+ in_sequence = true;
+ };
+
+ auto close_sequence = [&]()
+ {
+ llwchar unicode = utf8str_to_wchar(utf8str, byte_index - sequence_size, sequence_size);
+ if (unicode != LL_UNKNOWN_CHAR)
+ {
+ result += llformat("+%04X", unicode);
+ }
+ result += ']';
+ in_sequence = false;
+ sequence_size = 0;
+ };
+
+ while (byte_index < source_length)
{
+ U8 byte = utf8str[byte_index];
if (byte >= 0x80) // Part of an UTF-8 sequence
{
if (!in_sequence) // Start new UTF-8 sequence
{
- if (!result.empty() && result.back() != ' ')
- result += ' '; // Use space as separator between ASCII and UTF-8
- result += '[';
+ open_sequence();
}
else if (byte >= 0xC0) // Start another UTF-8 sequence
{
- result += "] ["; // Use space as separator between UTF-8 and UTF-8
+ close_sequence();
+ open_sequence();
}
else // Continue the same UTF-8 sequence
{
result += '.';
}
result += llformat("%02X", byte); // The byte is represented in hexadecimal form
- in_sequence = true;
+ ++sequence_size;
}
else // ASCII symbol is represented as a character
{
if (in_sequence) // End of UTF-8 sequence
{
- result += ']';
- if (byte != ' ')
+ close_sequence();
+ if (byte != '\n')
{
- result += ' '; // Use space as separator between UTF-8 and ASCII
+ result += '\n'; // Use LF as a separator between UTF-8 and ASCII
}
}
result += byte;
- in_sequence = false;
}
+ ++byte_index;
}
+
if (in_sequence) // End of UTF-8 sequence
{
- result += ']';
+ close_sequence();
}
return result;
diff --git a/indra/llcommon/llstring.h b/indra/llcommon/llstring.h
index 38b9c3e23c..bfbf25d9ab 100644
--- a/indra/llcommon/llstring.h
+++ b/indra/llcommon/llstring.h
@@ -743,6 +743,8 @@ LL_COMMON_API std::string mbcsstring_makeASCII(const std::string& str);
LL_COMMON_API std::string utf8str_removeCRLF(const std::string& utf8str);
+LL_COMMON_API llwchar utf8str_to_wchar(const std::string& utf8str, size_t offset, size_t length);
+
LL_COMMON_API std::string utf8str_showBytesUTF8(const std::string& utf8str);
#if LL_WINDOWS