summaryrefslogtreecommitdiff
path: root/indra/llcommon/llstring.cpp
diff options
context:
space:
mode:
authorNat Goodspeed <nat@lindenlab.com>2024-03-08 10:58:24 -0500
committerNat Goodspeed <nat@lindenlab.com>2024-03-08 10:58:24 -0500
commitee8bc528bbf97ee8cc488651fb263b07b5c1c818 (patch)
tree292aa9a4b26749cc877f6ea8e500ec24793a20da /indra/llcommon/llstring.cpp
parentde71c6378e60c0f0ea0c5537729f052da8513b19 (diff)
parentafc943acbc2bb79e2e1aa5d5eaf448e01b6c2b00 (diff)
Merge branch 'main' into release/luau-scripting for Emoji release.
Diffstat (limited to 'indra/llcommon/llstring.cpp')
-rw-r--r--indra/llcommon/llstring.cpp175
1 files changed, 173 insertions, 2 deletions
diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index f6629803ee..82dc7c9f80 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -30,6 +30,7 @@
#include "llerror.h"
#include "llfasttimer.h"
#include "llsd.h"
+#include <unicode/uchar.h>
#include <vector>
#if LL_WINDOWS
@@ -338,8 +339,6 @@ S32 wchar_utf8_length(const llwchar wc)
{
if (wc < 0x80)
{
- // This case will also catch negative values which are
- // technically invalid.
return 1;
}
else if (wc < 0x800)
@@ -364,6 +363,30 @@ S32 wchar_utf8_length(const llwchar wc)
}
}
+std::string wchar_utf8_preview(const llwchar wc)
+{
+ std::ostringstream oss;
+ oss << std::hex << std::uppercase << (U32)wc;
+
+ U8 out_bytes[8];
+ U32 size = (U32)wchar_to_utf8chars(wc, (char*)out_bytes);
+
+ if (size > 1)
+ {
+ oss << " [";
+ for (U32 i = 0; i < size; ++i)
+ {
+ if (i)
+ {
+ oss << ", ";
+ }
+ oss << (int)out_bytes[i];
+ }
+ oss << "]";
+ }
+
+ return oss.str();
+}
S32 wstring_utf8_length(const LLWString& wstr)
{
@@ -600,6 +623,7 @@ std::string mbcsstring_makeASCII(const std::string& wstr)
}
return out_str;
}
+
std::string utf8str_removeCRLF(const std::string& utf8str)
{
if (0 == utf8str.length())
@@ -621,6 +645,119 @@ std::string utf8str_removeCRLF(const std::string& utf8str)
return out;
}
+llwchar utf8str_to_wchar(const std::string& utf8str, size_t offset, size_t length)
+{
+ switch (length)
+ {
+ case 2:
+ return ((utf8str[offset] & 0x1F) << 6) +
+ (utf8str[offset + 1] & 0x3F);
+ case 3:
+ return ((utf8str[offset] & 0x0F) << 12) +
+ ((utf8str[offset + 1] & 0x3F) << 6) +
+ (utf8str[offset + 2] & 0x3F);
+ case 4:
+ return ((utf8str[offset] & 0x07) << 18) +
+ ((utf8str[offset + 1] & 0x3F) << 12) +
+ ((utf8str[offset + 2] & 0x3F) << 6) +
+ (utf8str[offset + 3] & 0x3F);
+ case 5:
+ return ((utf8str[offset] & 0x03) << 24) +
+ ((utf8str[offset + 1] & 0x3F) << 18) +
+ ((utf8str[offset + 2] & 0x3F) << 12) +
+ ((utf8str[offset + 3] & 0x3F) << 6) +
+ (utf8str[offset + 4] & 0x3F);
+ case 6:
+ return ((utf8str[offset] & 0x01) << 30) +
+ ((utf8str[offset + 1] & 0x3F) << 24) +
+ ((utf8str[offset + 2] & 0x3F) << 18) +
+ ((utf8str[offset + 3] & 0x3F) << 12) +
+ ((utf8str[offset + 4] & 0x3F) << 6) +
+ (utf8str[offset + 5] & 0x3F);
+ case 7:
+ return ((utf8str[offset + 1] & 0x03) << 30) +
+ ((utf8str[offset + 2] & 0x3F) << 24) +
+ ((utf8str[offset + 3] & 0x3F) << 18) +
+ ((utf8str[offset + 4] & 0x3F) << 12) +
+ ((utf8str[offset + 5] & 0x3F) << 6) +
+ (utf8str[offset + 6] & 0x3F);
+ }
+ return LL_UNKNOWN_CHAR;
+}
+
+std::string utf8str_showBytesUTF8(const std::string& utf8str)
+{
+ std::string result;
+
+ bool in_sequence = false;
+ size_t sequence_size = 0;
+ size_t byte_index = 0;
+ size_t source_length = utf8str.size();
+
+ auto open_sequence = [&]()
+ {
+ if (!result.empty() && result.back() != '\n')
+ result += '\n'; // Use LF as a separator before new UTF-8 sequence
+ result += '[';
+ in_sequence = true;
+ };
+
+ auto close_sequence = [&]()
+ {
+ llwchar unicode = utf8str_to_wchar(utf8str, byte_index - sequence_size, sequence_size);
+ if (unicode != LL_UNKNOWN_CHAR)
+ {
+ result += llformat("+%04X", unicode);
+ }
+ result += ']';
+ in_sequence = false;
+ sequence_size = 0;
+ };
+
+ while (byte_index < source_length)
+ {
+ U8 byte = utf8str[byte_index];
+ if (byte >= 0x80) // Part of an UTF-8 sequence
+ {
+ if (!in_sequence) // Start new UTF-8 sequence
+ {
+ open_sequence();
+ }
+ else if (byte >= 0xC0) // Start another UTF-8 sequence
+ {
+ close_sequence();
+ open_sequence();
+ }
+ else // Continue the same UTF-8 sequence
+ {
+ result += '.';
+ }
+ result += llformat("%02X", byte); // The byte is represented in hexadecimal form
+ ++sequence_size;
+ }
+ else // ASCII symbol is represented as a character
+ {
+ if (in_sequence) // End of UTF-8 sequence
+ {
+ close_sequence();
+ if (byte != '\n')
+ {
+ result += '\n'; // Use LF as a separator between UTF-8 and ASCII
+ }
+ }
+ result += byte;
+ }
+ ++byte_index;
+ }
+
+ if (in_sequence) // End of UTF-8 sequence
+ {
+ close_sequence();
+ }
+
+ return result;
+}
+
#if LL_WINDOWS
unsigned int ll_wstring_default_code_page()
{
@@ -833,6 +970,40 @@ std::string LLStringOps::sDayFormat;
std::string LLStringOps::sAM;
std::string LLStringOps::sPM;
+// static
+bool LLStringOps::isEmoji(llwchar wch)
+{
+ int ublock = ublock_getCode(wch);
+ switch (ublock)
+ {
+ case UBLOCK_GENERAL_PUNCTUATION:
+ case UBLOCK_LETTERLIKE_SYMBOLS:
+ case UBLOCK_ARROWS:
+ case UBLOCK_MISCELLANEOUS_TECHNICAL:
+ case UBLOCK_ENCLOSED_ALPHANUMERICS:
+ case UBLOCK_GEOMETRIC_SHAPES:
+ case UBLOCK_MISCELLANEOUS_SYMBOLS:
+ case UBLOCK_DINGBATS:
+ case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
+ case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS:
+ case UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS:
+ case UBLOCK_EMOTICONS:
+ case UBLOCK_TRANSPORT_AND_MAP_SYMBOLS:
+#if U_ICU_VERSION_MAJOR_NUM > 56
+ // Boost uses ICU so we can't update it independently
+ case UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS:
+#endif // U_ICU_VERSION_MAJOR_NUM > 56
+ return true;
+ default:
+#if U_ICU_VERSION_MAJOR_NUM > 56
+ return false;
+#else
+ // See https://en.wikipedia.org/wiki/Supplemental_Symbols_and_Pictographs
+ return wch >= 0x1F900 && wch <= 0x1F9FF;
+#endif // U_ICU_VERSION_MAJOR_NUM > 56
+ }
+}
+
S32 LLStringOps::collate(const llwchar* a, const llwchar* b)
{