From 5a6ddb2ea666e895890d3cb690cce5101cf12652 Mon Sep 17 00:00:00 2001
From: Kitty Barnett <develop@catznip.com>
Date: Thu, 7 Nov 2019 17:15:21 +0100
Subject: Fallback fonts can have first crack at adding an unknown character +
 set Twemoji as the viewer's fallback for all emoji blocks

---
 indra/llcommon/llstring.cpp | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'indra/llcommon/llstring.cpp')
diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index 0174c411b4..b272728200 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -30,6 +30,7 @@
 #include "llerror.h"
 #include "llfasttimer.h"
 #include "llsd.h"
+#include <unicode/uchar.h>
 #include <vector>
 
 #if LL_WINDOWS
@@ -888,6 +889,31 @@ std::string LLStringOps::sDayFormat;
 std::string LLStringOps::sAM;
 std::string LLStringOps::sPM;
 
+// static
+bool LLStringOps::isEmoji(llwchar wch)
+{
+	switch (ublock_getCode(wch))
+	{
+		case UBLOCK_MISCELLANEOUS_SYMBOLS:
+		case UBLOCK_DINGBATS:
+		case UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS:
+		case UBLOCK_EMOTICONS:
+		case UBLOCK_TRANSPORT_AND_MAP_SYMBOLS:
+#if U_ICU_VERSION_MAJOR_NUM > 56
+		// Boost uses ICU so we can't update it independently
+		case UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS:
+#endif // U_ICU_VERSION_MAJOR_NUM > 56
+			return true;
+		default:
+#if U_ICU_VERSION_MAJOR_NUM > 56
+			return false;
+#else
+			// See https://en.wikipedia.org/wiki/Supplemental_Symbols_and_Pictographs
+			return wch >= 0x1F900 && wch <= 0x1F9FF;
+#endif // U_ICU_VERSION_MAJOR_NUM > 56
+	}
+}
+
 
 S32	LLStringOps::collate(const llwchar* a, const llwchar* b)
 { 
-- 
cgit v1.2.3


From 671978e3927bc3ba9fc34008bbb7efd6f07b6c81 Mon Sep 17 00:00:00 2001
From: Alexander Gavriliuk <alexandrgproductengine@lindenlab.com>
Date: Wed, 17 May 2023 14:28:36 +0200
Subject: SL-19575 Create emoji gallery (fix bug with drawing emojis in chat
 history)

---
 indra/llcommon/llstring.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'indra/llcommon/llstring.cpp')

diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index cda1791e45..d68cbaa22c 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -837,10 +837,19 @@ std::string LLStringOps::sPM;
 // static
 bool LLStringOps::isEmoji(llwchar wch)
 {
-	switch (ublock_getCode(wch))
-	{
+	int ublock = ublock_getCode(wch);
+	switch (ublock)
+	{
+		case UBLOCK_GENERAL_PUNCTUATION:
+		case UBLOCK_LETTERLIKE_SYMBOLS:
+		case UBLOCK_ARROWS:
+		case UBLOCK_MISCELLANEOUS_TECHNICAL:
+		case UBLOCK_ENCLOSED_ALPHANUMERICS:
+		case UBLOCK_GEOMETRIC_SHAPES:
 		case UBLOCK_MISCELLANEOUS_SYMBOLS:
 		case UBLOCK_DINGBATS:
+		case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
+		case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS:
 		case UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS:
 		case UBLOCK_EMOTICONS:
 		case UBLOCK_TRANSPORT_AND_MAP_SYMBOLS:
-- 
cgit v1.2.3


From 2fad5a770b3583e576992d075c24bc0e25443053 Mon Sep 17 00:00:00 2001
From: Alexander Gavriliuk <alexandrgproductengine@lindenlab.com>
Date: Thu, 30 Nov 2023 13:59:14 +0100
Subject: SL-19801 Log unicode characters for debug

---
 indra/llcommon/llstring.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'indra/llcommon/llstring.cpp')

diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index d68cbaa22c..81b0207038 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -365,6 +365,30 @@ S32 wchar_utf8_length(const llwchar wc)
 	}
 }
 
+std::string wchar_utf8_preview(const llwchar wc)
+{
+    std::ostringstream oss;
+    oss << std::hex << std::uppercase << (U32)wc;
+
+    U8 out_bytes[8];
+    U32 size = (U32)wchar_to_utf8chars(wc, (char*)out_bytes);
+
+    if (size > 1)
+    {
+        oss << " [";
+        for (U32 i = 0; i < size; ++i)
+        {
+            if (i)
+            {
+                oss << ", ";
+            }
+            oss << (int)out_bytes[i];
+        }
+        oss << "]";
+    }
+
+    return oss.str();
+}
 
 S32 wstring_utf8_length(const LLWString& wstr)
 {
-- 
cgit v1.2.3


From ae91ae43a51c58cc496f3947921fbf886c6be86e Mon Sep 17 00:00:00 2001
From: Alexander Gavriliuk <alexandrgproductengine@lindenlab.com>
Date: Mon, 15 Jan 2024 23:20:24 +0100
Subject: SL-20795 Part of previously typed emojis disappear in the 'Save
 settings as a preset...' option of the 'Preferences' floater

---
 indra/llcommon/llstring.cpp | 2 --
 1 file changed, 2 deletions(-)

(limited to 'indra/llcommon/llstring.cpp')

diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index 81b0207038..17d69351ec 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -339,8 +339,6 @@ S32 wchar_utf8_length(const llwchar wc)
 {
 	if (wc < 0x80)
 	{
-		// This case will also catch negative values which are
-		// technically invalid.
 		return 1;
 	}
 	else if (wc < 0x800)
-- 
cgit v1.2.3


From 7075717b7c4a57d6bef60697ee506096a7c1b1ab Mon Sep 17 00:00:00 2001
From: Alexander Gavriliuk <alexandrgproductengine@lindenlab.com>
Date: Wed, 7 Feb 2024 21:26:57 +0100
Subject: SL-20363 Add Advanced option 'Debug Unicode'

---
 indra/llcommon/llstring.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'indra/llcommon/llstring.cpp')

diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index 17d69351ec..ab34262515 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -623,6 +623,7 @@ std::string mbcsstring_makeASCII(const std::string& wstr)
 	}
 	return out_str;
 }
+
 std::string utf8str_removeCRLF(const std::string& utf8str)
 {
 	if (0 == utf8str.length())
@@ -644,6 +645,54 @@ std::string utf8str_removeCRLF(const std::string& utf8str)
 	return out;
 }
 
+std::string utf8str_showBytesUTF8(const std::string& utf8str)
+{
+    std::string result;
+
+    bool in_sequence = false;
+    for (U8 byte : utf8str)
+    {
+        if (byte >= 0x80) // Part of an UTF-8 sequence
+        {
+            if (!in_sequence) // Start new UTF-8 sequence
+            {
+                if (!result.empty() && result.back() != ' ')
+                    result += ' '; // Use space as separator between ASCII and UTF-8
+                result += '[';
+            }
+            else if (byte >= 0xC0) // Start another UTF-8 sequence
+            {
+                result += "] ["; // Use space as separator between UTF-8 and UTF-8
+            }
+            else // Continue the same UTF-8 sequence
+            {
+                result += '.';
+            }
+            result += llformat("%02X", byte); // The byte is represented in hexadecimal form
+            in_sequence = true;
+        }
+        else // ASCII symbol is represented as a character
+        {
+            if (in_sequence) // End of UTF-8 sequence
+            {
+                result += ']';
+                if (byte != ' ')
+                {
+                    result += ' '; // Use space as separator between UTF-8 and ASCII
+                }
+            }
+            result += byte;
+            in_sequence = false;
+        }
+    }
+    if (in_sequence) // End of UTF-8 sequence
+    {
+        result += ']';
+    }
+
+    return result;
+}
+
 #if LL_WINDOWS
 unsigned int ll_wstring_default_code_page()
 {
-- 
cgit v1.2.3


From afc9252372b2b511bb3f7caaaa0856989bbd3f46 Mon Sep 17 00:00:00 2001
From: Alexander Gavriliuk <alexandrgproductengine@lindenlab.com>
Date: Thu, 8 Feb 2024 21:55:59 +0100
Subject: SL-20363 Option 'Debug Unicode' - show unicode values

---
 indra/llcommon/llstring.cpp | 87 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 76 insertions(+), 11 deletions(-)

(limited to 'indra/llcommon/llstring.cpp')

diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index ab34262515..82dc7c9f80 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -645,49 +645,114 @@ std::string utf8str_removeCRLF(const std::string& utf8str)
 	return out;
 }
 
+llwchar utf8str_to_wchar(const std::string& utf8str, size_t offset, size_t length)
+{
+    switch (length)
+    {
+    case 2:
+        return ((utf8str[offset] & 0x1F) << 6) +
+                (utf8str[offset + 1] & 0x3F);
+    case 3:
+        return ((utf8str[offset] & 0x0F) << 12) +
+                ((utf8str[offset + 1] & 0x3F) << 6) +
+                (utf8str[offset + 2] & 0x3F);
+    case 4:
+        return ((utf8str[offset] & 0x07) << 18) +
+                ((utf8str[offset + 1] & 0x3F) << 12) +
+                ((utf8str[offset + 2] & 0x3F) << 6) +
+                (utf8str[offset + 3] & 0x3F);
+    case 5:
+        return ((utf8str[offset] & 0x03) << 24) +
+                ((utf8str[offset + 1] & 0x3F) << 18) +
+                ((utf8str[offset + 2] & 0x3F) << 12) +
+                ((utf8str[offset + 3] & 0x3F) << 6) +
+                (utf8str[offset + 4] & 0x3F);
+    case 6:
+        return ((utf8str[offset] & 0x01) << 30) +
+                ((utf8str[offset + 1] & 0x3F) << 24) +
+                ((utf8str[offset + 2] & 0x3F) << 18) +
+                ((utf8str[offset + 3] & 0x3F) << 12) +
+                ((utf8str[offset + 4] & 0x3F) << 6) +
+                (utf8str[offset + 5] & 0x3F);
+    case 7:
+        return ((utf8str[offset + 1] & 0x03) << 30) +
+                ((utf8str[offset + 2] & 0x3F) << 24) +
+                ((utf8str[offset + 3] & 0x3F) << 18) +
+                ((utf8str[offset + 4] & 0x3F) << 12) +
+                ((utf8str[offset + 5] & 0x3F) << 6) +
+                (utf8str[offset + 6] & 0x3F);
+    }
+    return LL_UNKNOWN_CHAR;
+}
+
 std::string utf8str_showBytesUTF8(const std::string& utf8str)
 {
     std::string result;
 
     bool in_sequence = false;
-    for (U8 byte : utf8str)
+    size_t sequence_size = 0;
+    size_t byte_index = 0;
+    size_t source_length = utf8str.size();
+
+    auto open_sequence = [&]()
+        {
+            if (!result.empty() && result.back() != '\n')
+                result += '\n'; // Use LF as a separator before new UTF-8 sequence
+            result += '[';
+            in_sequence = true;
+        };
+
+    auto close_sequence = [&]()
+        {
+            llwchar unicode = utf8str_to_wchar(utf8str, byte_index - sequence_size, sequence_size);
+            if (unicode != LL_UNKNOWN_CHAR)
+            {
+                result += llformat("+%04X", unicode);
+            }
+            result += ']';
+            in_sequence = false;
+            sequence_size = 0;
+        };
+
+    while (byte_index < source_length)
     {
+        U8 byte = utf8str[byte_index];
         if (byte >= 0x80) // Part of an UTF-8 sequence
         {
             if (!in_sequence) // Start new UTF-8 sequence
             {
-                if (!result.empty() && result.back() != ' ')
-                    result += ' '; // Use space as separator between ASCII and UTF-8
-                result += '[';
+                open_sequence();
             }
             else if (byte >= 0xC0) // Start another UTF-8 sequence
             {
-                result += "] ["; // Use space as separator between UTF-8 and UTF-8
+                close_sequence();
+                open_sequence();
             }
             else // Continue the same UTF-8 sequence
             {
                 result += '.';
             }
             result += llformat("%02X", byte); // The byte is represented in hexadecimal form
-            in_sequence = true;
+            ++sequence_size;
         }
         else // ASCII symbol is represented as a character
         {
             if (in_sequence) // End of UTF-8 sequence
             {
-                result += ']';
-                if (byte != ' ')
+                close_sequence();
+                if (byte != '\n')
                 {
-                    result += ' '; // Use space as separator between UTF-8 and ASCII
+                    result += '\n'; // Use LF as a separator between UTF-8 and ASCII
                 }
             }
             result += byte;
-            in_sequence = false;
         }
+        ++byte_index;
     }
+
     if (in_sequence) // End of UTF-8 sequence
     {
-        result += ']';
+        close_sequence();
     }
 
     return result;
-- 
cgit v1.2.3