author    Nat Goodspeed <nat@lindenlab.com>    2021-10-27 13:01:37 -0400
committer Nat Goodspeed <nat@lindenlab.com>    2021-10-27 13:01:37 -0400
commit    af5c5a994b90a27e16ef6f2f5044e096269e4217 (patch)
tree      b8b1c55265a02da48b92e2ec412fd79045c836b5
parent    cbaba2df56c66926e051d50b6cb02955c81c2a6c (diff)
SL-16207: Update llstring.h handling of different string types.
In llpreprocessor.h, consider the case of clang on Windows: #define LL_WCHAR_T_NATIVE there as well as for the Microsoft compiler with the /Zc:wchar_t switch. In stdtypes.h, inject an LLWCHAR_IS_WCHAR_T symbol to allow the preprocessor to make decisions about when the types are identical.

llstring.h's conversion logic deals with three types of wide string (LLWString, std::wstring and llutf16string) based on three types of wide char (llwchar, wchar_t and U16, respectively). Sometimes all three are distinct types, sometimes wchar_t is identical to llwchar, and sometimes wchar_t is identical to U16. Rationalize the three cases using the ll_convert_u16_alias() and new ll_convert_wstr_alias() macros.

stringize.h was directly calling wstring_to_utf8str() and utf8str_to_wstring(), which produced errors with VS 2019 clang since there isn't actually a wstring_to_utf8str(std::wstring) overload. Use ll_convert<std::string>() instead, since that redirects to the relevant ll_convert_wide_to_string() function. (And now you see why we've been trying to migrate to the uniform ll_convert<target>() wrapper!)

Similarly, call ll_convert<std::wstring>() instead of a two-step conversion: utf8str_to_wstring() producing LLWString, then a character-by-character copy from LLWString to std::wstring. That isn't even correct: on Windows, we should be encoding from UTF-32 to UTF-16.
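As a minimal sketch of the caller-side difference described above (variable names invented for illustration):

    #include "llstring.h"   // ll_convert<>, ll_convert_wide_to_string(), etc.
    #include <string>

    std::wstring wide = L"hello";

    // Before: broke under VS 2019 clang, because no
    // wstring_to_utf8str(std::wstring) overload exists in that configuration.
    //std::string utf8 = wstring_to_utf8str(wide);

    // After: the uniform wrapper picks the matching specialization, which
    // forwards to ll_convert_wide_to_string().
    std::string utf8 = ll_convert<std::string>(wide);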
-rw-r--r--  indra/llcommon/llpreprocessor.h |  4
-rw-r--r--  indra/llcommon/llstring.h       | 65
-rw-r--r--  indra/llcommon/stdtypes.h       |  7
-rw-r--r--  indra/llcommon/stringize.h      | 13
4 files changed, 50 insertions, 39 deletions
diff --git a/indra/llcommon/llpreprocessor.h b/indra/llcommon/llpreprocessor.h
index b17a8e761a..dc586b0008 100644
--- a/indra/llcommon/llpreprocessor.h
+++ b/indra/llcommon/llpreprocessor.h
@@ -171,7 +171,9 @@
#define LL_DLLIMPORT
#endif // LL_WINDOWS
-#if ! defined(LL_WINDOWS)
+#if __clang__ || ! defined(LL_WINDOWS)
+// Only on Windows, and only with the Microsoft compiler (vs. clang) is
+// wchar_t potentially not a distinct type.
#define LL_WCHAR_T_NATIVE 1
#else // LL_WINDOWS
// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
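A sketch of what the revised condition means in practice (the /Zc:wchar_t- case relies on Microsoft's documented typedef of wchar_t as unsigned short):

    #include "llpreprocessor.h"
    #include <type_traits>

    #if LL_WCHAR_T_NATIVE
    // non-Windows, clang on Windows, or MSVC with /Zc:wchar_t:
    // wchar_t is a distinct type, so overloading on it is safe.
    static_assert(!std::is_same<wchar_t, unsigned short>::value,
                  "wchar_t is its own type");
    #else
    // MSVC with /Zc:wchar_t-: wchar_t collapses to unsigned short (U16).
    static_assert(std::is_same<wchar_t, unsigned short>::value,
                  "wchar_t is a typedef for unsigned short");
    #endif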
diff --git a/indra/llcommon/llstring.h b/indra/llcommon/llstring.h
index 4263122f36..89e95ef40a 100644
--- a/indra/llcommon/llstring.h
+++ b/indra/llcommon/llstring.h
@@ -535,6 +535,11 @@ struct ll_convert_impl<TO, FROM> \
TO operator()(const FROM& in) const { return EXPR; } \
}
+// If all we're doing is copying characters, pass this as EXPR. Since it
+// expands into the 'return EXPR' slot in the ll_convert_impl specialization
+// above, it implies TO{ in.begin(), in.end() }.
+#define LL_CONVERT_COPY_CHARS { in.begin(), in.end() }
+
// Make the incoming string a utf8 string. Replaces any unknown glyph
// with the UNKNOWN_CHARACTER. Once any unknown glyph is found, the rest
// of the data may not be recovered.
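Schematically, passing LL_CONVERT_COPY_CHARS as EXPR turns the specialization into a plain iterator-range copy (a sketch, modulo the macro's exact boilerplate):

    // ll_convert_alias(llutf16string, std::wstring, LL_CONVERT_COPY_CHARS)
    // expands, in effect, to:
    template<>
    struct ll_convert_impl<llutf16string, std::wstring>
    {
        llutf16string operator()(const std::wstring& in) const
        {
            // list-initializes the target from the source's iterator range,
            // i.e. llutf16string{ in.begin(), in.end() }
            return { in.begin(), in.end() };
        }
    };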
@@ -571,30 +576,31 @@ LL_COMMON_API std::string rawstr_to_utf8(const std::string& raw);
// LL_WCHAR_T_NATIVE.
typedef std::basic_string<U16> llutf16string;
-#if ! defined(LL_WCHAR_T_NATIVE)
-// wchar_t is identical to U16, and std::wstring is identical to llutf16string.
-// Defining an ll_convert alias involving llutf16string would collide with the
-// comparable preferred alias involving std::wstring. (In this scenario, if
-// you pass llutf16string, it will engage the std::wstring specialization.)
-#define ll_convert_u16_alias(TO, FROM, EXPR) // nothing
-#else // defined(LL_WCHAR_T_NATIVE)
-// wchar_t is a distinct native type, so llutf16string is also a distinct
-// type, and there IS a point to converting separately to/from llutf16string.
-// (But why? Windows APIs are still defined in terms of wchar_t, and
-// in this scenario llutf16string won't work for them!)
-#define ll_convert_u16_alias(TO, FROM, EXPR) ll_convert_alias(TO, FROM, EXPR)
-
-#if LL_WINDOWS
-// LL_WCHAR_T_NATIVE is defined on non-Windows systems because, in fact,
-// wchar_t is native. Everywhere but Windows, we use it for llwchar (see
-// stdtypes.h). That makes LLWString identical to std::wstring, so these
-// aliases for std::wstring would collide with those for LLWString. Only
-// define on Windows, where converting between std::wstring and llutf16string
-// means copying chars.
-ll_convert_alias(llutf16string, std::wstring, llutf16string(in.begin(), in.end()));
-ll_convert_alias(std::wstring, llutf16string, std::wstring(in.begin(), in.end()));
-#endif // LL_WINDOWS
-#endif // defined(LL_WCHAR_T_NATIVE)
+// Considering wchar_t, llwchar and U16, there are three relevant cases:
+#if LLWCHAR_IS_WCHAR_T // every which way but Windows
+// llwchar is identical to wchar_t, LLWString is identical to std::wstring.
+// U16 is distinct, llutf16string is distinct (though pretty useless).
+// Given conversions to/from LLWString and to/from llutf16string, conversions
+// involving std::wstring would collide.
+#define ll_convert_wstr_alias(TO, FROM, EXPR) // nothing
+// but we can define conversions involving llutf16string without collisions
+#define ll_convert_u16_alias(TO, FROM, EXPR) ll_convert_alias(TO, FROM, EXPR)
+
+#elif defined(LL_WCHAR_T_NATIVE) // Windows, either clang or MS /Zc:wchar_t
+// llwchar (32-bit), wchar_t (16-bit) and U16 are all different types.
+// Conversions to/from LLWString, to/from std::wstring and to/from llutf16string
+// can all be defined.
+#define ll_convert_wstr_alias(TO, FROM, EXPR) ll_convert_alias(TO, FROM, EXPR)
+#define ll_convert_u16_alias(TO, FROM, EXPR) ll_convert_alias(TO, FROM, EXPR)
+
+#else // ! LL_WCHAR_T_NATIVE: Windows with MS /Zc:wchar_t-
+// wchar_t is identical to U16, std::wstring is identical to llutf16string.
+// Given conversions to/from LLWString and to/from std::wstring, conversions
+// involving llutf16string would collide.
+#define ll_convert_u16_alias(TO, FROM, EXPR) // nothing
+// but we can define conversions involving std::wstring without collisions
+#define ll_convert_wstr_alias(TO, FROM, EXPR) ll_convert_alias(TO, FROM, EXPR)
+#endif
LL_COMMON_API LLWString utf16str_to_wstring(const llutf16string &utf16str, S32 len);
LL_COMMON_API LLWString utf16str_to_wstring(const llutf16string &utf16str);
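To see why the unused alias family must compile away, consider (illustrative only) a build where LLWCHAR_IS_WCHAR_T holds:

    #include "stdtypes.h"
    #include <string>
    #include <type_traits>

    #if LLWCHAR_IS_WCHAR_T
    // LLWString (std::basic_string<llwchar>) and std::wstring are one type,
    // so an ll_convert_impl specialization keyed on each name would be the
    // same explicit specialization defined twice: a compile error.
    static_assert(std::is_same<std::basic_string<llwchar>, std::wstring>::value,
                  "LLWString and std::wstring are identical here");
    #endif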
@@ -625,9 +631,8 @@ LL_COMMON_API std::string utf16str_to_utf8str(const llutf16string &utf16str, S32
LL_COMMON_API std::string utf16str_to_utf8str(const llutf16string &utf16str);
ll_convert_u16_alias(std::string, llutf16string, utf16str_to_utf8str(in));
-#if LL_WINDOWS
+// an older alias for utf16str_to_utf8str(llutf16string)
inline std::string wstring_to_utf8str(const llutf16string &utf16str) { return utf16str_to_utf8str(utf16str);}
-#endif
// Length of this UTF32 string in bytes when transformed to UTF8
LL_COMMON_API S32 wstring_utf8_length(const LLWString& wstr);
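With the #if LL_WINDOWS guard gone, the legacy spelling works on every platform (sketch):

    #include "llstring.h"
    #include <string>

    llutf16string u16(3, U16('a'));                // "aaa" as UTF-16 code units
    std::string legacy = wstring_to_utf8str(u16);  // older alias, now unconditional
    std::string direct = utf16str_to_utf8str(u16); // same result, preferred name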
@@ -715,7 +720,7 @@ inline std::string ll_convert_wide_to_string(const std::wstring& in)
{
return ll_convert_wide_to_string(in.c_str());
}
-ll_convert_alias(std::string, std::wstring, ll_convert_wide_to_string(in));
+ll_convert_wstr_alias(std::string, std::wstring, ll_convert_wide_to_string(in));
/**
* Converts a string to wide string.
@@ -724,19 +729,19 @@ LL_COMMON_API std::wstring ll_convert_string_to_wide(const std::string& in,
unsigned int code_page);
LL_COMMON_API std::wstring ll_convert_string_to_wide(const std::string& in);
// default CP_UTF8
-ll_convert_alias(std::wstring, std::string, ll_convert_string_to_wide(in));
+ll_convert_wstr_alias(std::wstring, std::string, ll_convert_string_to_wide(in));
/**
* Convert a Windows wide string to our LLWString
*/
LL_COMMON_API LLWString ll_convert_wide_to_wstring(const std::wstring& in);
-ll_convert_alias(LLWString, std::wstring, ll_convert_wide_to_wstring(in));
+ll_convert_wstr_alias(LLWString, std::wstring, ll_convert_wide_to_wstring(in));
/**
* Convert LLWString to Windows wide string
*/
LL_COMMON_API std::wstring ll_convert_wstring_to_wide(const LLWString& in);
-ll_convert_alias(std::wstring, LLWString, ll_convert_wstring_to_wide(in));
+ll_convert_wstr_alias(std::wstring, LLWString, ll_convert_wstring_to_wide(in));
/**
* Converts incoming string into utf8 string
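On a Windows build where llwchar, wchar_t and U16 are all distinct, the newly aliased conversions let the uniform wrapper cover each pairing (sketch):

    #include "llstring.h"
    #include <string>

    std::string  utf8 = "example";
    std::wstring wide = ll_convert<std::wstring>(utf8); // ll_convert_string_to_wide(), CP_UTF8
    LLWString    wstr = ll_convert<LLWString>(wide);    // ll_convert_wide_to_wstring()
    std::wstring back = ll_convert<std::wstring>(wstr); // ll_convert_wstring_to_wide()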
diff --git a/indra/llcommon/stdtypes.h b/indra/llcommon/stdtypes.h
index 887f6ab733..b07805b628 100644
--- a/indra/llcommon/stdtypes.h
+++ b/indra/llcommon/stdtypes.h
@@ -42,10 +42,17 @@ typedef unsigned int U32;
// Windows wchar_t is 16-bit, whichever way /Zc:wchar_t is set. In effect,
// Windows wchar_t is always a typedef, either for unsigned short or __wchar_t.
// (__wchar_t, available either way, is Microsoft's native 2-byte wchar_t type.)
+// The version of clang available with VS 2019 also defines wchar_t as __wchar_t,
+// which is also 16 bits.
// In any case, llwchar should be a UTF-32 type.
typedef U32 llwchar;
#else
typedef wchar_t llwchar;
+// What we'd actually want is a simple module-scope 'if constexpr' to test
+// std::is_same<wchar_t, llwchar>::value and use that to define, or not
+// define, string conversion specializations. Since we don't have that, we'll
+// have to rely on #if instead. Sorry, Dr. Stroustrup.
+#define LLWCHAR_IS_WCHAR_T 1
#endif
#if LL_WINDOWS
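The new symbol gives the preprocessor the same information that a module-scope 'if constexpr' on std::is_same would express (sketch):

    #include "stdtypes.h"
    #include <type_traits>

    #if LLWCHAR_IS_WCHAR_T
    static_assert(std::is_same<llwchar, wchar_t>::value,
                  "everywhere but Windows, llwchar is wchar_t");
    #else
    static_assert(std::is_same<llwchar, U32>::value,
                  "on Windows, llwchar is a distinct 32-bit type");
    #endif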
diff --git a/indra/llcommon/stringize.h b/indra/llcommon/stringize.h
index 38dd198ad3..31a114f167 100644
--- a/indra/llcommon/stringize.h
+++ b/indra/llcommon/stringize.h
@@ -52,7 +52,7 @@ std::basic_string<CHARTYPE> gstringize(const T& item)
*/
inline std::string stringize(const std::wstring& item)
{
- return wstring_to_utf8str(item);
+ return ll_convert<std::string>(item);
}
/**
@@ -72,8 +72,7 @@ inline std::wstring wstringize(const std::string& item)
{
// utf8str_to_wstring() returns LLWString, which isn't necessarily the
// same as std::wstring
- LLWString s(utf8str_to_wstring(item));
- return std::wstring(s.begin(), s.end());
+ return ll_convert<std::wstring>(item);
}
/**
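Call sites are unchanged; only the conversion plumbing moved (sketch):

    #include "stringize.h"
    #include <string>

    std::string  utf8 = "42 towers";
    std::wstring wide = wstringize(utf8); // now ll_convert<std::wstring>(utf8)
    std::string  back = stringize(wide);  // now ll_convert<std::string>(wide)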
@@ -146,11 +145,9 @@ void destringize_f(std::basic_string<CHARTYPE> const & str, Functor const & f)
* std::istringstream in(str);
* in >> item1 >> item2 >> item3 ... ;
* @endcode
- * @NOTE - once we get generic lambdas, we shouldn't need DEWSTRINGIZE() any
- * more since DESTRINGIZE() should do the right thing with a std::wstring. But
- * until then, the lambda we pass must accept the right std::basic_istream.
*/
-#define DESTRINGIZE(STR, EXPRESSION) (destringize_f((STR), [&](std::istream& in){in >> EXPRESSION;}))
-#define DEWSTRINGIZE(STR, EXPRESSION) (destringize_f((STR), [&](std::wistream& in){in >> EXPRESSION;}))
+#define DESTRINGIZE(STR, EXPRESSION) (destringize_f((STR), [&](auto& in){in >> EXPRESSION;}))
+// legacy name, just use DESTRINGIZE() going forward
+#define DEWSTRINGIZE(STR, EXPRESSION) DESTRINGIZE(STR, EXPRESSION)
#endif /* ! defined(LL_STRINGIZE_H) */
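With the generic lambda, one macro now serves both stream widths, and the old wide spelling simply forwards (sketch):

    #include "stringize.h"
    #include <string>

    int main()
    {
        int n = 0;
        std::string word;
        DESTRINGIZE(std::string("17 llamas"), n >> word);      // in is a narrow istream

        int wn = 0;
        std::wstring wword;
        DESTRINGIZE(std::wstring(L"17 llamas"), wn >> wword);  // in is a wide istream
    }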