/**
 * @file llstring.cpp
 * @brief String utility functions and the std::string class.
 *
 * $LicenseInfo:firstyear=2001&license=viewerlgpl$
 * Second Life Viewer Source Code
 * Copyright (C) 2010, Linden Research, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * version 2.1 of the License only.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
 * $/LicenseInfo$
 */

#include "linden_common.h"

#include "llstring.h"
#include "llerror.h"
#include "llfasttimer.h"
#include "llsd.h"
// NOTE(review): the two system header names below were lost when this file was
// extracted; they are restored from usage in this file (ublock_getCode() /
// UBLOCK_* and std::vector) -- confirm against version control.
#include <unicode/uchar.h>
#include <vector>

#if LL_WINDOWS
#include "llwin32headerslean.h"
// NOTE(review): header name restored from the trailing comment -- confirm.
#include <winnls.h> // for WideCharToMultiByte
#endif

// Build a std::string from a possibly-null C string; null yields "".
std::string ll_safe_string(const char* in)
{
    if (in)
    {
        return std::string(in);
    }
    return std::string();
}

// Build a std::string from exactly maxlen bytes at 'in'.
// A null pointer or a non-positive maxlen yields "".
std::string ll_safe_string(const char* in, S32 maxlen)
{
    if (in && maxlen > 0)
    {
        return std::string(in, maxlen);
    }
    return std::string();
}

// True if 'hex' is one of 0-9, a-f or A-F.
bool is_char_hex(char hex)
{
    if ((hex >= '0') && (hex <= '9'))
    {
        return true;
    }
    else if ((hex >= 'a') && (hex <= 'f'))
    {
        return true;
    }
    else if ((hex >= 'A') && (hex <= 'F'))
    {
        return true;
    }
    return false; // uh - oh, not hex any more...
}

// Numeric value (0-15) of a hex digit; any non-hex character maps to 0.
U8 hex_as_nybble(char hex)
{
    if ((hex >= '0') && (hex <= '9'))
    {
        return (U8)(hex - '0');
    }
    else if ((hex >= 'a') && (hex <= 'f'))
    {
        return (U8)(10 + hex - 'a');
    }
    else if ((hex >= 'A') && (hex <= 'F'))
    {
        return (U8)(10 + hex - 'A');
    }
    return 0; // uh - oh, not hex any more...
}

// True if 'elem' lies in one of three hard-coded code point ranges:
// 0x2E80-0x9FFF, 0xAC00-0xD7A0 or 0xF900-0xFA60.
bool iswindividual(llwchar elem)
{
    U32 cur_char = (U32)elem;
    bool result = false;
    if (0x2E80 <= cur_char && cur_char <= 0x9FFF)
    {
        result = true;
    }
    else if (0xAC00 <= cur_char && cur_char <= 0xD7A0)
    {
        result = true;
    }
    else if (0xF900 <= cur_char && cur_char <= 0xFA60)
    {
        result = true;
    }

    return result;
}

// Read the whole of 'filename' (opened in binary mode) into 'str'.
// Returns false, logs, and leaves 'str' untouched if the file cannot be opened.
bool _read_file_into_string(std::string& str, const std::string& filename)
{
    llifstream ifs(filename.c_str(), llifstream::binary);
    if (!ifs.is_open())
    {
        LL_INFOS() << "Unable to open file " << filename << LL_ENDL;
        return false;
    }

    std::ostringstream oss;

    oss << ifs.rdbuf();
    str = oss.str();
    ifs.close();
    return true;
}


// See http://www.unicode.org/Public/BETA/CVTUTF-1-2/ConvertUTF.c
// for the Unicode implementation - this doesn't match because it was written before finding
// it.


// Stream a wide string as UTF-8.
std::ostream& operator<<(std::ostream &s, const LLWString &wstr)
{
    std::string utf8_str = wstring_to_utf8str(wstr);
    s << utf8_str;
    return s;
}

// Round-trip 'raw' through UTF-32 and back to UTF-8.
std::string rawstr_to_utf8(const std::string& raw)
{
    LLWString wstr(utf8str_to_wstring(raw));
    return wstring_to_utf8str(wstr);
}

// Encode one code point into 'outchars' and return the number of bytes
// written (1 to 6). No terminator is written; the caller supplies a buffer of
// at least 6 bytes. Values >= 0x80000000 are logged and replaced by
// LL_UNKNOWN_CHAR.
std::ptrdiff_t wchar_to_utf8chars(llwchar in_char, char* outchars)
{
    U32 cur_char = (U32)in_char;
    char* base = outchars;
    if (cur_char < 0x80)
    {
        *outchars++ = (U8)cur_char;
    }
    else if (cur_char < 0x800)
    {
        *outchars++ = 0xC0 | (cur_char >> 6);
        *outchars++ = 0x80 | (cur_char & 0x3F);
    }
    else if (cur_char < 0x10000)
    {
        *outchars++ = 0xE0 | (cur_char >> 12);
        *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
        *outchars++ = 0x80 | (cur_char & 0x3F);
    }
    else if (cur_char < 0x200000)
    {
        *outchars++ = 0xF0 | (cur_char >> 18);
        *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
        *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
        *outchars++ = 0x80 | (cur_char & 0x3F);
    }
    else if (cur_char < 0x4000000)
    {
        *outchars++ = 0xF8 | (cur_char >> 24);
        *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
        *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
        *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
        *outchars++ = 0x80 | (cur_char & 0x3F);
    }
    else if (cur_char < 0x80000000)
    {
        *outchars++ = 0xFC | (cur_char >> 30);
        *outchars++ = 0x80 | ((cur_char >> 24) & 0x3F);
        *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
        *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
        *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
        *outchars++ = 0x80 | (cur_char & 0x3F);
    }
    else
    {
        LL_WARNS() << "Invalid Unicode character " << cur_char << "!" << LL_ENDL;
        *outchars++ = LL_UNKNOWN_CHAR;
    }
    return outchars - base;
}

// Decode one code point from 'inchars' into '*outchar' and return the number
// of U16 units consumed (1 or 2).
// When the first unit is in 0xD800-0xDFFF a second unit is ALWAYS read and
// combined without validation, so the caller must guarantee that a second
// unit is readable (see utf16str_to_wstring()).
auto utf16chars_to_wchar(const U16* inchars, llwchar* outchar)
{
    const U16* base = inchars;
    U16 cur_char = *inchars++;
    llwchar char32 = cur_char;
    if ((cur_char >= 0xD800) && (cur_char <= 0xDFFF))
    {
        // Surrogates
        char32 = ((llwchar)(cur_char - 0xD800)) << 10;
        cur_char = *inchars++;
        char32 += (llwchar)(cur_char - 0xDC00) + 0x0010000UL;
    }
    else
    {
        char32 = (llwchar)cur_char;
    }
    *outchar = char32;
    return inchars - base;
}

// Convert 'len' UTF-32 code points to UTF-16. Code points above 0xFFFF are
// written as two units (surrogate pair); everything else as a single unit.
llutf16string wstring_to_utf16str(const llwchar* utf32str, size_t len)
{
    llutf16string out;

    // size_t index: 'len' is unsigned, avoid a signed/unsigned comparison
    for (size_t i = 0; i < len; ++i)
    {
        U32 cur_char = utf32str[i];
        if (cur_char > 0xFFFF)
        {
            // 0xD7C0 == 0xD800 - (0x10000 >> 10)
            out += (0xD7C0 + (cur_char >> 10));
            out += (0xDC00 | (cur_char & 0x3FF));
        }
        else
        {
            out += cur_char;
        }
    }
    return out;
}

// Convert UTF-8 to UTF-16 by way of UTF-32.
llutf16string utf8str_to_utf16str( const char* utf8str, size_t len )
{
    LLWString wstr = utf8str_to_wstring ( utf8str, len );
    return wstring_to_utf16str ( wstr );
}

// Convert 'len' UTF-16 units to UTF-32.
// A surrogate unit in the very last position has no partner inside the
// buffer; it is replaced by LL_UNKNOWN_CHAR instead of letting
// utf16chars_to_wchar() read one unit past the end of the input.
LLWString utf16str_to_wstring(const U16* utf16str, size_t len)
{
    LLWString wout;
    if (len == 0) return wout;

    size_t i = 0;
    while (i < len)
    {
        const U16 unit = utf16str[i];
        const bool is_surrogate = (unit >= 0xD800) && (unit <= 0xDFFF);
        if (is_surrogate && (i + 1 >= len))
        {
            // Truncated pair at end of input: nothing more to decode.
            wout += LL_UNKNOWN_CHAR;
            break;
        }

        llwchar cur_char;
        i += utf16chars_to_wchar(utf16str + i, &cur_char);
        wout += cur_char;
    }
    return wout;
}

// Length in llwchar (UTF-32) of the first len units (16 bits) of the given UTF-16 string.
S32 utf16str_wstring_length(const llutf16string &utf16str, const S32 utf16_len)
{
    S32 surrogate_pairs = 0;
    // ... craziness to make gcc happy (llutf16string.c_str() is tweaked on linux):
    const U16 *const utf16_chars = &(*(utf16str.begin()));
    S32 i = 0;
    while (i < utf16_len)
    {
        const U16 c = utf16_chars[i++];
        if (c >= 0xD800 && c <= 0xDBFF)     // See http://en.wikipedia.org/wiki/UTF-16
        {   // Have first unit of a surrogate pair
            if (i >= utf16_len)
            {
                break;
            }
            const U16 d = utf16_chars[i];
            if (d >= 0xDC00 && d <= 0xDFFF)
            {   // Have valid second unit of a surrogate pair
                surrogate_pairs++;
                i++;
            }
        }
    }
    return utf16_len - surrogate_pairs;
}

// Length in utf16string (UTF-16) of wlen wchars beginning at woffset.
S32 wstring_utf16_length(const LLWString &wstr, const S32 woffset, const S32 wlen)
{
    const S32 end = llmin((S32)wstr.length(), woffset + wlen);
    if (end < woffset)
    {
        return 0;
    }
    else
    {
        S32 length = end - woffset;
        for (S32 i = woffset; i < end; i++)
        {
            if (wstr[i] >= 0x10000)
            {
                // needs a surrogate pair: one extra UTF-16 unit
                length++;
            }
        }
        return length;
    }
}

// Given a wstring and an offset in it, returns the length as wstring (i.e.,
// number of llwchars) of the longest substring that starts at the offset
// and whose equivalent utf-16 string does not exceed the given utf16_length.
S32 wstring_wstring_length_from_utf16_length(const LLWString & wstr, const S32 woffset, const S32 utf16_length, BOOL *unaligned) { const auto end = wstr.length(); BOOL u = FALSE; S32 n = woffset + utf16_length; S32 i = woffset; while (i < end) { if (wstr[i] >= 0x10000) { --n; } if (i >= n) { u = (i > n); break; } i++; } if (unaligned) { *unaligned = u; } return i - woffset; } S32 wchar_utf8_length(const llwchar wc) { if (wc < 0x80) { return 1; } else if (wc < 0x800) { return 2; } else if (wc < 0x10000) { return 3; } else if (wc < 0x200000) { return 4; } else if (wc < 0x4000000) { return 5; } else { return 6; } } std::string wchar_utf8_preview(const llwchar wc) { std::ostringstream oss; oss << std::hex << std::uppercase << (U32)wc; U8 out_bytes[8]; U32 size = (U32)wchar_to_utf8chars(wc, (char*)out_bytes); if (size > 1) { oss << " ["; for (U32 i = 0; i < size; ++i) { if (i) { oss << ", "; } oss << (int)out_bytes[i]; } oss << "]"; } return oss.str(); } S32 wstring_utf8_length(const LLWString& wstr) { S32 len = 0; for (S32 i = 0; i < (S32)wstr.length(); i++) { len += wchar_utf8_length(wstr[i]); } return len; } LLWString utf8str_to_wstring(const char* utf8str, size_t len) { LLWString wout; S32 i = 0; while (i < len) { llwchar unichar; U8 cur_char = utf8str[i]; if (cur_char < 0x80) { // Ascii character, just add it unichar = cur_char; } else { S32 cont_bytes = 0; if ((cur_char >> 5) == 0x6) // Two byte UTF8 -> 1 UTF32 { unichar = (0x1F&cur_char); cont_bytes = 1; } else if ((cur_char >> 4) == 0xe) // Three byte UTF8 -> 1 UTF32 { unichar = (0x0F&cur_char); cont_bytes = 2; } else if ((cur_char >> 3) == 0x1e) // Four byte UTF8 -> 1 UTF32 { unichar = (0x07&cur_char); cont_bytes = 3; } else if ((cur_char >> 2) == 0x3e) // Five byte UTF8 -> 1 UTF32 { unichar = (0x03&cur_char); cont_bytes = 4; } else if ((cur_char >> 1) == 0x7e) // Six byte UTF8 -> 1 UTF32 { unichar = (0x01&cur_char); cont_bytes = 5; } else { wout += LL_UNKNOWN_CHAR; ++i; continue; } // Check that this character 
doesn't go past the end of the string auto end = (len < (i + cont_bytes)) ? len : (i + cont_bytes); do { ++i; cur_char = utf8str[i]; if ( (cur_char >> 6) == 0x2 ) { unichar <<= 6; unichar += (0x3F&cur_char); } else { // Malformed sequence - roll back to look at this as a new char unichar = LL_UNKNOWN_CHAR; --i; break; } } while(i < end); // Handle overlong characters and NULL characters if ( ((cont_bytes == 1) && (unichar < 0x80)) || ((cont_bytes == 2) && (unichar < 0x800)) || ((cont_bytes == 3) && (unichar < 0x10000)) || ((cont_bytes == 4) && (unichar < 0x200000)) || ((cont_bytes == 5) && (unichar < 0x4000000)) ) { unichar = LL_UNKNOWN_CHAR; } } wout += unichar; ++i; } return wout; } std::string wstring_to_utf8str(const llwchar* utf32str, size_t len) { std::string out; S32 i = 0; while (i < len) { char tchars[8]; /* Flawfinder: ignore */ auto n = wchar_to_utf8chars(utf32str[i], tchars); tchars[n] = 0; out += tchars; i++; } return out; } std::string utf16str_to_utf8str(const U16* utf16str, size_t len) { return wstring_to_utf8str(utf16str_to_wstring(utf16str, len)); } std::string utf8str_trim(const std::string& utf8str) { LLWString wstr = utf8str_to_wstring(utf8str); LLWStringUtil::trim(wstr); return wstring_to_utf8str(wstr); } std::string utf8str_tolower(const std::string& utf8str) { LLWString out_str = utf8str_to_wstring(utf8str); LLWStringUtil::toLower(out_str); return wstring_to_utf8str(out_str); } S32 utf8str_compare_insensitive(const std::string& lhs, const std::string& rhs) { LLWString wlhs = utf8str_to_wstring(lhs); LLWString wrhs = utf8str_to_wstring(rhs); return LLWStringUtil::compareInsensitive(wlhs, wrhs); } std::string utf8str_truncate(const std::string& utf8str, const S32 max_len) { if (0 == max_len) { return std::string(); } if ((S32)utf8str.length() <= max_len) { return utf8str; } else { S32 cur_char = max_len; // If we're ASCII, we don't need to do anything if ((U8)utf8str[cur_char] > 0x7f) { // If first two bits are (10), it's the tail end of a 
multibyte char. We need to shift back // to the first character while (0x80 == (0xc0 & utf8str[cur_char])) { cur_char--; // Keep moving forward until we hit the first char; if (cur_char == 0) { // Make sure we don't trash memory if we've got a bogus string. break; } } } // The byte index we're on is one we want to get rid of, so we only want to copy up to (cur_char-1) chars return utf8str.substr(0, cur_char); } } std::string utf8str_symbol_truncate(const std::string& utf8str, const S32 symbol_len) { if (0 == symbol_len) { return std::string(); } if ((S32)utf8str.length() <= symbol_len) { return utf8str; } else { int len = 0, byteIndex = 0; const char* aStr = utf8str.c_str(); size_t origSize = utf8str.size(); for (byteIndex = 0; len < symbol_len && byteIndex < origSize; byteIndex++) { if ((aStr[byteIndex] & 0xc0) != 0x80) { len += 1; } } return utf8str.substr(0, byteIndex); } } std::string utf8str_substChar( const std::string& utf8str, const llwchar target_char, const llwchar replace_char) { LLWString wstr = utf8str_to_wstring(utf8str); LLWStringUtil::replaceChar(wstr, target_char, replace_char); //wstr = wstring_substChar(wstr, target_char, replace_char); return wstring_to_utf8str(wstr); } std::string utf8str_makeASCII(const std::string& utf8str) { LLWString wstr = utf8str_to_wstring(utf8str); LLWStringUtil::_makeASCII(wstr); return wstring_to_utf8str(wstr); } std::string mbcsstring_makeASCII(const std::string& wstr) { // Replace non-ASCII chars with replace_char std::string out_str = wstr; for (S32 i = 0; i < (S32)out_str.length(); i++) { if ((U8)out_str[i] > 0x7f) { out_str[i] = LL_UNKNOWN_CHAR; } } return out_str; } std::string utf8str_removeCRLF(const std::string& utf8str) { if (0 == utf8str.length()) { return std::string(); } const char CR = 13; std::string out; out.reserve(utf8str.length()); const S32 len = (S32)utf8str.length(); for( S32 i = 0; i < len; i++ ) { if( utf8str[i] != CR ) { out.push_back(utf8str[i]); } } return out; } llwchar 
utf8str_to_wchar(const std::string& utf8str, size_t offset, size_t length) { switch (length) { case 2: return ((utf8str[offset] & 0x1F) << 6) + (utf8str[offset + 1] & 0x3F); case 3: return ((utf8str[offset] & 0x0F) << 12) + ((utf8str[offset + 1] & 0x3F) << 6) + (utf8str[offset + 2] & 0x3F); case 4: return ((utf8str[offset] & 0x07) << 18) + ((utf8str[offset + 1] & 0x3F) << 12) + ((utf8str[offset + 2] & 0x3F) << 6) + (utf8str[offset + 3] & 0x3F); case 5: return ((utf8str[offset] & 0x03) << 24) + ((utf8str[offset + 1] & 0x3F) << 18) + ((utf8str[offset + 2] & 0x3F) << 12) + ((utf8str[offset + 3] & 0x3F) << 6) + (utf8str[offset + 4] & 0x3F); case 6: return ((utf8str[offset] & 0x01) << 30) + ((utf8str[offset + 1] & 0x3F) << 24) + ((utf8str[offset + 2] & 0x3F) << 18) + ((utf8str[offset + 3] & 0x3F) << 12) + ((utf8str[offset + 4] & 0x3F) << 6) + (utf8str[offset + 5] & 0x3F); case 7: return ((utf8str[offset + 1] & 0x03) << 30) + ((utf8str[offset + 2] & 0x3F) << 24) + ((utf8str[offset + 3] & 0x3F) << 18) + ((utf8str[offset + 4] & 0x3F) << 12) + ((utf8str[offset + 5] & 0x3F) << 6) + (utf8str[offset + 6] & 0x3F); } return LL_UNKNOWN_CHAR; } std::string utf8str_showBytesUTF8(const std::string& utf8str) { std::string result; bool in_sequence = false; size_t sequence_size = 0; size_t byte_index = 0; size_t source_length = utf8str.size(); auto open_sequence = [&]() { if (!result.empty() && result.back() != '\n') result += '\n'; // Use LF as a separator before new UTF-8 sequence result += '['; in_sequence = true; }; auto close_sequence = [&]() { llwchar unicode = utf8str_to_wchar(utf8str, byte_index - sequence_size, sequence_size); if (unicode != LL_UNKNOWN_CHAR) { result += llformat("+%04X", unicode); } result += ']'; in_sequence = false; sequence_size = 0; }; while (byte_index < source_length) { U8 byte = utf8str[byte_index]; if (byte >= 0x80) // Part of an UTF-8 sequence { if (!in_sequence) // Start new UTF-8 sequence { open_sequence(); } else if (byte >= 0xC0) // Start another 
UTF-8 sequence { close_sequence(); open_sequence(); } else // Continue the same UTF-8 sequence { result += '.'; } result += llformat("%02X", byte); // The byte is represented in hexadecimal form ++sequence_size; } else // ASCII symbol is represented as a character { if (in_sequence) // End of UTF-8 sequence { close_sequence(); if (byte != '\n') { result += '\n'; // Use LF as a separator between UTF-8 and ASCII } } result += byte; } ++byte_index; } if (in_sequence) // End of UTF-8 sequence { close_sequence(); } return result; } // Search for any emoji symbol, return true if found bool wstring_has_emoji(const LLWString& wstr) { for (const llwchar& wch : wstr) { if (LLStringOps::isEmoji(wch)) return true; } return false; } // Cut emoji symbols if exist bool wstring_remove_emojis(LLWString& wstr) { bool found = false; for (size_t i = 0; i < wstr.size(); ++i) { if (LLStringOps::isEmoji(wstr[i])) { wstr.erase(i--, 1); found = true; } } return found; } // Cut emoji symbols if exist bool utf8str_remove_emojis(std::string& utf8str) { LLWString wstr = utf8str_to_wstring(utf8str); if (!wstring_remove_emojis(wstr)) return false; utf8str = wstring_to_utf8str(wstr); return true; } #if LL_WINDOWS unsigned int ll_wstring_default_code_page() { return CP_UTF8; } std::string ll_convert_wide_to_string(const wchar_t* in, size_t len_in, unsigned int code_page) { std::string out; if(in) { int len_out = WideCharToMultiByte( code_page, 0, in, len_in, NULL, 0, 0, 0); // We will need two more bytes for the double NULL ending // created in WideCharToMultiByte(). 
char* pout = new char [len_out + 2]; memset(pout, 0, len_out + 2); if(pout) { WideCharToMultiByte( code_page, 0, in, len_in, pout, len_out, 0, 0); out.assign(pout); delete[] pout; } } return out; } std::wstring ll_convert_string_to_wide(const char* in, size_t len, unsigned int code_page) { // From review: // We can preallocate a wide char buffer that is the same length (in wchar_t elements) as the utf8 input, // plus one for a null terminator, and be guaranteed to not overflow. // Normally, I'd call that sort of thing premature optimization, // but we *are* seeing string operations taking a bunch of time, especially when constructing widgets. // int output_str_len = MultiByteToWideChar(code_page, 0, in.c_str(), in.length(), NULL, 0); // reserve an output buffer that will be destroyed on exit, with a place // to put NULL terminator std::vector w_out(len + 1); memset(&w_out[0], 0, w_out.size()); int real_output_str_len = MultiByteToWideChar(code_page, 0, in, len, &w_out[0], w_out.size() - 1); //looks like MultiByteToWideChar didn't add null terminator to converted string, see EXT-4858. w_out[real_output_str_len] = 0; // construct string from our temporary output buffer return {&w_out[0]}; } LLWString ll_convert_wide_to_wstring(const wchar_t* in, size_t len) { // Whether or not std::wstring and llutf16string are distinct types, they // both hold UTF-16LE characters. (See header file comments.) Pretend this // wchar_t* sequence is really a U16* sequence and use the conversion we // define above. return utf16str_to_wstring(reinterpret_cast(in), len); } std::wstring ll_convert_wstring_to_wide(const llwchar* in, size_t len) { // first, convert to llutf16string, for which we have a real implementation auto utf16str{ wstring_to_utf16str(in, len) }; // then, because each U16 char must be UTF-16LE encoded, pretend the U16* // string pointer is a wchar_t* and instantiate a std::wstring of the same // length. 
return { reinterpret_cast(utf16str.c_str()), utf16str.length() }; } std::string ll_convert_string_to_utf8_string(const std::string& in) { // If you pass code_page, you must also pass length, otherwise the code // page parameter will be mistaken for length. auto w_mesg = ll_convert_string_to_wide(in, in.length(), CP_ACP); // CP_UTF8 is default -- see ll_wstring_default_code_page() above. return ll_convert_wide_to_string(w_mesg); } namespace { void HeapFree_deleter(void* ptr) { // instead of LocalFree(), per https://stackoverflow.com/a/31541205 HeapFree(GetProcessHeap(), NULL, ptr); } } // anonymous namespace template<> std::wstring windows_message(DWORD error) { // derived from https://stackoverflow.com/a/455533 wchar_t* rawptr = nullptr; auto okay = FormatMessageW( // use system message tables for GetLastError() codes FORMAT_MESSAGE_FROM_SYSTEM | // internally allocate buffer and return its pointer FORMAT_MESSAGE_ALLOCATE_BUFFER | // you cannot pass insertion parameters (thanks Gandalf) FORMAT_MESSAGE_IGNORE_INSERTS | // ignore line breaks in message definition text FORMAT_MESSAGE_MAX_WIDTH_MASK, NULL, // lpSource, unused with FORMAT_MESSAGE_FROM_SYSTEM error, // dwMessageId MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // dwLanguageId (LPWSTR)&rawptr, // lpBuffer: force-cast wchar_t** to wchar_t* 0, // nSize, unused with FORMAT_MESSAGE_ALLOCATE_BUFFER NULL); // Arguments, unused // make a unique_ptr from rawptr so it gets cleaned up properly std::unique_ptr bufferptr(rawptr, HeapFree_deleter); if (okay && bufferptr) { // got the message, return it ('okay' is length in characters) return { bufferptr.get(), okay }; } // did not get the message, synthesize one auto format_message_error = GetLastError(); std::wostringstream out; out << L"GetLastError() " << error << L" (FormatMessageW() failed with " << format_message_error << L")"; return out.str(); } boost::optional llstring_getoptenv(const std::string& key) { auto wkey = ll_convert_string_to_wide(key); // Take a wild 
guess as to how big the buffer should be. std::vector buffer(1024); auto n = GetEnvironmentVariableW(wkey.c_str(), &buffer[0], buffer.size()); // If our initial guess was too short, n will indicate the size (in // wchar_t's) that buffer should have been, including the terminating nul. if (n > (buffer.size() - 1)) { // make it big enough buffer.resize(n); // and try again n = GetEnvironmentVariableW(wkey.c_str(), &buffer[0], buffer.size()); } // did that (ultimately) succeed? if (n) { // great, return populated boost::optional return boost::optional(&buffer[0]); } // not successful auto last_error = GetLastError(); // Don't bother warning for NOT_FOUND; that's an expected case if (last_error != ERROR_ENVVAR_NOT_FOUND) { LL_WARNS() << "GetEnvironmentVariableW('" << key << "') failed: " << windows_message(last_error) << LL_ENDL; } // return empty boost::optional return {}; } #else // ! LL_WINDOWS boost::optional llstring_getoptenv(const std::string& key) { auto found = getenv(key.c_str()); if (found) { // return populated boost::optional return boost::optional(found); } else { // return empty boost::optional return {}; } } #endif // ! 
LL_WINDOWS long LLStringOps::sPacificTimeOffset = 0; long LLStringOps::sLocalTimeOffset = 0; bool LLStringOps::sPacificDaylightTime = 0; std::map LLStringOps::datetimeToCodes; std::vector LLStringOps::sWeekDayList; std::vector LLStringOps::sWeekDayShortList; std::vector LLStringOps::sMonthList; std::vector LLStringOps::sMonthShortList; std::string LLStringOps::sDayFormat; std::string LLStringOps::sAM; std::string LLStringOps::sPM; // static bool LLStringOps::isEmoji(llwchar wch) { int ublock = ublock_getCode(wch); switch (ublock) { case UBLOCK_GENERAL_PUNCTUATION: case UBLOCK_LETTERLIKE_SYMBOLS: case UBLOCK_ARROWS: case UBLOCK_MISCELLANEOUS_TECHNICAL: case UBLOCK_ENCLOSED_ALPHANUMERICS: case UBLOCK_GEOMETRIC_SHAPES: case UBLOCK_MISCELLANEOUS_SYMBOLS: case UBLOCK_DINGBATS: case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION: case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS: case UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS: case UBLOCK_EMOTICONS: case UBLOCK_TRANSPORT_AND_MAP_SYMBOLS: #if U_ICU_VERSION_MAJOR_NUM > 56 // Boost uses ICU so we can't update it independently case UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS: #endif // U_ICU_VERSION_MAJOR_NUM > 56 return true; default: #if U_ICU_VERSION_MAJOR_NUM > 56 return false; #else // See https://en.wikipedia.org/wiki/Supplemental_Symbols_and_Pictographs return wch >= 0x1F900 && wch <= 0x1F9FF; #endif // U_ICU_VERSION_MAJOR_NUM > 56 } } S32 LLStringOps::collate(const llwchar* a, const llwchar* b) { #if LL_WINDOWS // in Windows, wide string functions operator on 16-bit strings, // not the proper 32 bit wide string return strcmp(wstring_to_utf8str(LLWString(a)).c_str(), wstring_to_utf8str(LLWString(b)).c_str()); #else return wcscoll(a, b); #endif } void LLStringOps::setupDatetimeInfo (bool daylight) { time_t nowT, localT, gmtT; struct tm * tmpT; nowT = time (NULL); tmpT = gmtime (&nowT); gmtT = mktime (tmpT); tmpT = localtime (&nowT); localT = mktime (tmpT); sLocalTimeOffset = (long) (gmtT - localT); if (tmpT->tm_isdst) { 
sLocalTimeOffset -= 60 * 60; // 1 hour } sPacificDaylightTime = daylight; sPacificTimeOffset = (sPacificDaylightTime? 7 : 8 ) * 60 * 60; datetimeToCodes["wkday"] = "%a"; // Thu datetimeToCodes["weekday"] = "%A"; // Thursday datetimeToCodes["year4"] = "%Y"; // 2009 datetimeToCodes["year"] = "%Y"; // 2009 datetimeToCodes["year2"] = "%y"; // 09 datetimeToCodes["mth"] = "%b"; // Aug datetimeToCodes["month"] = "%B"; // August datetimeToCodes["mthnum"] = "%m"; // 08 datetimeToCodes["day"] = "%d"; // 31 datetimeToCodes["sday"] = "%-d"; // 9 datetimeToCodes["hour24"] = "%H"; // 14 datetimeToCodes["hour"] = "%H"; // 14 datetimeToCodes["hour12"] = "%I"; // 02 datetimeToCodes["min"] = "%M"; // 59 datetimeToCodes["ampm"] = "%p"; // AM datetimeToCodes["second"] = "%S"; // 59 datetimeToCodes["timezone"] = "%Z"; // PST } void tokenizeStringToArray(const std::string& data, std::vector& output) { output.clear(); size_t length = data.size(); // tokenize it and put it in the array std::string cur_word; for(size_t i = 0; i < length; ++i) { if(data[i] == ':') { output.push_back(cur_word); cur_word.clear(); } else { cur_word.append(1, data[i]); } } output.push_back(cur_word); } void LLStringOps::setupWeekDaysNames(const std::string& data) { tokenizeStringToArray(data,sWeekDayList); } void LLStringOps::setupWeekDaysShortNames(const std::string& data) { tokenizeStringToArray(data,sWeekDayShortList); } void LLStringOps::setupMonthNames(const std::string& data) { tokenizeStringToArray(data,sMonthList); } void LLStringOps::setupMonthShortNames(const std::string& data) { tokenizeStringToArray(data,sMonthShortList); } void LLStringOps::setupDayFormat(const std::string& data) { sDayFormat = data; } std::string LLStringOps::getDatetimeCode (std::string key) { std::map::iterator iter; iter = datetimeToCodes.find (key); if (iter != datetimeToCodes.end()) { return iter->second; } else { return std::string(""); } } std::string LLStringOps::getReadableNumber(F64 num) { if (fabs(num)>=1e9) { return 
llformat("%.2lfB", num / 1e9); } else if (fabs(num)>=1e6) { return llformat("%.2lfM", num / 1e6); } else if (fabs(num)>=1e3) { return llformat("%.2lfK", num / 1e3); } else { return llformat("%.2lf", num); } } namespace LLStringFn { // NOTE - this restricts output to ascii void replace_nonprintable_in_ascii(std::basic_string& string, char replacement) { const char MIN = 0x20; std::basic_string::size_type len = string.size(); for(std::basic_string::size_type ii = 0; ii < len; ++ii) { if(string[ii] < MIN) { string[ii] = replacement; } } } // NOTE - this restricts output to ascii void replace_nonprintable_and_pipe_in_ascii(std::basic_string& str, char replacement) { const char MIN = 0x20; const char PIPE = 0x7c; std::basic_string::size_type len = str.size(); for(std::basic_string::size_type ii = 0; ii < len; ++ii) { if( (str[ii] < MIN) || (str[ii] == PIPE) ) { str[ii] = replacement; } } } // https://wiki.lindenlab.com/wiki/Unicode_Guidelines has details on // allowable code points for XML. Specifically, they are: // 0x09, 0x0a, 0x0d, and 0x20 on up. JC std::string strip_invalid_xml(const std::string& instr) { std::string output; output.reserve( instr.size() ); std::string::const_iterator it = instr.begin(); while (it != instr.end()) { // Must compare as unsigned for >= // Test most likely match first const unsigned char c = (unsigned char)*it; if ( c >= (unsigned char)0x20 // SPACE || c == (unsigned char)0x09 // TAB || c == (unsigned char)0x0a // LINE_FEED || c == (unsigned char)0x0d ) // CARRIAGE_RETURN { output.push_back(c); } ++it; } return output; } /** * @brief Replace all control characters (c < 0x20) with replacement in * string. 
*/ void replace_ascii_controlchars(std::basic_string& string, char replacement) { const unsigned char MIN = 0x20; std::basic_string::size_type len = string.size(); for(std::basic_string::size_type ii = 0; ii < len; ++ii) { const unsigned char c = (unsigned char) string[ii]; if(c < MIN) { string[ii] = replacement; } } } } //////////////////////////////////////////////////////////// // Forward specialization of LLStringUtil::format before use in LLStringUtil::formatDatetime. template<> S32 LLStringUtil::format(std::string& s, const format_map_t& substitutions); //static template<> void LLStringUtil::getTokens(const std::string& instr, std::vector& tokens, const std::string& delims) { // Starting at offset 0, scan forward for the next non-delimiter. We're // done when the only characters left in 'instr' are delimiters. for (std::string::size_type begIdx, endIdx = 0; (begIdx = instr.find_first_not_of (delims, endIdx)) != std::string::npos; ) { // Found a non-delimiter. After that, find the next delimiter. endIdx = instr.find_first_of (delims, begIdx); if (endIdx == std::string::npos) { // No more delimiters: this token extends to the end of the string. 
endIdx = instr.length(); } // extract the token between begIdx and endIdx; substr() needs length std::string currToken(instr.substr(begIdx, endIdx - begIdx)); LLStringUtil::trim (currToken); tokens.push_back(currToken); // next scan past delimiters starts at endIdx } } template<> LLStringUtil::size_type LLStringUtil::getSubstitution(const std::string& instr, size_type& start, std::vector& tokens) { const std::string delims (","); // Find the first [ size_type pos1 = instr.find('[', start); if (pos1 == std::string::npos) return std::string::npos; //Find the first ] after the initial [ size_type pos2 = instr.find(']', pos1); if (pos2 == std::string::npos) return std::string::npos; // Find the last [ before ] in case of nested [[]] pos1 = instr.find_last_of('[', pos2-1); if (pos1 == std::string::npos || pos1 < start) return std::string::npos; getTokens(std::string(instr,pos1+1,pos2-pos1-1), tokens, delims); start = pos2+1; return pos1; } // static template<> bool LLStringUtil::simpleReplacement(std::string &replacement, std::string token, const format_map_t& substitutions) { // see if we have a replacement for the bracketed string (without the brackets) // test first using has() because if we just look up with operator[] we get back an // empty string even if the value is missing. We want to distinguish between // missing replacements and deliberately empty replacement strings. 
format_map_t::const_iterator iter = substitutions.find(token); if (iter != substitutions.end()) { replacement = iter->second; return true; } // if not, see if there's one WITH brackets iter = substitutions.find(std::string("[" + token + "]")); if (iter != substitutions.end()) { replacement = iter->second; return true; } return false; } // static template<> bool LLStringUtil::simpleReplacement(std::string &replacement, std::string token, const LLSD& substitutions) { // see if we have a replacement for the bracketed string (without the brackets) // test first using has() because if we just look up with operator[] we get back an // empty string even if the value is missing. We want to distinguish between // missing replacements and deliberately empty replacement strings. if (substitutions.has(token)) { replacement = substitutions[token].asString(); return true; } // if not, see if there's one WITH brackets else if (substitutions.has(std::string("[" + token + "]"))) { replacement = substitutions[std::string("[" + token + "]")].asString(); return true; } return false; } //static template<> void LLStringUtil::setLocale(std::string inLocale) { sLocale = inLocale; }; //static template<> std::string LLStringUtil::getLocale(void) { return sLocale; }; // static template<> void LLStringUtil::formatNumber(std::string& numStr, std::string decimals) { std::stringstream strStream; S32 intDecimals = 0; convertToS32 (decimals, intDecimals); if (!sLocale.empty()) { // std::locale() throws if the locale is unknown! 
(EXT-7926) try { strStream.imbue(std::locale(sLocale.c_str())); } catch (const std::exception &) { LL_WARNS_ONCE("Locale") << "Cannot set locale to " << sLocale << LL_ENDL; } } if (!intDecimals) { S32 intStr; if (convertToS32(numStr, intStr)) { strStream << intStr; numStr = strStream.str(); } } else { F32 floatStr; if (convertToF32(numStr, floatStr)) { strStream << std::fixed << std::showpoint << std::setprecision(intDecimals) << floatStr; numStr = strStream.str(); } } } // static template<> bool LLStringUtil::formatDatetime(std::string& replacement, std::string token, std::string param, S32 secFromEpoch) { if (param == "local") // local { secFromEpoch -= LLStringOps::getLocalTimeOffset(); } else if (param != "utc") // slt { secFromEpoch -= LLStringOps::getPacificTimeOffset(); } // if never fell into those two ifs above, param must be utc if (secFromEpoch < 0) secFromEpoch = 0; LLDate datetime((F64)secFromEpoch); std::string code = LLStringOps::getDatetimeCode (token); // special case to handle timezone if (code == "%Z") { if (param == "utc") { replacement = "GMT"; } else if (param == "local") { replacement = ""; // user knows their own timezone } else { #if 0 // EXT-1565 : Zai Lynch, James Linden : 15/Oct/09 // [BSI] Feedback: Viewer clock mentions SLT, but would prefer it to show PST/PDT // "slt" = Second Life Time, which is deprecated. // If not utc or user local time, fallback to Pacific time replacement = LLStringOps::getPacificDaylightTime() ? "PDT" : "PST"; #else // SL-20370 : Steeltoe Linden : 29/Sep/23 // Change "PDT" to "SLT" on menu bar replacement = "SLT"; #endif } return true; } //EXT-7013 //few codes are not suppotred by strtime function (example - weekdays for Japanise) //so use predefined ones //if sWeekDayList is not empty than current locale doesn't support //weekday name. 
time_t loc_seconds = (time_t) secFromEpoch; if(LLStringOps::sWeekDayList.size() == 7 && code == "%A") { struct tm * gmt = gmtime (&loc_seconds); replacement = LLStringOps::sWeekDayList[gmt->tm_wday]; } else if(LLStringOps::sWeekDayShortList.size() == 7 && code == "%a") { struct tm * gmt = gmtime (&loc_seconds); replacement = LLStringOps::sWeekDayShortList[gmt->tm_wday]; } else if(LLStringOps::sMonthList.size() == 12 && code == "%B") { struct tm * gmt = gmtime (&loc_seconds); replacement = LLStringOps::sMonthList[gmt->tm_mon]; } else if( !LLStringOps::sDayFormat.empty() && code == "%d" ) { struct tm * gmt = gmtime (&loc_seconds); LLStringUtil::format_map_t args; args["[MDAY]"] = llformat ("%d", gmt->tm_mday); replacement = LLStringOps::sDayFormat; LLStringUtil::format(replacement, args); } else if (code == "%-d") { struct tm * gmt = gmtime (&loc_seconds); replacement = llformat ("%d", gmt->tm_mday); // day of the month without leading zero } else if( !LLStringOps::sAM.empty() && !LLStringOps::sPM.empty() && code == "%p" ) { struct tm * gmt = gmtime (&loc_seconds); if(gmt->tm_hour<12) { replacement = LLStringOps::sAM; } else { replacement = LLStringOps::sPM; } } else { replacement = datetime.toHTTPDateString(code); } // *HACK: delete leading zero from hour string in case 'hour12' (code = %I) time format // to show time without leading zero, e.g. 08:16 -> 8:16 (EXT-2738). // We could have used '%l' format instead, but it's not supported by Windows. if(code == "%I" && token == "hour12" && replacement.at(0) == '0') { replacement = replacement.at(1); } return !code.empty(); } // LLStringUtil::format recogizes the following patterns. // All substitutions *must* be encased in []'s in the input string. // The []'s are optional in the substitution map. 
// [FOO_123] // [FOO,number,precision] // [FOO,datetime,format] // static template<> S32 LLStringUtil::format(std::string& s, const format_map_t& substitutions) { LL_PROFILE_ZONE_SCOPED_CATEGORY_STRING; S32 res = 0; std::string output; std::vector tokens; std::string::size_type start = 0; std::string::size_type prev_start = 0; std::string::size_type key_start = 0; while ((key_start = getSubstitution(s, start, tokens)) != std::string::npos) { output += std::string(s, prev_start, key_start-prev_start); prev_start = start; bool found_replacement = false; std::string replacement; if (tokens.size() == 0) { found_replacement = false; } else if (tokens.size() == 1) { found_replacement = simpleReplacement (replacement, tokens[0], substitutions); } else if (tokens[1] == "number") { std::string param = "0"; if (tokens.size() > 2) param = tokens[2]; found_replacement = simpleReplacement (replacement, tokens[0], substitutions); if (found_replacement) formatNumber (replacement, param); } else if (tokens[1] == "datetime") { std::string param; if (tokens.size() > 2) param = tokens[2]; format_map_t::const_iterator iter = substitutions.find("datetime"); if (iter != substitutions.end()) { S32 secFromEpoch = 0; BOOL r = LLStringUtil::convertToS32(iter->second, secFromEpoch); if (r) { found_replacement = formatDatetime(replacement, tokens[0], param, secFromEpoch); } } } if (found_replacement) { output += replacement; res++; } else { // we had no replacement, use the string as is // e.g. 
"hello [MISSING_REPLACEMENT]" or "-=[Stylized Name]=-" output += std::string(s, key_start, start-key_start); } tokens.clear(); } // send the remainder of the string (with no further matches for bracketed names) output += std::string(s, start); s = output; return res; } //static template<> S32 LLStringUtil::format(std::string& s, const LLSD& substitutions) { LL_PROFILE_ZONE_SCOPED_CATEGORY_STRING; S32 res = 0; if (!substitutions.isMap()) { return res; } std::string output; std::vector tokens; std::string::size_type start = 0; std::string::size_type prev_start = 0; std::string::size_type key_start = 0; while ((key_start = getSubstitution(s, start, tokens)) != std::string::npos) { output += std::string(s, prev_start, key_start-prev_start); prev_start = start; bool found_replacement = false; std::string replacement; if (tokens.size() == 0) { found_replacement = false; } else if (tokens.size() == 1) { found_replacement = simpleReplacement (replacement, tokens[0], substitutions); } else if (tokens[1] == "number") { std::string param = "0"; if (tokens.size() > 2) param = tokens[2]; found_replacement = simpleReplacement (replacement, tokens[0], substitutions); if (found_replacement) formatNumber (replacement, param); } else if (tokens[1] == "datetime") { std::string param; if (tokens.size() > 2) param = tokens[2]; S32 secFromEpoch = (S32) substitutions["datetime"].asInteger(); found_replacement = formatDatetime (replacement, tokens[0], param, secFromEpoch); } if (found_replacement) { output += replacement; res++; } else { // we had no replacement, use the string as is // e.g. 
"hello [MISSING_REPLACEMENT]" or "-=[Stylized Name]=-" output += std::string(s, key_start, start-key_start); } tokens.clear(); } // send the remainder of the string (with no further matches for bracketed names) output += std::string(s, start); s = output; return res; } //////////////////////////////////////////////////////////// // Testing #ifdef _DEBUG template void LLStringUtilBase::testHarness() { std::string s1; llassert( s1.c_str() == NULL ); llassert( s1.size() == 0 ); llassert( s1.empty() ); std::string s2( "hello"); llassert( !strcmp( s2.c_str(), "hello" ) ); llassert( s2.size() == 5 ); llassert( !s2.empty() ); std::string s3( s2 ); llassert( "hello" == s2 ); llassert( s2 == "hello" ); llassert( s2 > "gello" ); llassert( "gello" < s2 ); llassert( "gello" != s2 ); llassert( s2 != "gello" ); std::string s4 = s2; llassert( !s4.empty() ); s4.empty(); llassert( s4.empty() ); std::string s5(""); llassert( s5.empty() ); llassert( isValidIndex(s5, 0) ); llassert( !isValidIndex(s5, 1) ); s3 = s2; s4 = "hello again"; s4 += "!"; s4 += s4; llassert( s4 == "hello again!hello again!" 
); std::string s6 = s2 + " " + s2; std::string s7 = s6; llassert( s6 == s7 ); llassert( !( s6 != s7) ); llassert( !(s6 < s7) ); llassert( !(s6 > s7) ); llassert( !(s6 == "hi")); llassert( s6 == "hello hello"); llassert( s6 < "hi"); llassert( s6[1] == 'e' ); s6[1] = 'f'; llassert( s6[1] == 'f' ); s2.erase( 4, 1 ); llassert( s2 == "hell"); s2.insert( 0, "y" ); llassert( s2 == "yhell"); s2.erase( 1, 3 ); llassert( s2 == "yl"); s2.insert( 1, "awn, don't yel"); llassert( s2 == "yawn, don't yell"); std::string s8 = s2.substr( 6, 5 ); llassert( s8 == "don't" ); std::string s9 = " \t\ntest \t\t\n "; trim(s9); llassert( s9 == "test" ); s8 = "abc123&*(ABC"; s9 = s8; toUpper(s9); llassert( s9 == "ABC123&*(ABC" ); s9 = s8; toLower(s9); llassert( s9 == "abc123&*(abc" ); std::string s10( 10, 'x' ); llassert( s10 == "xxxxxxxxxx" ); std::string s11( "monkey in the middle", 7, 2 ); llassert( s11 == "in" ); std::string s12; //empty s12 += "foo"; llassert( s12 == "foo" ); std::string s13; //empty s13 += 'f'; llassert( s13 == "f" ); } #endif // _DEBUG