diff options
Diffstat (limited to 'indra')
-rw-r--r-- | indra/llcommon/llstring.cpp | 16 | ||||
-rw-r--r-- | indra/llcommon/llstring.h | 355 | ||||
-rw-r--r-- | indra/llcommon/tests/StringVec.h | 37 | ||||
-rw-r--r-- | indra/llcommon/tests/listener.h | 21 | ||||
-rw-r--r-- | indra/llcommon/tests/llstring_test.cpp | 115 |
5 files changed, 517 insertions, 27 deletions
diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp index e7fe656808..fa0eb9f72c 100644 --- a/indra/llcommon/llstring.cpp +++ b/indra/llcommon/llstring.cpp @@ -912,22 +912,24 @@ S32 LLStringUtil::format(std::string& s, const format_map_t& substitutions); template<> void LLStringUtil::getTokens(const std::string& instr, std::vector<std::string >& tokens, const std::string& delims) { - std::string currToken; - std::string::size_type begIdx, endIdx; - - begIdx = instr.find_first_not_of (delims); - while (begIdx != std::string::npos) + // Starting at offset 0, scan forward for the next non-delimiter. We're + // done when the only characters left in 'instr' are delimiters. + for (std::string::size_type begIdx, endIdx = 0; + (begIdx = instr.find_first_not_of (delims, endIdx)) != std::string::npos; ) { + // Found a non-delimiter. After that, find the next delimiter. endIdx = instr.find_first_of (delims, begIdx); if (endIdx == std::string::npos) { + // No more delimiters: this token extends to the end of the string. endIdx = instr.length(); } - currToken = instr.substr(begIdx, endIdx - begIdx); + // extract the token between begIdx and endIdx; substr() needs length + std::string currToken(instr.substr(begIdx, endIdx - begIdx)); LLStringUtil::trim (currToken); tokens.push_back(currToken); - begIdx = instr.find_first_not_of (delims, endIdx); + // next scan past delimiters starts at endIdx } } diff --git a/indra/llcommon/llstring.h b/indra/llcommon/llstring.h index 7b24b5e279..e4ae54cec5 100644 --- a/indra/llcommon/llstring.h +++ b/indra/llcommon/llstring.h @@ -40,6 +40,7 @@ #endif #include <string.h> +#include <boost/scoped_ptr.hpp> #if LL_SOLARIS // stricmp and strnicmp do not exist on Solaris: @@ -247,7 +248,38 @@ public: static const string_type null; typedef std::map<LLFormatMapString, LLFormatMapString> format_map_t; - LL_COMMON_API static void getTokens(const string_type& instr, std::vector<string_type >& tokens, const string_type& delims); + /// considers any sequence of delims as a single field separator + LL_COMMON_API static void getTokens(const string_type& instr, + std::vector<string_type >& tokens, + const string_type& delims); + /// like simple scan overload, but returns scanned vector + LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr, + const string_type& delims); + /// add support for keep_delims and quotes (either could be empty string) + LL_COMMON_API static void getTokens(const string_type& instr, + std::vector<string_type>& tokens, + const string_type& drop_delims, + const string_type& keep_delims, + const string_type& quotes=string_type()); + /// like keep_delims-and-quotes overload, but returns scanned vector + LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr, + const string_type& drop_delims, + const string_type& keep_delims, + const string_type& quotes=string_type()); + /// add support for escapes (could be empty string) + LL_COMMON_API static void getTokens(const string_type& instr, + std::vector<string_type>& tokens, + const string_type& drop_delims, + const string_type& keep_delims, + const string_type& quotes, + const string_type& escapes); + /// like escapes overload, but returns scanned vector + LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr, + const string_type& drop_delims, + const string_type& keep_delims, + const string_type& quotes, + const string_type& escapes); + LL_COMMON_API static void formatNumber(string_type& numStr, string_type decimals); LL_COMMON_API static bool formatDatetime(string_type& replacement, string_type token, string_type param, S32 secFromEpoch); LL_COMMON_API static S32 format(string_type& s, const format_map_t& substitutions); @@ -262,6 +294,11 @@ public: return !string.empty() && (0 <= i) && (i <= string.size()); } + static bool contains(const string_type& string, T c, size_type i=0) + { + return string.find(c, i) != string_type::npos; + } + static void trimHead(string_type& string); static void trimTail(string_type& string); static void trim(string_type& string) { trimHead(string); trimTail(string); } @@ -650,10 +687,324 @@ namespace LLStringFn //////////////////////////////////////////////////////////// // NOTE: LLStringUtil::format, getTokens, and support functions moved to llstring.cpp. // There is no LLWStringUtil::format implementation currently. -// Calling thse for anything other than LLStringUtil will produce link errors. +// Calling these for anything other than LLStringUtil will produce link errors. //////////////////////////////////////////////////////////// +// static +template <class T> +std::vector<typename LLStringUtilBase<T>::string_type> +LLStringUtilBase<T>::getTokens(const string_type& instr, const string_type& delims) +{ + std::vector<string_type> tokens; + getTokens(instr, tokens, delims); + return tokens; +} + +// static +template <class T> +std::vector<typename LLStringUtilBase<T>::string_type> +LLStringUtilBase<T>::getTokens(const string_type& instr, + const string_type& drop_delims, + const string_type& keep_delims, + const string_type& quotes) +{ + std::vector<string_type> tokens; + getTokens(instr, tokens, drop_delims, keep_delims, quotes); + return tokens; +} + +// static +template <class T> +std::vector<typename LLStringUtilBase<T>::string_type> +LLStringUtilBase<T>::getTokens(const string_type& instr, + const string_type& drop_delims, + const string_type& keep_delims, + const string_type& quotes, + const string_type& escapes) +{ + std::vector<string_type> tokens; + getTokens(instr, tokens, drop_delims, keep_delims, quotes, escapes); + return tokens; +} + +namespace LLStringUtilBaseImpl +{ + +/** + * Input string scanner helper for getTokens(), or really any other + * character-parsing routine that may have to deal with escape characters. + * This implementation defines the concept (also an interface, should you + * choose to implement the concept by subclassing) and provides trivial + * implementations for a string @em without escape processing. + */ +template <class T> +struct InString +{ + typedef std::basic_string<T> string_type; + typedef typename string_type::const_iterator const_iterator; + + InString(const_iterator b, const_iterator e): + iter(b), + end(e) + {} + + bool done() const { return iter == end; } + /// Is the current character (*iter) escaped? This implementation can + /// answer trivially because it doesn't support escapes. + virtual bool escaped() const { return false; } + /// Obtain the current character and advance @c iter. + virtual T next() { return *iter++; } + /// Does the current character match specified character? + virtual bool is(T ch) const { return (! done()) && *iter == ch; } + /// Is the current character any one of the specified characters? + virtual bool oneof(const string_type& delims) const + { + return (! done()) && LLStringUtilBase<T>::contains(delims, *iter); + } + + /** + * Scan forward from @from until either @a delim or end. This is primarily + * useful for processing quoted substrings. + * + * If we do see @a delim, append everything from @from until (excluding) + * @a delim to @a into, advance @c iter to skip @a delim, and return @c + * true. + * + * If we do not see @a delim, do not alter @a into or @c iter and return + * @c false. Do not pass GO, do not collect $200. + * + * @note The @c false case described above implements normal getTokens() + * treatment of an unmatched open quote: treat the quote character as if + * escaped, that is, simply collect it as part of the current token. Other + * plausible behaviors directly affect the way getTokens() deals with an + * unmatched quote: e.g. throwing an exception to treat it as an error, or + * assuming a close quote beyond end of string (in which case return @c + * true). + */ + virtual bool collect_until(string_type& into, const_iterator from, T delim) + { + const_iterator found = std::find(from, end, delim); + // If we didn't find delim, change nothing, just tell caller. + if (found == end) + return false; + // Found delim! Append everything between from and found. + into.append(from, found); + // advance past delim in input + iter = found + 1; + return true; + } + + const_iterator iter, end; +}; + +/// InString subclass that handles escape characters +template <class T> +class InEscString: public InString<T> +{ +public: + typedef InString<T> super; + typedef typename super::string_type string_type; + typedef typename super::const_iterator const_iterator; + using super::done; + using super::iter; + using super::end; + + InEscString(const_iterator b, const_iterator e, const string_type& escapes_): + super(b, e), + escapes(escapes_) + { + // Even though we've already initialized 'iter' via our base-class + // constructor, set it again to check for initial escape char. + setiter(b); + } + + /// This implementation uses the answer cached by setiter(). + virtual bool escaped() const { return isesc; } + virtual T next() + { + // If we're looking at the escape character of an escape sequence, + // skip that character. This is the one time we can modify 'iter' + // without using setiter: for this one case we DO NOT CARE if the + // escaped character is itself an escape. + if (isesc) + ++iter; + // If we were looking at an escape character, this is the escaped + // character; otherwise it's just the next character. + T result(*iter); + // Advance iter, checking for escape sequence. + setiter(iter + 1); + return result; + } + + virtual bool is(T ch) const + { + // Like base-class is(), except that an escaped character matches + // nothing. + return (! done()) && (! isesc) && *iter == ch; + } + + virtual bool oneof(const string_type& delims) const + { + // Like base-class oneof(), except that an escaped character matches + // nothing. + return (! done()) && (! isesc) && LLStringUtilBase<T>::contains(delims, *iter); + } + + virtual bool collect_until(string_type& into, const_iterator from, T delim) + { + // Deal with escapes in the characters we collect; that is, an escaped + // character must become just that character without the preceding + // escape. Collect characters in a separate string rather than + // directly appending to 'into' in case we do not find delim, in which + // case we're supposed to leave 'into' unmodified. + string_type collected; + // For scanning purposes, we're going to work directly with 'iter'. + // Save its current value in case we fail to see delim. + const_iterator save_iter(iter); + // Okay, set 'iter', checking for escape. + setiter(from); + while (! done()) + { + // If we see an unescaped delim, stop and report success. + if ((! isesc) && *iter == delim) + { + // Append collected chars to 'into'. + into.append(collected); + // Don't forget to advance 'iter' past delim. + setiter(iter + 1); + return true; + } + // We're not at end, and either we're not looking at delim or it's + // escaped. Collect this character and keep going. + collected.push_back(next()); + } + // Here we hit 'end' without ever seeing delim. Restore iter and tell + // caller. + setiter(save_iter); + return false; + } + +private: + void setiter(const_iterator i) + { + iter = i; + + // Every time we change 'iter', set 'isesc' to be able to repetitively + // answer escaped() without having to rescan 'escapes'. isesc caches + // contains(escapes, *iter). + + // We're looking at an escaped char if we're not already at end (that + // is, *iter is even meaningful); if *iter is in fact one of the + // specified escape characters; and if there's one more character + // following it. That is, if an escape character is the very last + // character of the input string, it loses its special meaning. + isesc = (! done()) && + LLStringUtilBase<T>::contains(escapes, *iter) && + (iter+1) != end; + } + + const string_type escapes; + bool isesc; +}; + +/// getTokens() implementation based on InString concept +template <typename INSTRING, typename string_type> +void getTokens(INSTRING& instr, std::vector<string_type>& tokens, + const string_type& drop_delims, const string_type& keep_delims, + const string_type& quotes) +{ + // There are times when we want to match either drop_delims or + // keep_delims. Concatenate them up front to speed things up. + string_type all_delims(drop_delims + keep_delims); + // no tokens yet + tokens.clear(); + + // try for another token + while (! instr.done()) + { + // scan past any drop_delims + while (instr.oneof(drop_delims)) + { + // skip this drop_delim + instr.next(); + // but if that was the end of the string, done + if (instr.done()) + return; + } + // found the start of another token: make a slot for it + tokens.push_back(string_type()); + if (instr.oneof(keep_delims)) + { + // *iter is a keep_delim, a token of exactly 1 character. Append + // that character to the new token and proceed. + tokens.back().push_back(instr.next()); + continue; + } + // Here we have a non-delimiter token, which might consist of a mix of + // quoted and unquoted parts. Use bash rules for quoting: you can + // embed a quoted substring in the midst of an unquoted token (e.g. + // ~/"sub dir"/myfile.txt); you can ram two quoted substrings together + // to make a single token (e.g. 'He said, "'"Don't."'"'). We diverge + // from bash in that bash considers an unmatched quote an error. Our + // param signature doesn't allow for errors, so just pretend it's not + // a quote and embed it. + // At this level, keep scanning until we hit the next delimiter of + // either type (drop_delims or keep_delims). + while (! instr.oneof(all_delims)) + { + // If we're looking at an open quote, search forward for + // a close quote, collecting characters along the way. + if (instr.oneof(quotes) && + instr.collect_until(tokens.back(), instr.iter+1, *instr.iter)) + { + // collect_until is cleverly designed to do exactly what we + // need here. No further action needed if it returns true. + } + else + { + // Either *iter isn't a quote, or there's no matching close + // quote: in other words, just an ordinary char. Append it to + // current token. + tokens.back().push_back(instr.next()); + } + // having scanned that segment of this token, if we've reached the + // end of the string, we're done + if (instr.done()) + return; + } + } +} + +} // namespace LLStringUtilBaseImpl + +// static +template <class T> +void LLStringUtilBase<T>::getTokens(const string_type& string, std::vector<string_type>& tokens, + const string_type& drop_delims, const string_type& keep_delims, + const string_type& quotes) +{ + // Because this overload doesn't support escapes, use simple InString to + // manage input range. + LLStringUtilBaseImpl::InString<T> instring(string.begin(), string.end()); + LLStringUtilBaseImpl::getTokens(instring, tokens, drop_delims, keep_delims, quotes); +} + +// static +template <class T> +void LLStringUtilBase<T>::getTokens(const string_type& string, std::vector<string_type>& tokens, + const string_type& drop_delims, const string_type& keep_delims, + const string_type& quotes, const string_type& escapes) +{ + // This overload must deal with escapes. Delegate that to InEscString + // (unless there ARE no escapes). + boost::scoped_ptr< LLStringUtilBaseImpl::InString<T> > instrp; + if (escapes.empty()) + instrp.reset(new LLStringUtilBaseImpl::InString<T>(string.begin(), string.end())); + else + instrp.reset(new LLStringUtilBaseImpl::InEscString<T>(string.begin(), string.end(), escapes)); + LLStringUtilBaseImpl::getTokens(*instrp, tokens, drop_delims, keep_delims, quotes); +} // static template<class T> diff --git a/indra/llcommon/tests/StringVec.h b/indra/llcommon/tests/StringVec.h new file mode 100644 index 0000000000..a380b00a05 --- /dev/null +++ b/indra/llcommon/tests/StringVec.h @@ -0,0 +1,37 @@ +/** + * @file StringVec.h + * @author Nat Goodspeed + * @date 2012-02-24 + * @brief Extend TUT ensure_equals() to handle std::vector<std::string> + * + * $LicenseInfo:firstyear=2012&license=viewerlgpl$ + * Copyright (c) 2012, Linden Research, Inc. + * $/LicenseInfo$ + */ + +#if ! defined(LL_STRINGVEC_H) +#define LL_STRINGVEC_H + +#include <vector> +#include <string> +#include <iostream> + +typedef std::vector<std::string> StringVec; + +std::ostream& operator<<(std::ostream& out, const StringVec& strings) +{ + out << '('; + StringVec::const_iterator begin(strings.begin()), end(strings.end()); + if (begin != end) + { + out << '"' << *begin << '"'; + while (++begin != end) + { + out << ", \"" << *begin << '"'; + } + } + out << ')'; + return out; +} + +#endif /* ! defined(LL_STRINGVEC_H) */ diff --git a/indra/llcommon/tests/listener.h b/indra/llcommon/tests/listener.h index dcdb2412be..9c5c18a150 100644 --- a/indra/llcommon/tests/listener.h +++ b/indra/llcommon/tests/listener.h @@ -30,6 +30,8 @@ #define LL_LISTENER_H #include "llsd.h" +#include "llevents.h" +#include "tests/StringVec.h" #include <iostream> /***************************************************************************** @@ -133,24 +135,7 @@ struct Collect return false; } void clear() { result.clear(); } - typedef std::vector<std::string> StringList; - StringList result; + StringVec result; }; -std::ostream& operator<<(std::ostream& out, const Collect::StringList& strings) -{ - out << '('; - Collect::StringList::const_iterator begin(strings.begin()), end(strings.end()); - if (begin != end) - { - out << '"' << *begin << '"'; - while (++begin != end) - { - out << ", \"" << *begin << '"'; - } - } - out << ')'; - return out; -} - #endif /* ! defined(LL_LISTENER_H) */ diff --git a/indra/llcommon/tests/llstring_test.cpp b/indra/llcommon/tests/llstring_test.cpp index 6a1cbf652a..821deeac21 100644 --- a/indra/llcommon/tests/llstring_test.cpp +++ b/indra/llcommon/tests/llstring_test.cpp @@ -29,7 +29,11 @@ #include "linden_common.h" #include "../test/lltut.h" +#include <boost/assign/list_of.hpp> #include "../llstring.h" +#include "StringVec.h" + +using boost::assign::list_of; namespace tut { @@ -750,4 +754,115 @@ namespace tut ensure("empty substr.", !LLStringUtil::endsWith(empty, value)); ensure("empty everything.", !LLStringUtil::endsWith(empty, empty)); } + + template<> template<> + void string_index_object_t::test<41>() + { + set_test_name("getTokens(\"delims\")"); + ensure_equals("empty string", LLStringUtil::getTokens("", " "), StringVec()); + ensure_equals("only delims", + LLStringUtil::getTokens(" \r\n ", " \r\n"), StringVec()); + ensure_equals("sequence of delims", + LLStringUtil::getTokens(",,, one ,,,", ","), list_of("one")); + // nat considers this a dubious implementation side effect, but I'd + // hate to change it now... + ensure_equals("noncontiguous tokens", + LLStringUtil::getTokens(", ,, , one ,,,", ","), list_of("")("")("one")); + ensure_equals("space-padded tokens", + LLStringUtil::getTokens(", one , two ,", ","), list_of("one")("two")); + ensure_equals("no delims", LLStringUtil::getTokens("one", ","), list_of("one")); + } + + // Shorthand for verifying that getTokens() behaves the same when you + // don't pass a string of escape characters, when you pass an empty string + // (different overloads), and when you pass a string of characters that + // aren't actually present. + void ensure_getTokens(const std::string& desc, + const std::string& string, + const std::string& drop_delims, + const std::string& keep_delims, + const std::string& quotes, + const std::vector<std::string>& expect) + { + ensure_equals(desc + " - no esc", + LLStringUtil::getTokens(string, drop_delims, keep_delims, quotes), + expect); + ensure_equals(desc + " - empty esc", + LLStringUtil::getTokens(string, drop_delims, keep_delims, quotes, ""), + expect); + ensure_equals(desc + " - unused esc", + LLStringUtil::getTokens(string, drop_delims, keep_delims, quotes, "!"), + expect); + } + + void ensure_getTokens(const std::string& desc, + const std::string& string, + const std::string& drop_delims, + const std::string& keep_delims, + const std::vector<std::string>& expect) + { + ensure_getTokens(desc, string, drop_delims, keep_delims, "", expect); + } + + template<> template<> + void string_index_object_t::test<42>() + { + set_test_name("getTokens(\"delims\", etc.)"); + // Signatures to test in this method: + // getTokens(string, drop_delims, keep_delims [, quotes [, escapes]]) + // If you omit keep_delims, you get the older function (test above). + + // cases like the getTokens(string, delims) tests above + ensure_getTokens("empty string", "", " ", "", StringVec()); + ensure_getTokens("only delims", + " \r\n ", " \r\n", "", StringVec()); + ensure_getTokens("sequence of delims", + ",,, one ,,,", ", ", "", list_of("one")); + // Note contrast with the case in the previous method + ensure_getTokens("noncontiguous tokens", + ", ,, , one ,,,", ", ", "", list_of("one")); + ensure_getTokens("space-padded tokens", + ", one , two ,", ", ", "", + list_of("one")("two")); + ensure_getTokens("no delims", "one", ",", "", list_of("one")); + + // drop_delims vs. keep_delims + ensure_getTokens("arithmetic", + " ab+def / xx* yy ", " ", "+-*/", + list_of("ab")("+")("def")("/")("xx")("*")("yy")); + + // quotes + ensure_getTokens("no quotes", + "She said, \"Don't go.\"", " ", ",", "", + list_of("She")("said")(",")("\"Don't")("go.\"")); + ensure_getTokens("quotes", + "She said, \"Don't go.\"", " ", ",", "\"", + list_of("She")("said")(",")("Don't go.")); + ensure_getTokens("quotes and delims", + "run c:/'Documents and Settings'/someone", " ", "", "'", + list_of("run")("c:/Documents and Settings/someone")); + ensure_getTokens("unmatched quote", + "baby don't leave", " ", "", "'", + list_of("baby")("don't")("leave")); + ensure_getTokens("adjacent quoted", + "abc'def \"ghi'\"jkl' mno\"pqr", " ", "", "\"'", + list_of("abcdef \"ghijkl' mnopqr")); + + // escapes + // Don't use backslash as an escape for these tests -- you'll go nuts + // between the C++ string scanner and getTokens() escapes. Test with + // something else! + ensure_equals("escaped delims", + LLStringUtil::getTokens("^ a - dog^-gone^ phrase", " ", "-", "", "^"), + list_of(" a")("-")("dog-gone phrase")); + ensure_equals("escaped quotes", + LLStringUtil::getTokens("say: 'this isn^'t w^orking'.", " ", "", "'", "^"), + list_of("say:")("this isn't working.")); + ensure_equals("escaped escape", + LLStringUtil::getTokens("want x^^2", " ", "", "", "^"), + list_of("want")("x^2")); + ensure_equals("escape at end", + LLStringUtil::getTokens("it's^ up there^", " ", "", "'", "^"), + list_of("it's up")("there^")); + } } |