5 files changed, 517 insertions, 27 deletions
diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index e7fe656808..fa0eb9f72c 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -912,22 +912,24 @@ S32 LLStringUtil::format(std::string& s, const format_map_t& substitutions);
 template<> 
 void LLStringUtil::getTokens(const std::string& instr, std::vector<std::string >& tokens, const std::string& delims)
 {
-	std::string currToken;
-	std::string::size_type begIdx, endIdx;
-
-	begIdx = instr.find_first_not_of (delims);
-	while (begIdx != std::string::npos)
+	// Each pass skips any run of delimiters starting at endIdx (initially 0).
+	// The loop ends once nothing but delimiters remains in 'instr'.
+	for (std::string::size_type begIdx, endIdx = 0;
+		 (begIdx = instr.find_first_not_of (delims, endIdx)) != std::string::npos; )
 	{
+		// begIdx is the first character of a token; find the delimiter ending it.
 		endIdx = instr.find_first_of (delims, begIdx);
 		if (endIdx == std::string::npos)
 		{
+			// No more delimiters: this token extends to the end of the string.
 			endIdx = instr.length();
 		}
 		
-		currToken = instr.substr(begIdx, endIdx - begIdx);
+		// Extract [begIdx, endIdx); substr() takes a length, not an end offset.
+		std::string currToken(instr.substr(begIdx, endIdx - begIdx));
 		LLStringUtil::trim (currToken);
 		tokens.push_back(currToken);
-		begIdx = instr.find_first_not_of (delims, endIdx);
+		// Trimming may leave an empty token; the next scan resumes at endIdx.
 	}
 }
 
diff --git a/indra/llcommon/llstring.h b/indra/llcommon/llstring.h
index 7b24b5e279..e4ae54cec5 100644
--- a/indra/llcommon/llstring.h
+++ b/indra/llcommon/llstring.h
@@ -40,6 +40,7 @@
 #endif
 
 #include <string.h>
+#include <boost/scoped_ptr.hpp>
 
 #if LL_SOLARIS
 // stricmp and strnicmp do not exist on Solaris:
@@ -247,7 +248,38 @@ public:
 	static const string_type null;
 	
 	typedef std::map<LLFormatMapString, LLFormatMapString> format_map_t;
-	LL_COMMON_API static void getTokens(const string_type& instr, std::vector<string_type >& tokens, const string_type& delims);
+	/// Split @a instr on any run of @a delims; each trimmed token is appended to @a tokens.
+	LL_COMMON_API static void getTokens(const string_type& instr,
+										std::vector<string_type >& tokens,
+										const string_type& delims);
+	/// Same scan as the (instr, tokens, delims) overload, but returns the tokens by value.
+	LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr,
+															const string_type& delims);
+	/// Scan with keep_delims and quotes support (either may be empty); replaces the contents of @a tokens.
+	LL_COMMON_API static void getTokens(const string_type& instr,
+										std::vector<string_type>& tokens,
+										const string_type& drop_delims,
+										const string_type& keep_delims,
+										const string_type& quotes=string_type());
+	/// Same as the keep_delims-and-quotes overload, but returns the tokens by value.
+	LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr,
+															const string_type& drop_delims,
+															const string_type& keep_delims,
+															const string_type& quotes=string_type());
+	/// Also honors @a escapes (may be empty): an escaped character never acts as a delimiter or quote.
+	LL_COMMON_API static void getTokens(const string_type& instr,
+										std::vector<string_type>& tokens,
+										const string_type& drop_delims,
+										const string_type& keep_delims,
+										const string_type& quotes,
+										const string_type& escapes);
+	/// Same as the escapes overload, but returns the tokens by value.
+	LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr,
+															const string_type& drop_delims,
+															const string_type& keep_delims,
+															const string_type& quotes,
+															const string_type& escapes);
+
 	LL_COMMON_API static void formatNumber(string_type& numStr, string_type decimals);
 	LL_COMMON_API static bool formatDatetime(string_type& replacement, string_type token, string_type param, S32 secFromEpoch);
 	LL_COMMON_API static S32 format(string_type& s, const format_map_t& substitutions);
@@ -262,6 +294,11 @@ public:
 		return !string.empty() && (0 <= i) && (i <= string.size());
 	}
 
+	/// Does @a string contain character @a c at or after offset @a i?
+	/// (Thin wrapper over find(), so call sites need not spell out npos.)
+	static bool contains(const string_type& string, T c, size_type i=0)
+	{ return string_type::npos != string.find(c, i); }
+
 	static void	trimHead(string_type& string);
 	static void	trimTail(string_type& string);
 	static void	trim(string_type& string)	{ trimHead(string); trimTail(string); }
@@ -650,10 +687,324 @@ namespace LLStringFn
 ////////////////////////////////////////////////////////////
 // NOTE: LLStringUtil::format, getTokens, and support functions moved to llstring.cpp.
 // There is no LLWStringUtil::format implementation currently.
-// Calling thse for anything other than LLStringUtil will produce link errors.
+// Calling these for anything other than LLStringUtil will produce link errors.
 
 ////////////////////////////////////////////////////////////
 
+// static: by-value convenience form of the (instr, tokens, delims) scan
+template <class T>
+std::vector<typename LLStringUtilBase<T>::string_type>
+LLStringUtilBase<T>::getTokens(const string_type& instr, const string_type& delims)
+{
+	std::vector<string_type> scanned;
+	getTokens(instr, scanned, delims);
+	return scanned;
+}
+
+// static: by-value convenience form of the keep_delims-and-quotes scan
+template <class T>
+std::vector<typename LLStringUtilBase<T>::string_type>
+LLStringUtilBase<T>::getTokens(const string_type& instr,
+							   const string_type& drop_delims,
+							   const string_type& keep_delims,
+							   const string_type& quotes)
+{
+	std::vector<string_type> scanned;
+	getTokens(instr, scanned, drop_delims, keep_delims, quotes);
+	return scanned;
+}
+
+// static: by-value convenience form of the escape-aware scan
+template <class T>
+std::vector<typename LLStringUtilBase<T>::string_type>
+LLStringUtilBase<T>::getTokens(const string_type& instr,
+							   const string_type& drop_delims,
+							   const string_type& keep_delims,
+							   const string_type& quotes,
+							   const string_type& escapes)
+{
+	std::vector<string_type> scanned;
+	getTokens(instr, scanned, drop_delims, keep_delims, quotes, escapes);
+	return scanned;
+}
+
+namespace LLStringUtilBaseImpl
+{
+
+/**
+ * Input string scanner helper for getTokens(), or really any other
+ * character-parsing routine that may have to deal with escape characters.
+ * This implementation defines the concept (also an interface, should you
+ * choose to implement the concept by subclassing) and provides trivial
+ * implementations for a string @em without escape processing.
+ */
+template <class T>
+struct InString
+{
+	typedef std::basic_string<T> string_type;
+	typedef typename string_type::const_iterator const_iterator;
+
+	InString(const_iterator b, const_iterator e):
+		iter(b), end(e) {}
+	/// Virtual: a polymorphic base must be safely deletable via InString*.
+	virtual ~InString() {}
+
+	bool done() const { return iter == end; }
+	/// Is the current character (*iter) escaped? This implementation can
+	/// answer trivially because it doesn't support escapes.
+	virtual bool escaped() const { return false; }
+	/// Obtain the current character and advance @c iter.
+	virtual T next() { return *iter++; }
+	/// Does the current character match specified character?
+	virtual bool is(T ch) const { return (! done()) && *iter == ch; }
+	/// Is the current character any one of the specified characters?
+	virtual bool oneof(const string_type& delims) const
+	{
+		return (! done()) && LLStringUtilBase<T>::contains(delims, *iter);
+	}
+
+	/**
+	 * Scan forward from @a from until either @a delim or end. This is primarily
+	 * useful for processing quoted substrings.
+	 *
+	 * If we do see @a delim, append everything from @a from until (excluding)
+	 * @a delim to @a into, advance @c iter to skip @a delim, and return @c
+	 * true.
+	 *
+	 * If we do not see @a delim, do not alter @a into or @c iter and return
+	 * @c false. Do not pass GO, do not collect $200.
+	 *
+	 * @note The @c false case described above implements normal getTokens()
+	 * treatment of an unmatched open quote: treat the quote character as if
+	 * escaped, that is, simply collect it as part of the current token. Other
+	 * plausible behaviors directly affect the way getTokens() deals with an
+	 * unmatched quote: e.g. throwing an exception to treat it as an error, or
+	 * assuming a close quote beyond end of string (in which case return @c
+	 * true).
+	 */
+	virtual bool collect_until(string_type& into, const_iterator from, T delim)
+	{
+		const_iterator found = std::find(from, end, delim);
+		// If we didn't find delim, change nothing, just tell caller.
+		if (found == end)
+			return false;
+		// Found delim! Append everything between from and found.
+		into.append(from, found);
+		// advance past delim in input
+		iter = found + 1;
+		return true;
+	}
+
+	const_iterator iter, end;
+};
+
+/// InString subclass that additionally recognizes escape characters
+template <class T>
+class InEscString: public InString<T>
+{
+public:
+	typedef InString<T> super;
+	typedef typename super::string_type string_type;
+	typedef typename super::const_iterator const_iterator;
+	using super::done;
+	using super::iter;
+	using super::end;
+
+	InEscString(const_iterator b, const_iterator e, const string_type& escapes_):
+		super(b, e),
+		escapes(escapes_)
+	{
+		// Even though we've already initialized 'iter' via our base-class
+		// constructor, set it again to check for initial escape char.
+		setiter(b);
+	}
+
+	/// This implementation uses the answer cached by setiter().
+	virtual bool escaped() const { return isesc; }
+	virtual T next()
+	{
+		// If we're looking at the escape character of an escape sequence,
+		// skip that character. This is the one time we can modify 'iter'
+		// without using setiter: for this one case we DO NOT CARE if the
+		// escaped character is itself an escape.
+		if (isesc)
+			++iter;
+		// If we were looking at an escape character, this is the escaped
+		// character; otherwise it's just the next character.
+		T result(*iter);
+		// Advance iter, checking for escape sequence.
+		setiter(iter + 1);
+		return result;
+	}
+
+	virtual bool is(T ch) const
+	{
+		// Like base-class is(), except that an escaped character matches
+		// nothing.
+		return (! done()) && (! isesc) && *iter == ch;
+	}
+
+	virtual bool oneof(const string_type& delims) const
+	{
+		// Like base-class oneof(), except that an escaped character matches
+		// nothing.
+		return (! done()) && (! isesc) && LLStringUtilBase<T>::contains(delims, *iter);
+	}
+
+	virtual bool collect_until(string_type& into, const_iterator from, T delim)
+	{
+		// Deal with escapes in the characters we collect; that is, an escaped
+		// character must become just that character without the preceding
+		// escape. Collect characters in a separate string rather than
+		// directly appending to 'into' in case we do not find delim, in which
+		// case we're supposed to leave 'into' unmodified.
+		string_type collected;
+		// For scanning purposes, we're going to work directly with 'iter'.
+		// Save its current value in case we fail to see delim.
+		const_iterator save_iter(iter);
+		// Okay, set 'iter', checking for escape.
+		setiter(from);
+		while (! done())
+		{
+			// If we see an unescaped delim, stop and report success.
+			if ((! isesc) && *iter == delim)
+			{
+				// Append collected chars to 'into'.
+				into.append(collected);
+				// Don't forget to advance 'iter' past delim.
+				setiter(iter + 1);
+				return true;
+			}
+			// We're not at end, and either we're not looking at delim or it's
+			// escaped. Collect this character and keep going.
+			collected.push_back(next());
+		}
+		// Here we hit 'end' without ever seeing delim. Restore iter and tell
+		// caller.
+		setiter(save_iter);
+		return false;
+	}
+
+private:
+	void setiter(const_iterator i)
+	{
+		iter = i;
+
+		// Every time we change 'iter', recompute 'isesc' so escaped(), is()
+		// and oneof() can answer without rescanning 'escapes'. isesc means
+		// "*iter is an escape character that starts an escape sequence".
+
+		// We're looking at an escaped char if we're not already at end (that
+		// is, *iter is even meaningful); if *iter is in fact one of the
+		// specified escape characters; and if there's one more character
+		// following it. That is, if an escape character is the very last
+		// character of the input string, it loses its special meaning.
+		isesc = (! done()) &&
+				LLStringUtilBase<T>::contains(escapes, *iter) &&
+				(iter+1) != end;
+	}
+
+	const string_type escapes;
+	bool isesc;
+};
+
+/// getTokens() implementation for any scanner modeling the InString concept
+template <typename INSTRING, typename string_type>
+void getTokens(INSTRING& instr, std::vector<string_type>& tokens,
+			   const string_type& drop_delims, const string_type& keep_delims,
+			   const string_type& quotes)
+{
+	// There are times when we want to match either drop_delims or
+	// keep_delims. Concatenate them up front to speed things up.
+	string_type all_delims(drop_delims + keep_delims);
+	// discard any previous contents: this scan replaces, it does not append
+	tokens.clear();
+
+	// try for another token
+	while (! instr.done())
+	{
+		// scan past any drop_delims
+		while (instr.oneof(drop_delims))
+		{
+			// skip this drop_delim
+			instr.next();
+			// but if that was the end of the string, done
+			if (instr.done())
+				return;
+		}
+		// found the start of another token: make a slot for it
+		tokens.push_back(string_type());
+		if (instr.oneof(keep_delims))
+		{
+			// *iter is a keep_delim, a token of exactly 1 character. Append
+			// that character to the new token and proceed.
+			tokens.back().push_back(instr.next());
+			continue;
+		}
+		// Here we have a non-delimiter token, which might consist of a mix of
+		// quoted and unquoted parts. Use bash rules for quoting: you can
+		// embed a quoted substring in the midst of an unquoted token (e.g.
+		// ~/"sub dir"/myfile.txt); you can ram two quoted substrings together
+		// to make a single token (e.g. 'He said, "'"Don't."'"'). We diverge
+		// from bash in that bash considers an unmatched quote an error. Our
+		// param signature doesn't allow for errors, so just pretend it's not
+		// a quote and embed it.
+		// At this level, keep scanning until we hit the next delimiter of
+		// either type (drop_delims or keep_delims).
+		while (! instr.oneof(all_delims))
+		{
+			// If we're looking at an open quote, search forward for
+			// a close quote, collecting characters along the way.
+			if (instr.oneof(quotes) &&
+				instr.collect_until(tokens.back(), instr.iter+1, *instr.iter))
+			{
+				// collect_until is cleverly designed to do exactly what we
+				// need here. No further action needed if it returns true.
+			}
+			else
+			{
+				// Either *iter isn't a quote, or there's no matching close
+				// quote: in other words, just an ordinary char. Append it to
+				// current token.
+				tokens.back().push_back(instr.next());
+			}
+			// having scanned that segment of this token, if we've reached the
+			// end of the string, we're done
+			if (instr.done())
+				return;
+		}
+	}
+}
+
+} // namespace LLStringUtilBaseImpl
+
+// static
+template <class T>
+void LLStringUtilBase<T>::getTokens(const string_type& string, std::vector<string_type>& tokens,
+									const string_type& drop_delims, const string_type& keep_delims,
+									const string_type& quotes)
+{
+	// No escape handling is requested here, so the plain InString scanner
+	// suffices to walk the input range.
+	LLStringUtilBaseImpl::InString<T> scanner(string.begin(), string.end());
+	LLStringUtilBaseImpl::getTokens(scanner, tokens, drop_delims, keep_delims, quotes);
+}
+
+// static
+template <class T>
+void LLStringUtilBase<T>::getTokens(const string_type& string, std::vector<string_type>& tokens,
+									const string_type& drop_delims, const string_type& keep_delims,
+									const string_type& quotes, const string_type& escapes)
+{
+	// With no escape characters this is exactly the quotes-only overload.
+	if (escapes.empty())
+		return getTokens(string, tokens, drop_delims, keep_delims, quotes);
+	// Otherwise scan with the escape-aware InEscString. Construct it on the
+	// stack: that needs no heap allocation and never destroys the scanner
+	// through a base-class pointer.
+	LLStringUtilBaseImpl::InEscString<T> instring(string.begin(), string.end(), escapes);
+	LLStringUtilBaseImpl::getTokens(instring, tokens, drop_delims, keep_delims, quotes);
+}
 
 // static
 template<class T> 
diff --git a/indra/llcommon/tests/StringVec.h b/indra/llcommon/tests/StringVec.h
new file mode 100644
index 0000000000..a380b00a05
--- /dev/null
+++ b/indra/llcommon/tests/StringVec.h
@@ -0,0 +1,37 @@
+/**
+ * @file   StringVec.h
+ * @author Nat Goodspeed
+ * @date   2012-02-24
+ * @brief  Extend TUT ensure_equals() to handle std::vector<std::string>
+ * 
+ * $LicenseInfo:firstyear=2012&license=viewerlgpl$
+ * Copyright (c) 2012, Linden Research, Inc.
+ * $/LicenseInfo$
+ */
+
+#if ! defined(LL_STRINGVEC_H)
+#define LL_STRINGVEC_H
+
+#include <vector>
+#include <string>
+#include <iostream>
+
+typedef std::vector<std::string> StringVec;
+
+/// Stream @a strings as a parenthesized, comma-separated list of quoted
+/// items, e.g. ("a", "b"). Declared inline because it is defined in a header:
+/// otherwise each translation unit including it emits a duplicate definition.
+inline std::ostream& operator<<(std::ostream& out, const StringVec& strings)
+{
+    out << '(';
+    const char* separator = "";
+    for (StringVec::const_iterator it(strings.begin()), end(strings.end()); it != end; ++it)
+    {
+        out << separator << '"' << *it << '"';
+        separator = ", ";
+    }
+    out << ')';
+    return out;
+}
+
+#endif /* ! defined(LL_STRINGVEC_H) */
diff --git a/indra/llcommon/tests/listener.h b/indra/llcommon/tests/listener.h
index dcdb2412be..9c5c18a150 100644
--- a/indra/llcommon/tests/listener.h
+++ b/indra/llcommon/tests/listener.h
@@ -30,6 +30,8 @@
 #define LL_LISTENER_H
 
 #include "llsd.h"
+#include "llevents.h"
+#include "tests/StringVec.h"
 #include <iostream>
 
 /*****************************************************************************
@@ -133,24 +135,7 @@ struct Collect
         return false;
     }
     void clear() { result.clear(); }
-    typedef std::vector<std::string> StringList;
-    StringList result;
+    StringVec result;
 };
 
-std::ostream& operator<<(std::ostream& out, const Collect::StringList& strings)
-{
-    out << '(';
-    Collect::StringList::const_iterator begin(strings.begin()), end(strings.end());
-    if (begin != end)
-    {
-        out << '"' << *begin << '"';
-        while (++begin != end)
-        {
-            out << ", \"" << *begin << '"';
-        }
-    }
-    out << ')';
-    return out;
-}
-
 #endif /* ! defined(LL_LISTENER_H) */
diff --git a/indra/llcommon/tests/llstring_test.cpp b/indra/llcommon/tests/llstring_test.cpp
index 6a1cbf652a..821deeac21 100644
--- a/indra/llcommon/tests/llstring_test.cpp
+++ b/indra/llcommon/tests/llstring_test.cpp
@@ -29,7 +29,11 @@
 #include "linden_common.h"
 #include "../test/lltut.h"
 
+#include <boost/assign/list_of.hpp>
 #include "../llstring.h"
+#include "StringVec.h"
+
+using boost::assign::list_of;
 
 namespace tut
 {
@@ -750,4 +754,115 @@ namespace tut
 		ensure("empty substr.", !LLStringUtil::endsWith(empty, value));
 		ensure("empty everything.", !LLStringUtil::endsWith(empty, empty));
 	}
+
+	template<> template<>
+	void string_index_object_t::test<41>()
+	{
+		set_test_name("getTokens(\"delims\")");
+		ensure_equals("empty string", LLStringUtil::getTokens("", " "), StringVec());
+		ensure_equals("only delims",
+					  LLStringUtil::getTokens("   \r\n   ", " \r\n"), StringVec());
+		ensure_equals("sequence of delims",
+					  LLStringUtil::getTokens(",,, one ,,,", ","), list_of("one"));
+		// Whitespace-only fields between delimiters are trimmed to empty tokens
+		// rather than dropped: a dubious side effect callers may now rely on.
+		ensure_equals("noncontiguous tokens",
+					  LLStringUtil::getTokens(", ,, , one ,,,", ","), list_of("")("")("one"));
+		ensure_equals("space-padded tokens",
+					  LLStringUtil::getTokens(",    one  ,  two  ,", ","), list_of("one")("two"));
+		ensure_equals("no delims", LLStringUtil::getTokens("one", ","), list_of("one"));
+	}
+
+	// Verify that getTokens() yields 'expect' three ways: via the overload
+	// with no escapes parameter, via the escapes overload with an empty
+	// string, and via the escapes overload with "!" (presumed absent from
+	// 'string'). Each check's message is 'desc' plus a distinguishing suffix.
+	void ensure_getTokens(const std::string& desc,
+						  const std::string& string,
+						  const std::string& drop_delims,
+						  const std::string& keep_delims,
+						  const std::string& quotes,
+						  const std::vector<std::string>& expect)
+	{
+		ensure_equals(desc + " - no esc",
+					  LLStringUtil::getTokens(string, drop_delims, keep_delims, quotes),
+					  expect);
+		ensure_equals(desc + " - empty esc",
+					  LLStringUtil::getTokens(string, drop_delims, keep_delims, quotes, ""),
+					  expect);
+		ensure_equals(desc + " - unused esc",
+					  LLStringUtil::getTokens(string, drop_delims, keep_delims, quotes, "!"),
+					  expect);
+	}
+
+	// Convenience overload for cases without quote characters: forwards an
+	// empty 'quotes' string to the six-argument ensure_getTokens().
+	void ensure_getTokens(const std::string& desc, const std::string& string,
+						  const std::string& drop_delims, const std::string& keep_delims,
+						  const std::vector<std::string>& expect)
+	{
+		ensure_getTokens(desc, string, drop_delims, keep_delims, std::string(), expect);
+	}
+
+	template<> template<>
+	void string_index_object_t::test<42>()
+	{
+		set_test_name("getTokens(\"delims\", etc.)");
+		// Signatures to test in this method:
+		// getTokens(string, drop_delims, keep_delims [, quotes [, escapes]])
+		// If you omit keep_delims, you get the older function (test above).
+
+		// same inputs as the getTokens(string, delims) tests above
+		ensure_getTokens("empty string", "", " ", "", StringVec());
+		ensure_getTokens("only delims",
+						 "   \r\n   ", " \r\n", "", StringVec());
+		ensure_getTokens("sequence of delims",
+						 ",,, one ,,,", ", ", "", list_of("one"));
+		// Unlike the previous method, space is itself a drop_delim here, so no empty tokens
+		ensure_getTokens("noncontiguous tokens",
+						 ", ,, , one ,,,", ", ", "", list_of("one"));
+		ensure_getTokens("space-padded tokens",
+						 ",    one  ,  two  ,", ", ", "",
+                         list_of("one")("two"));
+		ensure_getTokens("no delims", "one", ",", "", list_of("one"));
+
+		// drop_delims vs. keep_delims: each keep_delim becomes its own token
+		ensure_getTokens("arithmetic",
+						 " ab+def  / xx*  yy ", " ", "+-*/",
+						 list_of("ab")("+")("def")("/")("xx")("*")("yy"));
+
+		// quotes
+		ensure_getTokens("no quotes",
+						 "She said, \"Don't go.\"", " ", ",", "",
+						 list_of("She")("said")(",")("\"Don't")("go.\""));
+		ensure_getTokens("quotes",
+						 "She said, \"Don't go.\"", " ", ",", "\"",
+						 list_of("She")("said")(",")("Don't go."));
+		ensure_getTokens("quotes and delims",
+						 "run c:/'Documents and Settings'/someone", " ", "", "'",
+						 list_of("run")("c:/Documents and Settings/someone"));
+		ensure_getTokens("unmatched quote",
+						 "baby don't leave", " ", "", "'",
+						 list_of("baby")("don't")("leave"));
+		ensure_getTokens("adjacent quoted",
+						 "abc'def \"ghi'\"jkl' mno\"pqr", " ", "", "\"'",
+						 list_of("abcdef \"ghijkl' mnopqr"));
+
+		// escapes
+		// Don't use backslash as an escape for these tests -- you'll go nuts
+		// between the C++ string scanner and getTokens() escapes. Test with
+		// something else!
+		ensure_equals("escaped delims",
+					  LLStringUtil::getTokens("^ a - dog^-gone^ phrase", " ", "-", "", "^"),
+					  list_of(" a")("-")("dog-gone phrase"));
+		ensure_equals("escaped quotes",
+					  LLStringUtil::getTokens("say: 'this isn^'t w^orking'.", " ", "", "'", "^"),
+					  list_of("say:")("this isn't working."));
+		ensure_equals("escaped escape",
+					  LLStringUtil::getTokens("want x^^2", " ", "", "", "^"),
+					  list_of("want")("x^2"));
+		ensure_equals("escape at end",
+					  LLStringUtil::getTokens("it's^ up there^", " ", "", "'", "^"),
+					  list_of("it's up")("there^"));
+    }
 }