Add LLStringUtil::getTokens() overload handling quoted substrings.

We didn't have any tokenizer suitable for scanning something like a bash command line. We do have a couple hacks, e.g. LLExternalEditor::tokenize() and LLCommandLineParser::parseCommandLineString(). Both try to work around boost::tokenizer limitations; but existing boost::tokenizer support just doesn't address this case. Neither of the above is available as a general scanner anyway, and parseCommandLineString() fails outright when passed "". New getTokens() also distinguishes between "drop delimiters" (e.g. space, return, newline) to be discarded from the token stream, versus "keep delimiters" (e.g. "+-*/") to be returned as tokens in their own right. There's an overload that honors escapes and a more efficient one that doesn't; each has a convenience overload that returns the scanned string vector rather than requiring a separate declaration. Tweak and comment older getTokens() implementation. Add unit tests for both old and new getTokens() implementations. Break out StringVec and std::ostream << StringVec from indra/llcommon/tests/listener.h to StringVec.h: that's coming in handy for a number of different TUT test sources.
author: Nat Goodspeed <nat@lindenlab.com> 2012-02-24 15:06:44 -0500
committer: Nat Goodspeed <nat@lindenlab.com> 2012-02-24 15:06:44 -0500
commit: 025329b6a2ecb8ddee3022d6a73344f862f0d326 (patch)
tree: d383364dafa59827a3336b929600416e2c839bf0 /indra/llcommon/llstring.h
parent: 14ddc6474a0ae83db8d034b00138289fb15e41b7 (diff)
1 files changed, 353 insertions, 2 deletions
diff --git a/indra/llcommon/llstring.h b/indra/llcommon/llstring.h
index 7b24b5e279..e4ae54cec5 100644
--- a/indra/llcommon/llstring.h
+++ b/indra/llcommon/llstring.h
@@ -40,6 +40,7 @@
 #endif
 
 #include <string.h>
+#include <boost/scoped_ptr.hpp>
 
 #if LL_SOLARIS
 // stricmp and strnicmp do not exist on Solaris:
@@ -247,7 +248,38 @@ public:
 	static const string_type null;
 	
 	typedef std::map<LLFormatMapString, LLFormatMapString> format_map_t;
-	LL_COMMON_API static void getTokens(const string_type& instr, std::vector<string_type >& tokens, const string_type& delims);
+	/// considers any sequence of delims as a single field separator
+	LL_COMMON_API static void getTokens(const string_type& instr,
+										std::vector<string_type >& tokens,
+										const string_type& delims);
+	/// like simple scan overload, but returns scanned vector
+	LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr,
+															const string_type& delims);
+	/// add support for keep_delims and quotes (either could be empty string)
+	LL_COMMON_API static void getTokens(const string_type& instr,
+										std::vector<string_type>& tokens,
+										const string_type& drop_delims,
+										const string_type& keep_delims,
+										const string_type& quotes=string_type());
+	/// like keep_delims-and-quotes overload, but returns scanned vector
+	LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr,
+															const string_type& drop_delims,
+															const string_type& keep_delims,
+															const string_type& quotes=string_type());
+	/// add support for escapes (could be empty string)
+	LL_COMMON_API static void getTokens(const string_type& instr,
+										std::vector<string_type>& tokens,
+										const string_type& drop_delims,
+										const string_type& keep_delims,
+										const string_type& quotes,
+										const string_type& escapes);
+	/// like escapes overload, but returns scanned vector
+	LL_COMMON_API static std::vector<string_type> getTokens(const string_type& instr,
+															const string_type& drop_delims,
+															const string_type& keep_delims,
+															const string_type& quotes,
+															const string_type& escapes);
+
 	LL_COMMON_API static void formatNumber(string_type& numStr, string_type decimals);
 	LL_COMMON_API static bool formatDatetime(string_type& replacement, string_type token, string_type param, S32 secFromEpoch);
 	LL_COMMON_API static S32 format(string_type& s, const format_map_t& substitutions);
@@ -262,6 +294,11 @@ public:
 		return !string.empty() && (0 <= i) && (i <= string.size());
 	}
 
+	static bool contains(const string_type& string, T c, size_type i=0)
+	{
+		return string.find(c, i) != string_type::npos;
+	}
+
 	static void	trimHead(string_type& string);
 	static void	trimTail(string_type& string);
 	static void	trim(string_type& string)	{ trimHead(string); trimTail(string); }
@@ -650,10 +687,324 @@ namespace LLStringFn
 ////////////////////////////////////////////////////////////
 // NOTE: LLStringUtil::format, getTokens, and support functions moved to llstring.cpp.
 // There is no LLWStringUtil::format implementation currently.
-// Calling thse for anything other than LLStringUtil will produce link errors.
+// Calling these for anything other than LLStringUtil will produce link errors.
 
 ////////////////////////////////////////////////////////////
 
+// static
+template <class T>
+std::vector<typename LLStringUtilBase<T>::string_type>
+LLStringUtilBase<T>::getTokens(const string_type& instr, const string_type& delims)
+{
+	std::vector<string_type> tokens;
+	getTokens(instr, tokens, delims);
+	return tokens;
+}
+
+// static
+template <class T>
+std::vector<typename LLStringUtilBase<T>::string_type>
+LLStringUtilBase<T>::getTokens(const string_type& instr,
+							   const string_type& drop_delims,
+							   const string_type& keep_delims,
+							   const string_type& quotes)
+{
+	std::vector<string_type> tokens;
+	getTokens(instr, tokens, drop_delims, keep_delims, quotes);
+	return tokens;
+}
+
+// static
+template <class T>
+std::vector<typename LLStringUtilBase<T>::string_type>
+LLStringUtilBase<T>::getTokens(const string_type& instr,
+							   const string_type& drop_delims,
+							   const string_type& keep_delims,
+							   const string_type& quotes,
+							   const string_type& escapes)
+{
+	std::vector<string_type> tokens;
+	getTokens(instr, tokens, drop_delims, keep_delims, quotes, escapes);
+	return tokens;
+}
+
+namespace LLStringUtilBaseImpl
+{
+
+/**
+ * Input string scanner helper for getTokens(), or really any other
+ * character-parsing routine that may have to deal with escape characters.
+ * This implementation defines the concept (also an interface, should you
+ * choose to implement the concept by subclassing) and provides trivial
+ * implementations for a string @em without escape processing.
+ */
+template <class T>
+struct InString
+{
+	typedef std::basic_string<T> string_type;
+	typedef typename string_type::const_iterator const_iterator;
+
+	InString(const_iterator b, const_iterator e):
+		iter(b),
+		end(e)
+	{}
+
+	bool done() const { return iter == end; }
+	/// Is the current character (*iter) escaped? This implementation can
+	/// answer trivially because it doesn't support escapes.
+	virtual bool escaped() const { return false; }
+	/// Obtain the current character and advance @c iter.
+	virtual T next() { return *iter++; }
+	/// Does the current character match specified character?
+	virtual bool is(T ch) const { return (! done()) && *iter == ch; }
+	/// Is the current character any one of the specified characters?
+	virtual bool oneof(const string_type& delims) const
+	{
+		return (! done()) && LLStringUtilBase<T>::contains(delims, *iter);
+	}
+
+	/**
+	 * Scan forward from @from until either @a delim or end. This is primarily
+	 * useful for processing quoted substrings.
+	 *
+	 * If we do see @a delim, append everything from @from until (excluding)
+	 * @a delim to @a into, advance @c iter to skip @a delim, and return @c
+	 * true.
+	 *
+	 * If we do not see @a delim, do not alter @a into or @c iter and return
+	 * @c false. Do not pass GO, do not collect $200.
+	 *
+	 * @note The @c false case described above implements normal getTokens()
+	 * treatment of an unmatched open quote: treat the quote character as if
+	 * escaped, that is, simply collect it as part of the current token. Other
+	 * plausible behaviors directly affect the way getTokens() deals with an
+	 * unmatched quote: e.g. throwing an exception to treat it as an error, or
+	 * assuming a close quote beyond end of string (in which case return @c
+	 * true).
+	 */
+	virtual bool collect_until(string_type& into, const_iterator from, T delim)
+	{
+		const_iterator found = std::find(from, end, delim);
+		// If we didn't find delim, change nothing, just tell caller.
+		if (found == end)
+			return false;
+		// Found delim! Append everything between from and found.
+		into.append(from, found);
+		// advance past delim in input
+		iter = found + 1;
+		return true;
+	}
+
+	const_iterator iter, end;
+};
+
+/// InString subclass that handles escape characters
+template <class T>
+class InEscString: public InString<T>
+{
+public:
+	typedef InString<T> super;
+	typedef typename super::string_type string_type;
+	typedef typename super::const_iterator const_iterator;
+	using super::done;
+	using super::iter;
+	using super::end;
+
+	InEscString(const_iterator b, const_iterator e, const string_type& escapes_):
+		super(b, e),
+		escapes(escapes_)
+	{
+		// Even though we've already initialized 'iter' via our base-class
+		// constructor, set it again to check for initial escape char.
+		setiter(b);
+	}
+
+	/// This implementation uses the answer cached by setiter().
+	virtual bool escaped() const { return isesc; }
+	virtual T next()
+	{
+		// If we're looking at the escape character of an escape sequence,
+		// skip that character. This is the one time we can modify 'iter'
+		// without using setiter: for this one case we DO NOT CARE if the
+		// escaped character is itself an escape.
+		if (isesc)
+			++iter;
+		// If we were looking at an escape character, this is the escaped
+		// character; otherwise it's just the next character.
+		T result(*iter);
+		// Advance iter, checking for escape sequence.
+		setiter(iter + 1);
+		return result;
+	}
+
+	virtual bool is(T ch) const
+	{
+		// Like base-class is(), except that an escaped character matches
+		// nothing.
+		return (! done()) && (! isesc) && *iter == ch;
+	}
+
+	virtual bool oneof(const string_type& delims) const
+	{
+		// Like base-class oneof(), except that an escaped character matches
+		// nothing.
+		return (! done()) && (! isesc) && LLStringUtilBase<T>::contains(delims, *iter);
+	}
+
+	virtual bool collect_until(string_type& into, const_iterator from, T delim)
+	{
+		// Deal with escapes in the characters we collect; that is, an escaped
+		// character must become just that character without the preceding
+		// escape. Collect characters in a separate string rather than
+		// directly appending to 'into' in case we do not find delim, in which
+		// case we're supposed to leave 'into' unmodified.
+		string_type collected;
+		// For scanning purposes, we're going to work directly with 'iter'.
+		// Save its current value in case we fail to see delim.
+		const_iterator save_iter(iter);
+		// Okay, set 'iter', checking for escape.
+		setiter(from);
+		while (! done())
+		{
+			// If we see an unescaped delim, stop and report success.
+			if ((! isesc) && *iter == delim)
+			{
+				// Append collected chars to 'into'.
+				into.append(collected);
+				// Don't forget to advance 'iter' past delim.
+				setiter(iter + 1);
+				return true;
+			}
+			// We're not at end, and either we're not looking at delim or it's
+			// escaped. Collect this character and keep going.
+			collected.push_back(next());
+		}
+		// Here we hit 'end' without ever seeing delim. Restore iter and tell
+		// caller.
+		setiter(save_iter);
+		return false;
+	}
+
+private:
+	void setiter(const_iterator i)
+	{
+		iter = i;
+
+		// Every time we change 'iter', set 'isesc' to be able to repetitively
+		// answer escaped() without having to rescan 'escapes'. isesc caches
+		// contains(escapes, *iter).
+
+		// We're looking at an escaped char if we're not already at end (that
+		// is, *iter is even meaningful); if *iter is in fact one of the
+		// specified escape characters; and if there's one more character
+		// following it. That is, if an escape character is the very last
+		// character of the input string, it loses its special meaning.
+		isesc = (! done()) &&
+				LLStringUtilBase<T>::contains(escapes, *iter) &&
+				(iter+1) != end;
+	}
+
+	const string_type escapes;
+	bool isesc;
+};
+
+/// getTokens() implementation based on InString concept
+template <typename INSTRING, typename string_type>
+void getTokens(INSTRING& instr, std::vector<string_type>& tokens,
+			   const string_type& drop_delims, const string_type& keep_delims,
+			   const string_type& quotes)
+{
+	// There are times when we want to match either drop_delims or
+	// keep_delims. Concatenate them up front to speed things up.
+	string_type all_delims(drop_delims + keep_delims);
+	// no tokens yet
+	tokens.clear();
+
+	// try for another token
+	while (! instr.done())
+	{
+		// scan past any drop_delims
+		while (instr.oneof(drop_delims))
+		{
+			// skip this drop_delim
+			instr.next();
+			// but if that was the end of the string, done
+			if (instr.done())
+				return;
+		}
+		// found the start of another token: make a slot for it
+		tokens.push_back(string_type());
+		if (instr.oneof(keep_delims))
+		{
+			// *iter is a keep_delim, a token of exactly 1 character. Append
+			// that character to the new token and proceed.
+			tokens.back().push_back(instr.next());
+			continue;
+		}
+		// Here we have a non-delimiter token, which might consist of a mix of
+		// quoted and unquoted parts. Use bash rules for quoting: you can
+		// embed a quoted substring in the midst of an unquoted token (e.g.
+		// ~/"sub dir"/myfile.txt); you can ram two quoted substrings together
+		// to make a single token (e.g. 'He said, "'"Don't."'"'). We diverge
+		// from bash in that bash considers an unmatched quote an error. Our
+		// param signature doesn't allow for errors, so just pretend it's not
+		// a quote and embed it.
+		// At this level, keep scanning until we hit the next delimiter of
+		// either type (drop_delims or keep_delims).
+		while (! instr.oneof(all_delims))
+		{
+			// If we're looking at an open quote, search forward for
+			// a close quote, collecting characters along the way.
+			if (instr.oneof(quotes) &&
+				instr.collect_until(tokens.back(), instr.iter+1, *instr.iter))
+			{
+				// collect_until is cleverly designed to do exactly what we
+				// need here. No further action needed if it returns true.
+			}
+			else
+			{
+				// Either *iter isn't a quote, or there's no matching close
+				// quote: in other words, just an ordinary char. Append it to
+				// current token.
+				tokens.back().push_back(instr.next());
+			}
+			// having scanned that segment of this token, if we've reached the
+			// end of the string, we're done
+			if (instr.done())
+				return;
+		}
+	}
+}
+
+} // namespace LLStringUtilBaseImpl
+
+// static
+template <class T>
+void LLStringUtilBase<T>::getTokens(const string_type& string, std::vector<string_type>& tokens,
+									const string_type& drop_delims, const string_type& keep_delims,
+									const string_type& quotes)
+{
+	// Because this overload doesn't support escapes, use simple InString to
+	// manage input range.
+	LLStringUtilBaseImpl::InString<T> instring(string.begin(), string.end());
+	LLStringUtilBaseImpl::getTokens(instring, tokens, drop_delims, keep_delims, quotes);
+}
+
+// static
+template <class T>
+void LLStringUtilBase<T>::getTokens(const string_type& string, std::vector<string_type>& tokens,
+									const string_type& drop_delims, const string_type& keep_delims,
+									const string_type& quotes, const string_type& escapes)
+{
+	// This overload must deal with escapes. Delegate that to InEscString
+	// (unless there ARE no escapes).
+	boost::scoped_ptr< LLStringUtilBaseImpl::InString<T> > instrp;
+	if (escapes.empty())
+		instrp.reset(new LLStringUtilBaseImpl::InString<T>(string.begin(), string.end()));
+	else
+		instrp.reset(new LLStringUtilBaseImpl::InEscString<T>(string.begin(), string.end(), escapes));
+	LLStringUtilBaseImpl::getTokens(*instrp, tokens, drop_delims, keep_delims, quotes);
+}
 
 // static
 template<class T>
author	Nat Goodspeed <nat@lindenlab.com>	2012-02-24 15:06:44 -0500
committer	Nat Goodspeed <nat@lindenlab.com>	2012-02-24 15:06:44 -0500
commit	025329b6a2ecb8ddee3022d6a73344f862f0d326 (patch)
tree	d383364dafa59827a3336b929600416e2c839bf0 /indra/llcommon/llstring.h
parent	14ddc6474a0ae83db8d034b00138289fb15e41b7 (diff)