From 2f557cd7faceec36acace1eee4ee38904ff06130 Mon Sep 17 00:00:00 2001
From: Nat Goodspeed <nat@lindenlab.com>
Date: Fri, 2 Dec 2022 15:17:56 -0500
Subject: SL-18330: Fix new C++ <-> Python LLSD compatibility tests.

When sending multiple LEAP packets in the same file (for testing convenience),
use a length prefix instead of delimiting with '\n'. Now that we allow a
serialization format that includes an LLSD format header (e.g.
"<?llsd/binary?>"), '\n' is part of the packet content. But in fact, testing
binary LLSD means we can't pick any delimiter guaranteed not to appear in the
packet content.

Using a length prefix also lets us pass a specific max_bytes to the subject
C++ LLSD parser.

Make llleap_test.cpp use new freestanding Python llsd package when available.
Update Python-side LEAP protocol code to work directly with encoded bytes
stream, avoiding bytes<->str encoding and decoding, which breaks binary LLSD.

Make LLSDSerialize::deserialize() recognize LLSD format header case-
insensitively. Python emits and checks for "llsd/binary", while LLSDSerialize
emits and checks for "LLSD/Binary". Once any of the headers is recognized,
pass corrected max_bytes to the specific parser.

Make deserialize() more careful about the no-header case: preserve '\n' in
content. Introduce debugging code (disabled) because it's a little tricky to
recreate.

Revert LLLeap child process stdout parser from LLSDSerialize::deserialize() to
the specific LLSDNotationParser(), as at present: the generic parser fails one
of LLLeap's integration tests for reasons that remain mysterious.
---
 indra/llcommon/llleap.cpp                   |   9 +++
 indra/llcommon/llsdserialize.cpp            |  49 +++++++++---
 indra/llcommon/tests/llleap_test.cpp        |  32 ++++----
 indra/llcommon/tests/llsdserialize_test.cpp | 118 +++++++++++++++++++---------
 4 files changed, 147 insertions(+), 61 deletions(-)
diff --git a/indra/llcommon/llleap.cpp b/indra/llcommon/llleap.cpp
index f71eaf92c4..e8ccc4300d 100644
--- a/indra/llcommon/llleap.cpp
+++ b/indra/llcommon/llleap.cpp
@@ -317,8 +317,17 @@ public:
             LL_DEBUGS("LLLeap") << "needed " << mExpect << " bytes, got "
                                 << childout.size() << ", parsing LLSD" << LL_ENDL;
             LLSD data;
+#if 1
+            // specifically require notation LLSD from child
+            LLPointer<LLSDParser> parser(new LLSDNotationParser());
+            S32 parse_status(parser->parse(childout.get_istream(), data, mExpect));
+            if (parse_status == LLSDParser::PARSE_FAILURE)
+#else
+            // SL-18330: accept any valid LLSD serialization format from child
+            // Unfortunately this runs into trouble we have not yet debugged.
             bool parse_status(LLSDSerialize::deserialize(data, childout.get_istream(), mExpect));
             if (! parse_status)
+#endif
             {
                 bad_protocol("unparseable LLSD data");
             }
diff --git a/indra/llcommon/llsdserialize.cpp b/indra/llcommon/llsdserialize.cpp
index dc8f8f5737..97bf51eeaa 100644
--- a/indra/llcommon/llsdserialize.cpp
+++ b/indra/llcommon/llsdserialize.cpp
@@ -123,12 +123,26 @@ bool LLSDSerialize::deserialize(LLSD& sd, std::istream& str, S32 max_bytes)
 	bool fail_if_not_legacy = false;
 
 	/*
-	 * Get the first line before anything.
+	 * Get the first line before anything. Don't read more than max_bytes:
+	 * this get() overload reads no more than (count-1) bytes into the
+	 * specified buffer. In the usual case when max_bytes exceeds
+	 * sizeof(hdr_buf), get() will read no more than sizeof(hdr_buf)-2.
 	 */
-	// Remember str's original input position: if there's no header, we'll
-	// want to back up and retry.
-	str.get(hdr_buf, sizeof(hdr_buf), '\n');
+	str.get(hdr_buf, std::min(max_bytes+1, sizeof(hdr_buf)-1), '\n');
 	auto inbuf = str.gcount();
+	// https://en.cppreference.com/w/cpp/io/basic_istream/get
+	// When the get() above sees the specified delimiter '\n', it stops there
+	// without pulling it from the stream. If it turns out that the stream
+	// does NOT contain a header, and the content includes meaningful '\n',
+	// it's important to pull that into hdr_buf too.
+	if (inbuf < max_bytes && str.get(hdr_buf[inbuf]))
+	{
+		// got the delimiting '\n'
+		++inbuf;
+		// None of the following requires that hdr_buf contain a final '\0'
+		// byte. We could store one if needed, since even the incremented
+		// inbuf won't exceed sizeof(hdr_buf)-1, but there's no need.
+	}
 	std::string header{ hdr_buf, inbuf };
 	if (str.fail())
 	{
@@ -175,17 +189,17 @@ bool LLSDSerialize::deserialize(LLSD& sd, std::istream& str, S32 max_bytes)
 	/*
 	 * Create the parser as appropriate
 	 */
-	if (header == LLSD_BINARY_HEADER)
+	if (0 == LLStringUtil::compareInsensitive(header, LLSD_BINARY_HEADER))
 	{
-		return (parse_using<LLSDBinaryParser>(str, sd, max_bytes) > 0);
+		return (parse_using<LLSDBinaryParser>(str, sd, max_bytes-inbuf) > 0);
 	}
-	else if (header == LLSD_XML_HEADER)
+	else if (0 == LLStringUtil::compareInsensitive(header, LLSD_XML_HEADER))
 	{
-		return (parse_using<LLSDXMLParser>(str, sd, max_bytes) > 0);
+		return (parse_using<LLSDXMLParser>(str, sd, max_bytes-inbuf) > 0);
 	}
-	else if (header == LLSD_NOTATION_HEADER)
+	else if (0 == LLStringUtil::compareInsensitive(header, LLSD_NOTATION_HEADER))
 	{
-		return (parse_using<LLSDNotationParser>(str, sd, max_bytes) > 0);
+		return (parse_using<LLSDNotationParser>(str, sd, max_bytes-inbuf) > 0);
 	}
 	else // no header we recognize
 	{
@@ -207,7 +221,22 @@ bool LLSDSerialize::deserialize(LLSD& sd, std::istream& str, S32 max_bytes)
 		LLMemoryStreamBuf already(reinterpret_cast<const U8*>(hdr_buf), inbuf);
 		cat_streambuf prebuff(&already, str.rdbuf());
 		std::istream  prepend(&prebuff);
+#if 1
 		return (p->parse(prepend, sd, max_bytes) > 0);
+#else
+		// debugging the reconstituted 'prepend' stream
+		// allocate a buffer that we hope is big enough for the whole thing
+		std::vector<char> wholemsg((max_bytes == size_t(SIZE_UNLIMITED))? 1024 : max_bytes);
+		prepend.read(wholemsg.data(), std::min(max_bytes, wholemsg.size()));
+		LLMemoryStream replay(reinterpret_cast<const U8*>(wholemsg.data()), prepend.gcount());
+		auto success{ p->parse(replay, sd, prepend.gcount()) > 0 };
+		{
+			LL_DEBUGS() << (success? "parsed: $$" : "failed: '")
+						<< std::string(wholemsg.data(), llmin(prepend.gcount(), 100)) << "$$"
+						<< LL_ENDL;
+		}
+		return success;
+#endif
 	}
 }
 
diff --git a/indra/llcommon/tests/llleap_test.cpp b/indra/llcommon/tests/llleap_test.cpp
index 9754353ab0..6c799b7993 100644
--- a/indra/llcommon/tests/llleap_test.cpp
+++ b/indra/llcommon/tests/llleap_test.cpp
@@ -110,12 +110,12 @@ namespace tut
                    "import os\n"
                    "import sys\n"
                    "\n"
-                   // Don't forget that this Python script is written to some
-                   // temp directory somewhere! Its __file__ is useless in
-                   // finding indra/lib/python. Use our __FILE__, with
-                   // raw-string syntax to deal with Windows pathnames.
-                   "mydir = os.path.dirname(r'" << __FILE__ << "')\n"
-                   "from llbase import llsd\n"
+                   "try:\n"
+                   // new freestanding llsd package
+                   "    import llsd\n"
+                   "except ImportError:\n"
+                   // older llbase.llsd module
+                   "    from llbase import llsd\n"
                    "\n"
                    "class ProtocolError(Exception):\n"
                    "    def __init__(self, msg, data):\n"
@@ -126,26 +126,26 @@ namespace tut
                    "    pass\n"
                    "\n"
                    "def get():\n"
-                   "    hdr = ''\n"
-                   "    while ':' not in hdr and len(hdr) < 20:\n"
-                   "        hdr += sys.stdin.read(1)\n"
+                   "    hdr = []\n"
+                   "    while b':' not in hdr and len(hdr) < 20:\n"
+                   "        hdr.append(sys.stdin.buffer.read(1))\n"
                    "        if not hdr:\n"
                    "            sys.exit(0)\n"
-                   "    if not hdr.endswith(':'):\n"
+                   "    if not hdr[-1] == b':':\n"
                    "        raise ProtocolError('Expected len:data, got %r' % hdr, hdr)\n"
                    "    try:\n"
-                   "        length = int(hdr[:-1])\n"
+                   "        length = int(b''.join(hdr[:-1]))\n"
                    "    except ValueError:\n"
                    "        raise ProtocolError('Non-numeric len %r' % hdr[:-1], hdr[:-1])\n"
                    "    parts = []\n"
                    "    received = 0\n"
                    "    while received < length:\n"
-                   "        parts.append(sys.stdin.read(length - received))\n"
+                   "        parts.append(sys.stdin.buffer.read(length - received))\n"
                    "        received += len(parts[-1])\n"
-                   "    data = ''.join(parts)\n"
+                   "    data = b''.join(parts)\n"
                    "    assert len(data) == length\n"
                    "    try:\n"
-                   "        return llsd.parse(data.encode())\n"
+                   "        return llsd.parse(data)\n"
                    //   Seems the old indra.base.llsd module didn't properly
                    //   convert IndexError (from running off end of string) to
                    //   LLSDParseError.
@@ -185,11 +185,11 @@ namespace tut
                    "    return _reply\n"
                    "\n"
                    "def put(req):\n"
-                   "    sys.stdout.write(':'.join((str(len(req)), req)))\n"
+                   "    sys.stdout.buffer.write(b'%d:%b' % (len(req), req))\n"
                    "    sys.stdout.flush()\n"
                    "\n"
                    "def send(pump, data):\n"
-                   "    put(llsd.format_notation(dict(pump=pump, data=data)).decode())\n"
+                   "    put(llsd.format_notation(dict(pump=pump, data=data)))\n"
                    "\n"
                    "def request(pump, data):\n"
                    "    # we expect 'data' is a dict\n"
diff --git a/indra/llcommon/tests/llsdserialize_test.cpp b/indra/llcommon/tests/llsdserialize_test.cpp
index 601f2c580d..39b93cf335 100644
--- a/indra/llcommon/tests/llsdserialize_test.cpp
+++ b/indra/llcommon/tests/llsdserialize_test.cpp
@@ -51,10 +51,11 @@ typedef U32 uint32_t;
 #include "boost/phoenix/core/argument.hpp"
 using namespace boost::phoenix;
 
-#include "../llsd.h"
-#include "../llsdserialize.h"
+#include "llsd.h"
+#include "llsdserialize.h"
 #include "llsdutil.h"
-#include "../llformat.h"
+#include "llformat.h"
+#include "llmemorystream.h"
 
 #include "../test/lltut.h"
 #include "../test/namedtempfile.h"
@@ -1898,14 +1899,22 @@ namespace tut
     static void writeLLSDArray(const FormatterFunction& serialize,
                                std::ostream& out, const LLSD& array)
     {
-        BOOST_FOREACH(LLSD item, llsd::inArray(array))
+        for (const LLSD& item : llsd::inArray(array))
         {
-            serialize(item, out);
-            // It's important to separate with newlines because Python's llsd
-            // module doesn't support parsing from a file stream, only from a
-            // string, so we have to know how much of the file to read into a
-            // string.
-            out << '\n';
+            // It's important to delimit the entries in this file somehow
+            // because, although Python's llsd.parse() can accept a file
+            // stream, the XML parser expects EOF after a single outer element
+            // -- it doesn't just stop. So we must extract a sequence of bytes
+            // strings from the file. But since one of the serialization
+            // formats we want to test is binary, we can't pick any single
+            // byte value as a delimiter! Use a binary integer length prefix
+            // instead.
+            std::ostringstream buffer;
+            serialize(item, buffer);
+            auto buffstr{ buffer.str() };
+            int bufflen{ static_cast<int>(buffstr.length()) };
+            out.write(reinterpret_cast<const char*>(&bufflen), sizeof(bufflen));
+            out.write(buffstr.c_str(), buffstr.length());
         }
     }
 
@@ -1932,7 +1941,7 @@ namespace tut
             "    except StopIteration:\n"
             "        pass\n"
             "    else:\n"
-            "        assert False, 'Too many data items'\n";
+            "        raise AssertionError('Too many data items')\n";
 
         // Create an llsdXXXXXX file containing 'data' serialized to
         // notation.
@@ -1948,10 +1957,23 @@ namespace tut
         python("read C++ " + desc,
                placeholders::arg1 <<
                import_llsd <<
-               "def parse_each(iterable):\n"
-               "    for item in iterable:\n"
-               "        yield llsd.parse(item)\n" <<
-               pydata <<
+               "from functools import partial\n"
+               "import struct\n"
+               "lenformat = struct.Struct('i')\n"
+               "def parse_each(inf):\n"
+               "    for rawlen in iter(partial(inf.read, lenformat.size), b''):\n"
+               "        len = lenformat.unpack(rawlen)[0]\n"
+               // Since llsd.parse() has no max_bytes argument, instead of
+               // passing the input stream directly to parse(), read the item
+               // into a distinct bytes object and parse that.
+               "        data = inf.read(len)\n"
+               "        try:\n"
+               "            yield llsd.parse(data)\n"
+               "        except llsd.LLSDParseError as err:\n"
+               "            print(f'*** {err}')\n"
+               "            print(f'Bad content:\\n{data!r}')\n"
+               "            raise\n"
+               << pydata <<
                // Don't forget raw-string syntax for Windows pathnames.
                "verify(parse_each(open(r'" << file.getName() << "', 'rb')))\n");
     }
@@ -2008,6 +2030,26 @@ namespace tut
     }
 |*==========================================================================*/
 
+    // helper for test<8> - test<12>
+    bool itemFromStream(std::istream& istr, LLSD& item, const ParserFunction& parse)
+    {
+        // reset the output value for debugging clarity
+        item.clear();
+        // We use an int length prefix as a foolproof delimiter even for
+        // binary serialized streams.
+        int length{ 0 };
+        istr.read(reinterpret_cast<char*>(&length), sizeof(length));
+//      return parse(istr, item, length);
+        // Sadly, as of 2022-12-01 it seems we can't really trust our LLSD
+        // parsers to honor max_bytes: this test works better when we read
+        // each item into its own distinct LLMemoryStream, instead of passing
+        // the original istr with a max_bytes constraint.
+        std::vector<U8> buffer(length);
+        istr.read(reinterpret_cast<char*>(buffer.data()), length);
+        LLMemoryStream stream(buffer.data(), length);
+        return parse(stream, item, length);
+    }
+
     // helper for test<8> - test<12>
     void fromPythonUsing(const std::string& pyformatter,
                          const ParserFunction& parse=
@@ -2022,6 +2064,8 @@ namespace tut
         python("Python " + pyformatter,
                placeholders::arg1 <<
                import_llsd <<
+               "import struct\n"
+               "lenformat = struct.Struct('i')\n"
                "DATA = [\n"
                "    17,\n"
                "    3.14,\n"
@@ -2034,29 +2078,33 @@ namespace tut
                // N.B. Using 'print' implicitly adds newlines.
                "with open(r'" << file.getName() << "', 'wb') as f:\n"
                "    for item in DATA:\n"
-               "        print(llsd." << pyformatter << "(item), file=f)\n");
+               "        serialized = llsd." << pyformatter << "(item)\n"
+               "        f.write(lenformat.pack(len(serialized)))\n"
+               "        f.write(serialized)\n");
 
         std::ifstream inf(file.getName().c_str());
         LLSD item;
-        // Notice that we're not doing anything special to parse out the
-        // newlines: LLSDSerialize::fromNotation ignores them. While it would
-        // seem they're not strictly necessary, going in this direction, we
-        // want to ensure that notation-separated-by-newlines works in both
-        // directions -- since in practice, a given file might be read by
-        // either language.
-        ensure("Failed to read LLSD::Integer from Python",
-               parse(inf, item, LLSDSerialize::SIZE_UNLIMITED));
-        ensure_equals(item.asInteger(), 17);
-        ensure("Failed to read LLSD::Real from Python",
-               parse(inf, item, LLSDSerialize::SIZE_UNLIMITED));
-        ensure_approximately_equals("Bad LLSD::Real value from Python",
-                                    item.asReal(), 3.14, 7); // 7 bits ~= 0.01
-        ensure("Failed to read LLSD::String from Python",
-               parse(inf, item, LLSDSerialize::SIZE_UNLIMITED));
-        ensure_equals(item.asString(), 
-                      "This string\n"
-                      "has several\n"
-                      "lines.");
+        try
+        {
+            ensure("Failed to read LLSD::Integer from Python",
+                   itemFromStream(inf, item, parse));
+            ensure_equals(item.asInteger(), 17);
+            ensure("Failed to read LLSD::Real from Python",
+                   itemFromStream(inf, item, parse));
+            ensure_approximately_equals("Bad LLSD::Real value from Python",
+                                        item.asReal(), 3.14, 7); // 7 bits ~= 0.01
+            ensure("Failed to read LLSD::String from Python",
+                   itemFromStream(inf, item, parse));
+            ensure_equals(item.asString(), 
+                          "This string\n"
+                          "has several\n"
+                          "lines.");
+        }
+        catch (const tut::failure& err)
+        {
+            std::cout << "for " << err.what() << ", item = " << item << std::endl;
+            throw;
+        }
     }
 
     template<> template<>
-- 
cgit v1.2.3