/**
 * @file llsdserialize_xml.cpp
 * @brief XML parsers and formatters for LLSD
 *
 * $LicenseInfo:firstyear=2006&license=viewerlgpl$
 * Second Life Viewer Source Code
 * Copyright (C) 2010, Linden Research, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * version 2.1 of the License only.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
 * $/LicenseInfo$
 */

#include "linden_common.h"
#include "llsdserialize_xml.h"

#include <iostream>
#include <deque>

#include "apr_base64.h"
#include <boost/regex.hpp>

extern "C"
{
#ifdef LL_USESYSTEMLIBS
# include <expat.h>
#else
# include "expat/expat.h"
#endif
}

/**
 * LLSDXMLFormatter
 */
// Constructs an XML formatter; boolAlpha, realFormat and options are passed
// straight through to the LLSDFormatter base class.
LLSDXMLFormatter::LLSDXMLFormatter(bool boolAlpha, const std::string& realFormat,
                                   EFormatterOptions options):
    LLSDFormatter(boolAlpha, realFormat, options)
{
}

// virtual
// Nothing to release here: this class does no cleanup of its own.
LLSDXMLFormatter::~LLSDXMLFormatter()
{
}

// virtual
/**
 * Write @a data to @a ostr wrapped in an <llsd> ... </llsd> element.
 *
 * The stream precision is raised to 25 digits while the document is being
 * written and put back to its previous value before returning.
 *
 * @param data    value to serialize
 * @param ostr    destination stream
 * @param options formatting flags; OPTIONS_PRETTY adds a line break after
 *                the opening tag (the closing tag always ends with one)
 * @return the count reported by format_impl() for the wrapped value
 */
S32 LLSDXMLFormatter::format(const LLSD& data, std::ostream& ostr,
                             EFormatterOptions options) const
{
    const std::streamsize saved_precision = ostr.precision(25);

    // Only pretty output gets a line break after the opening tag.
    std::string line_end;
    if (options & LLSDFormatter::OPTIONS_PRETTY)
    {
        line_end = "\n";
    }

    ostr << "<llsd>" << line_end;
    const S32 formatted = format_impl(data, ostr, options, 1);
    ostr << "</llsd>\n";

    ostr.precision(saved_precision);
    return formatted;
}

/**
 * Recursively write one LLSD value (and everything nested in it) as XML.
 *
 * @param data    value to serialize
 * @param ostr    destination stream
 * @param options formatting flags; with OPTIONS_PRETTY every element is
 *                indented by four spaces per @a level and followed by '\n'
 * @param level   current nesting depth, used only for indentation
 * @return number of values written: 1 for @a data itself plus the counts
 *         returned for every nested map/array member
 */
S32 LLSDXMLFormatter::format_impl(const LLSD& data, std::ostream& ostr,
                                  EFormatterOptions options, U32 level) const
{
    S32 format_count = 1;
    std::string pre;
    std::string post;

    if (options & LLSDFormatter::OPTIONS_PRETTY)
    {
        // Four spaces of indentation per nesting level.
        pre.assign(static_cast<std::string::size_type>(level) * 4, ' ');
        post = "\n";
    }

    switch(data.type())
    {
    case LLSD::TypeMap:
        if(0 == data.size())
        {
            ostr << pre << "<map />" << post;
        }
        else
        {
            ostr << pre << "<map>" << post;
            LLSD::map_const_iterator iter = data.beginMap();
            LLSD::map_const_iterator end = data.endMap();
            for(; iter != end; ++iter)
            {
                ostr << pre << "<key>" << escapeString((*iter).first) << "</key>" << post;
                format_count += format_impl((*iter).second, ostr, options, level + 1);
            }
            ostr << pre <<  "</map>" << post;
        }
        break;

    case LLSD::TypeArray:
        if(0 == data.size())
        {
            ostr << pre << "<array />" << post;
        }
        else
        {
            ostr << pre << "<array>" << post;
            LLSD::array_const_iterator iter = data.beginArray();
            LLSD::array_const_iterator end = data.endArray();
            for(; iter != end; ++iter)
            {
                format_count += format_impl(*iter, ostr, options, level + 1);
            }
            ostr << pre << "</array>" << post;
        }
        break;

    case LLSD::TypeUndefined:
        ostr << pre << "<undef />" << post;
        break;

    case LLSD::TypeBoolean:
        ostr << pre << "<boolean>";
        // Textual booleans are used when either the formatter or the stream
        // itself asks for them; otherwise 1/0 is written.
        if(mBoolAlpha ||
           (ostr.flags() & std::ios::boolalpha)
           )
        {
            ostr << (data.asBoolean() ? "true" : "false");
        }
        else
        {
            ostr << (data.asBoolean() ? 1 : 0);
        }
        ostr << "</boolean>" << post;
        break;

    case LLSD::TypeInteger:
        ostr << pre << "<integer>" << data.asInteger() << "</integer>" << post;
        break;

    case LLSD::TypeReal:
        ostr << pre << "<real>";
        if(mRealFormat.empty())
        {
            ostr << data.asReal();
        }
        else
        {
            formatReal(data.asReal(), ostr);
        }
        ostr << "</real>" << post;
        break;

    case LLSD::TypeUUID:
        if(data.asUUID().isNull()) ostr << pre << "<uuid />" << post;
        else ostr << pre << "<uuid>" << data.asUUID() << "</uuid>" << post;
        break;

    case LLSD::TypeString:
        if(data.asStringRef().empty()) ostr << pre << "<string />" << post;
        else ostr << pre << "<string>" << escapeString(data.asStringRef()) <<"</string>" << post;
        break;

    case LLSD::TypeDate:
        ostr << pre << "<date>" << data.asDate() << "</date>" << post;
        break;

    case LLSD::TypeURI:
        ostr << pre << "<uri>" << escapeString(data.asString()) << "</uri>" << post;
        break;

    case LLSD::TypeBinary:
    {
        const LLSD::Binary& buffer = data.asBinary();
        if(buffer.empty())
        {
            ostr << pre << "<binary />" << post;
        }
        else
        {
            // *TODO: convert to use LLBase64
            ostr << pre << "<binary encoding=\"base64\">";

            // The scratch buffer is owned by a std::string so it is released
            // even if writing to the stream throws.
            const int b64_capacity = apr_base64_encode_len(narrow<size_t>(buffer.size()));
            std::string b64_buffer(static_cast<std::string::size_type>(b64_capacity), '\0');
            const int b64_written = apr_base64_encode_binary(
                &b64_buffer[0],
                &buffer[0],
                narrow<size_t>(buffer.size()));

            // The count returned by the encoder is one more than the number
            // of characters we want on the wire, hence the -1.
            ostr.write(&b64_buffer[0], b64_written - 1);
            ostr << "</binary>" << post;
        }
        break;
    }
    default:
        // *NOTE: This should never happen.
        ostr << pre << "<undef />" << post;
        break;
    }
    return format_count;
}

// static
/**
 * Return a copy of @a in with the five XML special characters replaced by
 * their predefined entities:
 *   <  ->  &lt;     >  ->  &gt;     &  ->  &amp;
 *   '  ->  &apos;   "  ->  &quot;
 * Every other character is copied through unchanged.
 */
std::string LLSDXMLFormatter::escapeString(const std::string& in)
{
    std::string out;
    // The result is never shorter than the input, so this avoids most
    // reallocations for the common no-escape case.
    out.reserve(in.size());

    for (std::string::const_iterator it = in.begin(), end = in.end(); it != end; ++it)
    {
        switch (*it)
        {
        case '<':
            out += "&lt;";
            break;
        case '>':
            out += "&gt;";
            break;
        case '&':
            out += "&amp;";
            break;
        case '\'':
            out += "&apos;";
            break;
        case '"':
            out += "&quot;";
            break;
        default:
            out += *it;
            break;
        }
    }
    return out;
}



/**
 * Private implementation of LLSDXMLParser.
 *
 * Owns an expat XML_Parser and builds an LLSD value from the element and
 * character-data callbacks expat delivers while input is fed to it.
 */
class LLSDXMLParser::Impl
{
public:
    Impl(bool emit_errors);
    ~Impl();

    // Read the stream and parse it; both return mParseCount on success or
    // LLSDParser::PARSE_FAILURE on error.
    S32 parse(std::istream& input, LLSD& data);
    S32 parseLines(std::istream& input, LLSD& data);

    // Feed a raw chunk of text to the parser without finalizing the document.
    void parsePart(const char *buf, llssize len);

    // Return all parse state and the expat parser to their initial condition.
    void reset();

private:
    // Per-instance handlers invoked through the static trampolines below.
    void startElementHandler(const XML_Char* name, const XML_Char** attributes);
    void endElementHandler(const XML_Char* name);
    void characterDataHandler(const XML_Char* data, int length);

    // C-compatible callbacks registered with expat; userData is the Impl*.
    static void sStartElementHandler(
        void* userData, const XML_Char* name, const XML_Char** attributes);
    static void sEndElementHandler(
        void* userData, const XML_Char* name);
    static void sCharacterDataHandler(
        void* userData, const XML_Char* data, int length);

    // Start ignoring elements until the current one has been closed.
    void startSkipping();

    // Tag names recognised by readElement(); anything else is ELEMENT_UNKNOWN.
    enum Element {
        ELEMENT_LLSD,
        ELEMENT_UNDEF,
        ELEMENT_BOOL,
        ELEMENT_INTEGER,
        ELEMENT_REAL,
        ELEMENT_STRING,
        ELEMENT_UUID,
        ELEMENT_DATE,
        ELEMENT_URI,
        ELEMENT_BINARY,
        ELEMENT_MAP,
        ELEMENT_ARRAY,
        ELEMENT_KEY,
        ELEMENT_UNKNOWN
    };
    static Element readElement(const XML_Char* name);

    // Look up an attribute value in expat's NULL-terminated name/value array.
    static const XML_Char* findAttribute(const XML_Char* name, const XML_Char** pairs);

    bool mEmitErrors;               // when false, parse()/parseLines() do not log failures

    XML_Parser  mParser;            // expat parser, created in the ctor and freed in the dtor

    LLSD mResult;                   // root of the value being built
    S32 mParseCount;                // number of value elements accepted so far

    bool mInLLSDElement;            // true while inside the <llsd> element
    bool mGracefullStop;            // true once the closing </llsd> tag has been seen

    // Stack of the values currently being filled in; the back is the innermost.
    typedef std::deque<LLSD*> LLSDRefStack;
    LLSDRefStack mStack;

    int mDepth;                     // incremented on every start tag, decremented on every end tag
    bool mSkipping;                 // true while elements are being ignored
    int mSkipThrough;               // depth recorded by startSkipping(); skipping ends once mDepth drops below it

    std::string mCurrentKey;        // Content of the most recent <key> element
    std::string mCurrentContent;    // String data between <tag> and </tag>
};


/**
 * Create the expat parser and put all parse state into its initial condition.
 *
 * Every scalar member is given a defined value in the initializer list so no
 * member is ever read uninitialized; reset() then installs the expat
 * handlers.
 *
 * @param emit_errors whether parse failures should be logged
 */
LLSDXMLParser::Impl::Impl(bool emit_errors)
    : mEmitErrors(emit_errors),
      mParser(XML_ParserCreate(NULL)),
      mParseCount(0),
      mInLLSDElement(false),
      mGracefullStop(false),
      mDepth(0),
      mSkipping(false),
      mSkipThrough(0)
{
    reset();
}

// Release the expat parser created in the constructor.
LLSDXMLParser::Impl::~Impl()
{
    XML_ParserFree(mParser);
}

// True when c is a line terminator: line feed or carriage return.
inline bool is_eol(char c)
{
    switch (c)
    {
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}

// Consume any run of '\n' / '\r' characters at the current read position,
// leaving the stream positioned at the first character that is neither.
void clear_eol(std::istream& input)
{
    for (char next = input.peek();
         input.good() && is_eol(next);
         next = input.peek())
    {
        // Extract the terminator we just peeked at.
        input.get(next);
    }
}

/**
 * Copy characters from @a input into @a buf up to and including the first
 * line terminator ('\n' or '\r'), stopping early when @a bufsize characters
 * have been stored or the stream runs out.
 *
 * The end-of-file indicator returned by get() is never stored: only real
 * characters are written to @a buf and counted.
 *
 * @param input   stream to read from
 * @param buf     destination, at least @a bufsize bytes; NOT NUL-terminated
 * @param bufsize capacity of @a buf
 * @return number of characters written to @a buf (0 if nothing could be read)
 */
static unsigned get_till_eol(std::istream& input, char *buf, unsigned bufsize)
{
    typedef std::istream::traits_type traits;

    unsigned count = 0;
    while (count < bufsize && input.good())
    {
        // Keep the result as int_type so EOF can be told apart from a byte.
        const std::istream::int_type next = input.get();
        if (traits::eq_int_type(next, traits::eof()))
        {
            break;
        }

        const char c = traits::to_char_type(next);
        buf[count++] = c;
        // Same characters that is_eol() treats as line terminators.
        if (c == '\n' || c == '\r')
        {
            break;
        }
    }
    return count;
}

/**
 * Parse an LLSD XML document from @a input, feeding expat one
 * end-of-line-delimited chunk (at most BUFFER_SIZE bytes) at a time.
 *
 * @param input stream to read from
 * @param data  receives the parsed value, or an undefined LLSD on failure
 * @return mParseCount on success, LLSDParser::PARSE_FAILURE on error
 */
S32 LLSDXMLParser::Impl::parse(std::istream& input, LLSD& data)
{
    static const int BUFFER_SIZE = 1024;

    XML_Status status;
    void* chunk = NULL;
    int chunk_len = 0;

    while (input.good() && !input.eof())
    {
        chunk = XML_GetBuffer(mParser, BUFFER_SIZE);
        if (NULL == chunk)
        {
            // If the previous chunk ended exactly at the end of the llsd while
            // the stream still has data, expat hands back a null buffer;
            // mGracefullStop is consulted below.
            break;
        }

        chunk_len = get_till_eol(input, static_cast<char*>(chunk), BUFFER_SIZE);
        if (0 == chunk_len)
        {
            break;
        }

        status = XML_ParseBuffer(mParser, chunk_len, false);
        if (XML_STATUS_ERROR == status)
        {
            break;
        }
    }

    // *FIX: This is fragile - if the stream was empty or not good there is
    // no buffer to parse, so both this call to XML_ParseBuffer and the
    // buffer manipulation in the error path are questionable; it is also
    // unclear whether expat's buffer semantics are preserved.
    status = XML_ParseBuffer(mParser, 0, true);

    const bool failed = (XML_STATUS_ERROR == status) && !mGracefullStop;
    if (!failed)
    {
        clear_eol(input);
        data = mResult;
        return mParseCount;
    }

    if (chunk)
    {
        // Terminate the last chunk so it can be logged as a C string.
        char* text = static_cast<char*>(chunk);
        text[chunk_len ? chunk_len - 1 : 0] = '\0';
        if (mEmitErrors)
        {
            LL_INFOS() << "LLSDXMLParser::Impl::parse: XML_STATUS_ERROR parsing:" << text << LL_ENDL;
        }
    }
    else if (mEmitErrors)
    {
        LL_INFOS() << "LLSDXMLParser::Impl::parse: XML_STATUS_ERROR, null buffer" << LL_ENDL;
    }

    data = LLSD();
    return LLSDParser::PARSE_FAILURE;
}


/**
 * Parse an LLSD XML document from @a input one line at a time, stopping as
 * soon as the closing </llsd> tag has been seen (mGracefullStop).
 *
 * @param input stream to read from
 * @param data  reset to an undefined LLSD on entry; receives the parsed
 *              value on success
 * @return mParseCount on success, LLSDParser::PARSE_FAILURE on error
 */
S32 LLSDXMLParser::Impl::parseLines(std::istream& input, LLSD& data)
{
    XML_Status status = XML_STATUS_OK;

    data = LLSD();

    static const int BUFFER_SIZE = 1024;

    //static char last_buffer[ BUFFER_SIZE ];
    //std::streamsize last_num_read;

    // Must get rid of any leading \n, otherwise the stream gets into an error/eof state
    clear_eol(input);

    while( !mGracefullStop
        && input.good()
        && !input.eof())
    {
        void* buffer = XML_GetBuffer(mParser, BUFFER_SIZE);
        /*
         * If we happened to end our last buffer right at the end of the llsd, but the
         * stream is still going we will get a null buffer here.  Check for mGracefullStop.
         * -- I don't think this is actually true - zero 2008-05-09
         */
        if (!buffer)
        {
            break;
        }

        // Get one line
        input.getline((char*)buffer, BUFFER_SIZE);
        // Number of characters getline() extracted from the stream
        std::streamsize num_read = input.gcount();

        //memcpy( last_buffer, buffer, num_read );
        //last_num_read = num_read;

        if ( num_read > 0 )
        {
            if (!input.good() )
            {   // Clear state that's set when we run out of buffer
                input.clear();
            }

            // Re-insert with the \n that was absorbed by getline()
            char * text = (char *) buffer;
            // A 0 in the last counted position is taken to be getline()'s
            // terminator standing where the newline was.
            if ( text[num_read - 1] == 0)
            {
                text[num_read - 1] = '\n';
            }
        }

        status = XML_ParseBuffer(mParser, (int)num_read, false);
        if (status == XML_STATUS_ERROR)
        {
            break;
        }
    }

    // Only finalize the document if nothing has failed and the parser was
    // not already stopped by the closing </llsd>.
    if (status != XML_STATUS_ERROR
        && !mGracefullStop)
    {   // Parse last bit
        status = XML_ParseBuffer(mParser, 0, true);
    }

    // An error reported after a graceful stop is not treated as a failure.
    if (status == XML_STATUS_ERROR
        && !mGracefullStop)
    {
        if (mEmitErrors)
        {
        LL_INFOS() << "LLSDXMLParser::Impl::parseLines: XML_STATUS_ERROR" << LL_ENDL;
        }
        return LLSDParser::PARSE_FAILURE;
    }

    // Swallow any line terminators that follow the document.
    clear_eol(input);
    data = mResult;
    return mParseCount;
}


void LLSDXMLParser::Impl::reset()
{
    mResult.clear();
    mParseCount = 0;

    mInLLSDElement = false;
    mDepth = 0;

    mGracefullStop = false;

    mStack.clear();

    mSkipping = false;

    mCurrentKey.clear();

    XML_ParserReset(mParser, "utf-8");
    XML_SetUserData(mParser, this);
    XML_SetElementHandler(mParser, sStartElementHandler, sEndElementHandler);
    XML_SetCharacterDataHandler(mParser, sCharacterDataHandler);
}


// Start ignoring elements. The current depth is remembered so that
// endElementHandler() can turn skipping off again once mDepth falls
// below it, i.e. once the element being skipped has been closed.
void LLSDXMLParser::Impl::startSkipping()
{
    mSkipping = true;
    mSkipThrough = mDepth;
}

/**
 * Find the value of attribute @a name in @a pairs.
 *
 * @param name  attribute name to look for
 * @param pairs NULL-terminated array laid out as name, value, name, value...;
 *              may itself be NULL
 * @return the value belonging to the first matching name, or NULL when the
 *         attribute is not present
 */
const XML_Char*
LLSDXMLParser::Impl::findAttribute(const XML_Char* name, const XML_Char** pairs)
{
    if (NULL == pairs)
    {
        return NULL;
    }

    // Step over the array two entries (one name/value pair) at a time.
    for (const XML_Char** attr = pairs; NULL != *attr; attr += 2)
    {
        if (strcmp(name, *attr) == 0)
        {
            return attr[1];
        }
    }
    return NULL;
}

void LLSDXMLParser::Impl::parsePart(const char* buf, llssize len)
{
    if ( buf != NULL
        && len > 0 )
    {
        XML_Status status = XML_Parse(mParser, buf, (int)len, 0);
        if (status == XML_STATUS_ERROR)
        {
            LL_INFOS() << "Unexpected XML parsing error at start" << LL_ENDL;
        }
    }
}

// Performance testing code
// Uncomment the define below to compile in the timing instrumentation.
//#define   XML_PARSER_PERFORMANCE_TESTS

#ifdef XML_PARSER_PERFORMANCE_TESTS

// Time source used by XML_Timer; defined elsewhere.
extern U64 totalTime();
// Accumulated time spent in each instrumented function, in totalTime() units.
U64 readElementTime = 0;
U64 startElementTime = 0;
U64 endElementTime = 0;
U64 charDataTime = 0;
U64 parseTime = 0;

// Scope timer: records totalTime() on construction and, on destruction,
// adds the elapsed amount to the counter it was given.
class XML_Timer
{
public:
    XML_Timer( U64 * sum ) : mSum( sum )
    {
        mStart = totalTime();
    }
    ~XML_Timer()
    {
        *mSum += (totalTime() - mStart);
    }

    U64 * mSum;     // counter that receives the elapsed time
    U64 mStart;     // totalTime() value captured at construction
};
#endif // XML_PARSER_PERFORMANCE_TESTS

void LLSDXMLParser::Impl::startElementHandler(const XML_Char* name, const XML_Char** attributes)
{
    #ifdef XML_PARSER_PERFORMANCE_TESTS
    XML_Timer timer( &startElementTime );
    #endif // XML_PARSER_PERFORMANCE_TESTS

    ++mDepth;
    if (mSkipping)
    {
        return;
    }

    Element element = readElement(name);

    mCurrentContent.clear();

    switch (element)
    {
        case ELEMENT_LLSD:
            if (mInLLSDElement) { return startSkipping(); }
            mInLLSDElement = true;
            return;

        case ELEMENT_KEY:
            if (mStack.empty()  ||  !(mStack.back()->isMap()))
            {
                return startSkipping();
            }
            return;

        case ELEMENT_BINARY:
        {
            const XML_Char* encoding = findAttribute("encoding", attributes);
            if(encoding && strcmp("base64", encoding) != 0) { return startSkipping(); }
            break;
        }

        default:
            // all rest are values, fall through
            ;
    }


    if (!mInLLSDElement) { return startSkipping(); }

    if (mStack.empty())
    {
        mStack.push_back(&mResult);
    }
    else if (mStack.back()->isMap())
    {
        if (mCurrentKey.empty()) { return startSkipping(); }

        LLSD& map = *mStack.back();
        LLSD& newElement = map[mCurrentKey];
        mStack.push_back(&newElement);

        mCurrentKey.clear();
    }
    else if (mStack.back()->isArray())
    {
        LLSD& array = *mStack.back();
        array.append(LLSD());
        LLSD& newElement = array[array.size()-1];
        mStack.push_back(&newElement);
    }
    else {
        // improperly nested value in a non-structure
        return startSkipping();
    }

    ++mParseCount;
    switch (element)
    {
        case ELEMENT_MAP:
            *mStack.back() = LLSD::emptyMap();
            break;

        case ELEMENT_ARRAY:
            *mStack.back() = LLSD::emptyArray();
            break;

        default:
            // all the other values will be set in the end element handler
            ;
    }
}

void LLSDXMLParser::Impl::endElementHandler(const XML_Char* name)
{
    #ifdef XML_PARSER_PERFORMANCE_TESTS
    XML_Timer timer( &endElementTime );
    #endif // XML_PARSER_PERFORMANCE_TESTS

    --mDepth;
    if (mSkipping)
    {
        if (mDepth < mSkipThrough)
        {
            mSkipping = false;
        }
        return;
    }

    Element element = readElement(name);

    switch (element)
    {
        case ELEMENT_LLSD:
            if (mInLLSDElement)
            {
                mInLLSDElement = false;
                mGracefullStop = true;
                XML_StopParser(mParser, false);
            }
            return;

        case ELEMENT_KEY:
            mCurrentKey = mCurrentContent;
            return;

        default:
            // all rest are values, fall through
            ;
    }

    if (!mInLLSDElement) { return; }

    LLSD& value = *mStack.back();
    mStack.pop_back();

    switch (element)
    {
        case ELEMENT_UNDEF:
            value.clear();
            break;

        case ELEMENT_BOOL:
            value = (mCurrentContent == "true" || mCurrentContent == "1");
            break;

        case ELEMENT_INTEGER:
            {
                S32 i;
                // sscanf okay here with different locales - ints don't change for different locale settings like floats do.
                if ( sscanf(mCurrentContent.c_str(), "%d", &i ) == 1 )
                {   // See if sscanf works - it's faster
                    value = i;
                }
                else
                {
                    value = LLSD(mCurrentContent).asInteger();
                }
            }
            break;

        case ELEMENT_REAL:
            {
                value = LLSD(mCurrentContent).asReal();
                // removed since this breaks when locale has decimal separator that isn't '.'
                // investigated changing local to something compatible each time but deemed higher
                // risk that just using LLSD.asReal() each time.
                //F64 r;
                //if ( sscanf(mCurrentContent.c_str(), "%lf", &r ) == 1 )
                //{ // See if sscanf works - it's faster
                //  value = r;
                //}
                //else
                //{
                //  value = LLSD(mCurrentContent).asReal();
                //}
            }
            break;

        case ELEMENT_STRING:
            value = mCurrentContent;
            break;

        case ELEMENT_UUID:
            value = LLSD(mCurrentContent).asUUID();
            break;

        case ELEMENT_DATE:
            value = LLSD(mCurrentContent).asDate();
            break;

        case ELEMENT_URI:
            value = LLSD(mCurrentContent).asURI();
            break;

        case ELEMENT_BINARY:
        {
            // Regex is expensive, but only fix for whitespace in base64,
            // created by python and other non-linden systems - DEV-39358
            // Fortunately we have very little binary passing now,
            // so performance impact shold be negligible. + poppy 2009-09-04
            boost::regex r;
            r.assign("\\s");
            std::string stripped = boost::regex_replace(mCurrentContent, r, "");
            S32 len = apr_base64_decode_len(stripped.c_str());
            std::vector<U8> data;
            data.resize(len);
            len = apr_base64_decode_binary(&data[0], stripped.c_str());
            data.resize(len);
            value = data;
            break;
        }

        case ELEMENT_UNKNOWN:
            value.clear();
            break;

        default:
            // other values, map and array, have already been set
            break;
    }

    mCurrentContent.clear();
}

// Append a run of character data to mCurrentContent. The data is appended
// by explicit length; it is consumed later by endElementHandler().
void LLSDXMLParser::Impl::characterDataHandler(const XML_Char* data, int length)
{
    #ifdef XML_PARSER_PERFORMANCE_TESTS
    XML_Timer timer( &charDataTime );
    #endif  // XML_PARSER_PERFORMANCE_TESTS

    mCurrentContent.append(data, length);
}


void LLSDXMLParser::Impl::sStartElementHandler(
    void* userData, const XML_Char* name, const XML_Char** attributes)
{
    ((LLSDXMLParser::Impl*)userData)->startElementHandler(name, attributes);
}

void LLSDXMLParser::Impl::sEndElementHandler(
    void* userData, const XML_Char* name)
{
    ((LLSDXMLParser::Impl*)userData)->endElementHandler(name);
}

void LLSDXMLParser::Impl::sCharacterDataHandler(
    void* userData, const XML_Char* data, int length)
{
    ((LLSDXMLParser::Impl*)userData)->characterDataHandler(data, length);
}


/*
    This code is time critical

    This is a sample of tag occurrences in the text of a simstate file with
    ~8000 objects. A tag pair (<key>something</key>) is counted as two:

        key     - 2680178
        real    - 1818362
        integer -  906078
        array   -  295682
        map     -  191818
        uuid    -  177903
        binary  -  175748
        string  -   53482
        undef   -   40353
        boolean -   33874
        llsd    -   16332
        uri     -      38
        date    -       1
*/
// Map a tag name to its Element value; names that are not recognised yield
// ELEMENT_UNKNOWN. The first character selects a case so that at most three
// full string comparisons are made per call.
// NOTE(review): the case order appears to follow the tag-frequency table in
// the comment preceding this function - confirm before reordering.
LLSDXMLParser::Impl::Element LLSDXMLParser::Impl::readElement(const XML_Char* name)
{
    #ifdef XML_PARSER_PERFORMANCE_TESTS
    XML_Timer timer( &readElementTime );
    #endif // XML_PARSER_PERFORMANCE_TESTS

    // Dispatch on the first character before doing any strcmp.
    XML_Char c = *name;
    switch (c)
    {
        case 'k':
            if (strcmp(name, "key") == 0) { return ELEMENT_KEY; }
            break;
        case 'r':
            if (strcmp(name, "real") == 0) { return ELEMENT_REAL; }
            break;
        case 'i':
            if (strcmp(name, "integer") == 0) { return ELEMENT_INTEGER; }
            break;
        case 'a':
            if (strcmp(name, "array") == 0) { return ELEMENT_ARRAY; }
            break;
        case 'm':
            if (strcmp(name, "map") == 0) { return ELEMENT_MAP; }
            break;
        case 'u':
            if (strcmp(name, "uuid") == 0) { return ELEMENT_UUID; }
            if (strcmp(name, "undef") == 0) { return ELEMENT_UNDEF; }
            if (strcmp(name, "uri") == 0) { return ELEMENT_URI; }
            break;
        case 'b':
            if (strcmp(name, "binary") == 0) { return ELEMENT_BINARY; }
            if (strcmp(name, "boolean") == 0) { return ELEMENT_BOOL; }
            break;
        case 's':
            if (strcmp(name, "string") == 0) { return ELEMENT_STRING; }
            break;
        case 'l':
            if (strcmp(name, "llsd") == 0) { return ELEMENT_LLSD; }
            break;
        case 'd':
            if (strcmp(name, "date") == 0) { return ELEMENT_DATE; }
            break;
    }
    return ELEMENT_UNKNOWN;
}





/**
 * LLSDXMLParser
 */
// Allocates the private Impl on the heap and binds the `impl` reference to
// it; the destructor deletes it again.
LLSDXMLParser::LLSDXMLParser(bool emit_errors /* = true */) : impl(* new Impl(emit_errors))
{
}

// Frees the Impl allocated in the constructor (impl is a reference to it).
LLSDXMLParser::~LLSDXMLParser()
{
    delete &impl;
}

// Feed a chunk of raw text to the underlying parser; forwards to
// Impl::parsePart().
void LLSDXMLParser::parsePart(const char *buf, llssize len)
{
    impl.parsePart(buf, len);
}

// virtual
/**
 * Parse @a input into @a data using either the line-based or the
 * chunk-based reader, depending on mParseLines.
 *
 * @param input     stream to read from
 * @param data      receives the parsed value
 * @param max_depth not used by this implementation
 * @return whatever the selected Impl parse routine returns
 */
S32 LLSDXMLParser::doParse(std::istream& input, LLSD& data, S32 max_depth) const
{
    LL_PROFILE_ZONE_SCOPED_CATEGORY_LLSD;

    #ifdef XML_PARSER_PERFORMANCE_TESTS
    XML_Timer timer( &parseTime );
    #endif  // XML_PARSER_PERFORMANCE_TESTS

    // Line-based reading is the faster code path.
    return mParseLines ? impl.parseLines(input, data)
                       : impl.parse(input, data);
}

//  virtual
// Reset the parser so it can be reused; forwards to Impl::reset().
void LLSDXMLParser::doReset()
{
    impl.reset();
}