3 files changed, 346 insertions, 47 deletions
diff --git a/indra/lib/python/indra/base/lluuid.py b/indra/lib/python/indra/base/lluuid.py
index eb0c557b04..609c8cc261 100644
--- a/indra/lib/python/indra/base/lluuid.py
+++ b/indra/lib/python/indra/base/lluuid.py
@@ -212,6 +212,10 @@ class UUID(object):
                      _int2binstr(v3,4) + \
                      _int2binstr(v4,4) 
 
+
+# module-level null constant
+NULL = UUID()
+
 def printTranslatedMemory(four_hex_uints):
     """
     We expect to get the string as four hex units. eg:
diff --git a/indra/lib/python/indra/ipc/mysql_pool.py b/indra/lib/python/indra/ipc/mysql_pool.py
index 4a265a1f2e..6288a4c732 100644
--- a/indra/lib/python/indra/ipc/mysql_pool.py
+++ b/indra/lib/python/indra/ipc/mysql_pool.py
@@ -97,7 +97,7 @@ class ConnectionPool(Pool):
         except (AttributeError, DeadProcess), e:
             conn = self.create()
         # TODO figure out if we're still connected to the database
-        if conn:
+        if conn is not None:
             Pool.put(self, conn)
         else:
             self.current_size -= 1
diff --git a/indra/lib/python/indra/util/named_query.py b/indra/lib/python/indra/util/named_query.py
index 019eb6306b..c3bdd046fd 100644
--- a/indra/lib/python/indra/util/named_query.py
+++ b/indra/lib/python/indra/util/named_query.py
@@ -28,20 +28,33 @@ THE SOFTWARE.
 $/LicenseInfo$
 """
 
+import errno
 import MySQLdb
+import MySQLdb.cursors
 import os
 import os.path
+import re
 import time
 
+#import sys # *TODO: remove. only used in testing.
+#import pprint # *TODO: remove. only used in testing.
+
+try:
+    set = set
+except NameError:
+    from sets import Set as set
+
 from indra.base import llsd
 from indra.base import config
-from indra.ipc import russ
 
 _g_named_manager = None
 
-# this function is entirely intended for testing purposes,
-# because it's tricky to control the config from inside a test
 def _init_g_named_manager(sql_dir = None):
+    """Initializes a global NamedManager object to point at a
+    specified named queries hierarchy.
+
+    This function is intended entirely for testing purposes,
+    because it's tricky to control the config from inside a test."""
     if sql_dir is None:
         sql_dir = config.get('named-query-base-dir')
     global _g_named_manager
@@ -49,14 +62,14 @@ def _init_g_named_manager(sql_dir = None):
         os.path.abspath(os.path.realpath(sql_dir)))
 
 def get(name):
-    "@brief get the named query object to be used to perform queries"
+    "Get the named query object to be used to perform queries"
     if _g_named_manager is None:
         _init_g_named_manager()
     return _g_named_manager.get(name)
 
-def sql(name, params):
+def sql(connection, name, params):
     # use module-global NamedQuery object to perform default substitution
-    return get(name).sql(params)
+    return get(name).sql(connection, params)
 
 def run(connection, name, params, expect_rows = None):
     """\
@@ -72,66 +85,243 @@ Note that this function will fetch ALL rows.
     return get(name).run(connection, params, expect_rows)
 
 class ExpectationFailed(Exception):
+    """ Exception that is raised when an expectation for an sql query
+    is not met."""
     def __init__(self, message):
+        Exception.__init__(self, message)
         self.message = message
 
 class NamedQuery(object):
     def __init__(self, name, filename):
-        self._stat_interval = 5000  # 5 seconds
+        """ Construct a NamedQuery object.  The name argument is an
+        arbitrary name as a handle for the query, and the filename is
+        a path to a file containing an llsd named query document."""
+        self._stat_interval_seconds = 5  # 5 seconds
         self._name = name
         self._location = filename
+        self._alternative = dict()
+        self._last_mod_time = 0
+        self._last_check_time = 0
+        self.deleted = False
         self.load_contents()
 
     def name(self):
+        """ The name of the query. """
         return self._name
 
     def get_modtime(self):
-        return os.path.getmtime(self._location)
+        """ Returns the mtime (last modified time) of the named query
+        file, if such exists."""
+        if self._location:
+            return os.path.getmtime(self._location)
+        return 0
 
     def load_contents(self):
-        self._contents = llsd.parse(open(self._location).read())
+        """ Loads and parses the named query file into self.  Does
+        nothing if self.location is nonexistant."""
+        if self._location:
+            self._reference_contents(llsd.parse(open(self._location).read()))
+            # Check for alternative implementations
+            try:
+                for name, alt in self._contents['alternative'].items():
+                    nq = NamedQuery(name, None)
+                    nq._reference_contents(alt)
+                    self._alternative[name] = nq
+            except KeyError, e:
+                pass
+            self._last_mod_time = self.get_modtime()
+            self._last_check_time = time.time()
+
+    def _reference_contents(self, contents):
+        "Helper method which builds internal structure from parsed contents"
+        self._contents = contents
         self._ttl = int(self._contents.get('ttl', 0))
         self._return_as_map = bool(self._contents.get('return_as_map', False))
         self._legacy_dbname = self._contents.get('legacy_dbname', None)
-        self._legacy_query = self._contents.get('legacy_query', None)
-        self._options = self._contents.get('options', {})
-        self._base_query = self._contents['base_query']
 
-        self._last_mod_time = self.get_modtime()
-        self._last_check_time = time.time()
+        # reset these before doing the sql conversion because we will
+        # read them there. reset these while loading so we pick up
+        # changes.
+        self._around = set()
+        self._append = set()
+        self._integer = set()
+        self._options = self._contents.get('dynamic_where', {})
+        for key in self._options:
+            if isinstance(self._options[key], basestring):
+                self._options[key] = self._convert_sql(self._options[key])
+            elif isinstance(self._options[key], list):
+                lines = []
+                for line in self._options[key]:
+                    lines.append(self._convert_sql(line))
+                self._options[key] = lines
+            else:
+                moreopt = {}
+                for kk in self._options[key]:
+                    moreopt[kk] = self._convert_sql(self._options[key][kk]) 
+                self._options[key] = moreopt
+        self._base_query = self._convert_sql(self._contents['base_query'])
+        self._query_suffix = self._convert_sql(
+            self._contents.get('query_suffix', ''))
+
+    def _convert_sql(self, sql):
+        """convert the parsed sql into a useful internal structure.
+
+        This function has to turn the named query format into a pyformat
+        style. It also has to look for %:name% and :name% and
+        ready them for use in LIKE statements"""
+        if sql:
+            #print >>sys.stderr, "sql:",sql
+            expr = re.compile("(%?):([a-zA-Z][a-zA-Z0-9_-]*)%")
+            sql = expr.sub(self._prepare_like, sql)
+            expr = re.compile("#:([a-zA-Z][a-zA-Z0-9_-]*)")
+            sql = expr.sub(self._prepare_integer, sql)
+            expr = re.compile(":([a-zA-Z][a-zA-Z0-9_-]*)")
+            sql = expr.sub("%(\\1)s", sql)
+        return sql
+
+    def _prepare_like(self, match):
+        """This function changes LIKE statement replace behavior
+
+        It works by turning %:name% to %(_name_around)s and :name% to
+        %(_name_append)s. Since a leading '_' is not a valid keyname
+        input (enforced via unit tests), it will never clash with
+        existing keys. Then, when building the statement, the query
+        runner will generate corrected strings."""
+        if match.group(1) == '%':
+            # there is a leading % so this is treated as prefix/suffix
+            self._around.add(match.group(2))
+            return "%(" + self._build_around_key(match.group(2)) + ")s"
+        else:
+            # there is no leading %, so this is suffix only
+            self._append.add(match.group(2))
+            return "%(" + self._build_append_key(match.group(2)) + ")s"
+
+    def _build_around_key(self, key):
+        return "_" + key + "_around"
+
+    def _build_append_key(self, key):
+        return "_" + key + "_append"
+
+    def _prepare_integer(self, match):
+        """This function adjusts the sql for #:name replacements
+
+        It works by turning #:name to %(_name_as_integer)s. Since a
+        leading '_' is not a valid keyname input (enforced via unit
+        tests), it will never clash with existing keys. Then, when
+        building the statement, the query runner will generate
+        corrected strings."""
+        self._integer.add(match.group(1))
+        return "%(" + self._build_integer_key(match.group(1)) + ")s"
+
+    def _build_integer_key(self, key):
+        return "_" + key + "_as_integer"
+
+    def _strip_wildcards_to_list(self, value):
+        """Take string, and strip out the LIKE special characters.
+
+        Technically, this is database dependant, but postgresql and
+        mysql use the same wildcards, and I am not aware of a general
+        way to handle this. I think you need a sql statement of the
+        form:
+
+        LIKE_STRING( [ANY,ONE,str]... )
+
+        which would treat ANY as their any string, and ONE as their
+        single glyph, and str as something that needs database
+        specific encoding to not allow any % or _ to affect the query.
+
+        As it stands, I believe it's impossible to write a named query
+        style interface which uses like to search the entire space of
+        text available. Imagine the query:
+
+        % of brain used by average linden
+
+        In order to search for %, it must be escaped, so once you have
+        escaped the string to not do wildcard searches, and be escaped
+        for the database, and then prepended the wildcard you come
+        back with one of:
+
+        1) %\% of brain used by average linden
+        2) %%% of brain used by average linden
+
+        Then, when passed to the database to be escaped to be database
+        safe, you get back:
+        
+        1) %\\% of brain used by average linden
+        : which means search for any character sequence, followed by a
+          backslash, followed by any sequence, followed by ' of
+          brain...'
+        2) %%% of brain used by average linden
+        : which (I believe) means search for a % followed by any
+          character sequence followed by 'of brain...'
+
+        Neither of which is what we want!
+
+        So, we need a vendor (or extention) for LIKE_STRING. Anyone
+        want to write it?"""
+        utf8_value = unicode(value, "utf-8")
+        esc_list = []
+        remove_chars = set(u"%_")
+        for glyph in utf8_value:
+            if glyph in remove_chars:
+                continue
+            esc_list.append(glyph.encode("utf-8"))
+        return esc_list
+
+    def delete(self):
+        """ Makes this query unusable by deleting all the members and
+        setting the deleted member.  This is desired when the on-disk
+        query has been deleted but the in-memory copy remains."""
+        # blow away all members except _name, _location, and deleted
+        name, location = self._name, self._location
+        for key in self.__dict__.keys():
+            del self.__dict__[key]
+        self.deleted = True
+        self._name, self._location = name, location
 
     def ttl(self):
+        """ Estimated time to live of this query. Used for web
+        services to set the Expires header."""
         return self._ttl
 
     def legacy_dbname(self):
         return self._legacy_dbname
 
-    def legacy_query(self):
-        return self._legacy_query
-
     def return_as_map(self):
+        """ Returns true if this query is configured to return its
+        results as a single map (as opposed to a list of maps, the
+        normal behavior)."""
+        
         return self._return_as_map
 
-    def run(self, connection, params, expect_rows = None, use_dictcursor = True):
-        """\
-@brief given a connection, run a named query with the params
+    def for_schema(self, db_name):
+        "Look trough the alternates and return the correct query"
+        try:
+            return self._alternative[db_name]
+        except KeyError, e:
+            pass
+        return self
 
-Note that this function will fetch ALL rows. We do this because it
-opens and closes the cursor to generate the values, and this isn't a generator so the
-cursor has no life beyond the method call.
-@param cursor The connection to use (this generates its own cursor for the query)
-@param name The name of the query to run
-@param params The parameters passed into the query
-@param expect_rows The number of rows expected. Set to 1 if return_as_map is true.  Raises ExpectationFailed if the number of returned rows doesn't exactly match.  Kind of a hack.
-@param use_dictcursor Set to false to use a normal cursor and manually convert the rows to dicts.
-@return Returns the result set as a list of dicts, or, if the named query has return_as_map set to true, returns a single dict.
+    def run(self, connection, params, expect_rows = None, use_dictcursor = True):
+        """given a connection, run a named query with the params
+
+        Note that this function will fetch ALL rows. We do this because it
+        opens and closes the cursor to generate the values, and this 
+        isn't a generator so the cursor has no life beyond the method call.
+
+        @param cursor The connection to use (this generates its own cursor for the query)
+        @param name The name of the query to run
+        @param params The parameters passed into the query
+        @param expect_rows The number of rows expected. Set to 1 if return_as_map is true.  Raises ExpectationFailed if the number of returned rows doesn't exactly match.  Kind of a hack.
+        @param use_dictcursor Set to false to use a normal cursor and manually convert the rows to dicts.
+        @return Returns the result set as a list of dicts, or, if the named query has return_as_map set to true, returns a single dict.
         """
         if use_dictcursor:
             cursor = connection.cursor(MySQLdb.cursors.DictCursor)
         else:
             cursor = connection.cursor()
         
-        statement = self.sql(params)
+        statement = self.sql(connection, params)
         #print "SQL:", statement
         rows = cursor.execute(statement)
         
@@ -169,47 +359,152 @@ cursor has no life beyond the method call.
             return result_set[0]
         return result_set
 
-    def sql(self, params):
+    def sql(self, connection, params):
+        """ Generates an SQL statement from the named query document
+        and a dictionary of parameters.
+
+        """
         self.refresh()
 
         # build the query from the options available and the params
         base_query = []
         base_query.append(self._base_query)
+        #print >>sys.stderr, "base_query:",base_query
         for opt, extra_where in self._options.items():
-            if opt in params and (params[opt] == 0 or params[opt]):
-                if type(extra_where) in (dict, list, tuple):
+            if type(extra_where) in (dict, list, tuple):
+                if opt in params:
                     base_query.append(extra_where[params[opt]])
-                else:
+            else:
+                if opt in params and params[opt]:
                     base_query.append(extra_where)
-
+        if self._query_suffix:
+            base_query.append(self._query_suffix)
+        #print >>sys.stderr, "base_query:",base_query
         full_query = '\n'.join(base_query)
-        
-        # do substitution
-        sql = russ.format(full_query, params)
+
+        # Go through the query and rewrite all of the ones with the
+        # @:name syntax.
+        rewrite = _RewriteQueryForArray(params)
+        expr = re.compile("@%\(([a-zA-Z][a-zA-Z0-9_-]*)\)s")
+        full_query = expr.sub(rewrite.operate, full_query)
+        params.update(rewrite.new_params)
+
+        # build out the params for like. We only have to do this
+        # parameters which were detected to have ued the where syntax
+        # during load.
+        #
+        # * treat the incoming string as utf-8
+        # * strip wildcards
+        # * append or prepend % as appropriate
+        new_params = {}
+        for key in params:
+            if key in self._around:
+                new_value = ['%']
+                new_value.extend(self._strip_wildcards_to_list(params[key]))
+                new_value.append('%')
+                new_params[self._build_around_key(key)] = ''.join(new_value)
+            if key in self._append:
+                new_value = self._strip_wildcards_to_list(params[key])
+                new_value.append('%')
+                new_params[self._build_append_key(key)] = ''.join(new_value)
+            if key in self._integer:
+                new_params[self._build_integer_key(key)] = int(params[key])
+        params.update(new_params)
+
+        # do substitution using the mysql (non-standard) 'literal'
+        # function to do the escaping.
+        sql = full_query % connection.literal(params)
         return sql
 
     def refresh(self):
-        # only stat the file every so often
+        """ Refresh self from the file on the filesystem.
+
+        This is optimized to be callable as frequently as you wish,
+        without adding too much load.  It does so by only stat-ing the
+        file every N seconds, where N defaults to 5 and is
+        configurable through the member _stat_interval_seconds.  If the stat
+        reveals that the file has changed, refresh will re-parse the
+        contents of the file and use them to update the named query
+        instance.  If the stat reveals that the file has been deleted,
+        refresh will call self.delete to make the in-memory
+        representation unusable."""
         now = time.time()
-        if(now - self._last_check_time > self._stat_interval):
+        if(now - self._last_check_time > self._stat_interval_seconds):
             self._last_check_time = now
-            modtime = self.get_modtime()
-            if(modtime > self._last_mod_time):
-                self.load_contents()
+            try:
+                modtime = self.get_modtime()
+                if(modtime > self._last_mod_time):
+                    self.load_contents()
+            except OSError, e:
+                if e.errno == errno.ENOENT: # file not found
+                    self.delete() # clean up self
+                raise  # pass the exception along to the caller so they know that this query disappeared
 
 class NamedQueryManager(object):
+    """ Manages the lifespan of NamedQuery objects, drawing from a
+    directory hierarchy of named query documents.
+
+    In practice this amounts to a memory cache of NamedQuery objects."""
+    
     def __init__(self, named_queries_dir):
+        """ Initializes a manager to look for named queries in a
+        directory."""
         self._dir = os.path.abspath(os.path.realpath(named_queries_dir))
         self._cached_queries = {}
 
-    def sql(self, name, params):
+    def sql(self, connection, name, params):
         nq = self.get(name)
-        return nq.sql(params)
+        return nq.sql(connection, params)
         
     def get(self, name):
-        # new up/refresh a NamedQuery based on the name
+        """ Returns a NamedQuery instance based on the name, either
+        from memory cache, or by parsing from disk.
+
+        The name is simply a relative path to the directory associated
+        with the manager object.  Before returning the instance, the
+        NamedQuery object is cached in memory, so that subsequent
+        accesses don't have to read from disk or do any parsing.  This
+        means that NamedQuery objects returned by this method are
+        shared across all users of the manager object.
+        NamedQuery.refresh is used to bring the NamedQuery objects in
+        sync with the actual files on disk."""
         nq = self._cached_queries.get(name)
         if nq is None:
             nq = NamedQuery(name, os.path.join(self._dir, name))
             self._cached_queries[name] = nq
+        else:
+            try:
+                nq.refresh()
+            except OSError, e:
+                if e.errno == errno.ENOENT: # file not found
+                    del self._cached_queries[name]
+                raise # pass exception along to caller so they know that the query disappeared
+
         return nq
+
+class _RewriteQueryForArray(object):
+    "Helper class for rewriting queries with the @:name syntax"
+    def __init__(self, params):
+        self.params = params
+        self.new_params = dict()
+
+    def operate(self, match):
+        "Given a match, return the string that should be in use"
+        key = match.group(1)
+        value = self.params[key]
+        if type(value) in (list,tuple):
+            rv = []
+            for idx in range(len(value)):
+                new_key = "_" + key + "_" + str(idx)
+                self.new_params[new_key] = value[idx]
+                rv.append("%(" + new_key + ")s")
+            return ','.join(rv)
+        else:
+            # not something that can be expanded, so just drop the
+            # leading @ in the front of the match. This will mean that
+            # the single value we have, be it a string, int, whatever
+            # (other than dict) will correctly show up, eg:
+            #
+            # where foo in (@:foobar) -- foobar is a string, so we get
+            # where foo in (:foobar)
+            return match.group(0)[1:]