| field     | value                                            |                           |
|-----------|--------------------------------------------------|---------------------------|
| author    | Seth Alves <seth@lindenlab.com>                  | 2007-07-25 17:35:44 +0000 |
| committer | Seth Alves <seth@lindenlab.com>                  | 2007-07-25 17:35:44 +0000 |
| commit    | 87955ce1eb686340ec7a3bca6ab1a02495eb5b9d (patch) |                           |
| tree      | 152564ba9b5b2192163cfb6c4210204ae3e20c98 /indra  |                           |
| parent    | 1f4ffb0d7a6f94ff8daa9021fccbd5062010ac85 (diff)  |                           |
asset garbage collection scripts
Diffstat (limited to 'indra')
-rw-r--r-- | indra/lib/python/indra/ipc/webdav.py | 577 |
1 file changed, 577 insertions, 0 deletions
diff --git a/indra/lib/python/indra/ipc/webdav.py b/indra/lib/python/indra/ipc/webdav.py
new file mode 100644
index 0000000000..d50b3f2b26
--- /dev/null
+++ b/indra/lib/python/indra/ipc/webdav.py
@@ -0,0 +1,577 @@
+"""
+@file webdav.py
+@brief Classes to make manipulation of a webdav store easier.
+
+Copyright (c) 2006-2007, Linden Research, Inc.
+$License$
+"""
+
+import sys, os, httplib, urlparse
+import socket, time
+import xml.dom.minidom
+import syslog
+# import signal
+
+__revision__ = '0'
+
+dav_debug = False
+
+
+# def urlsafe_b64decode (enc):
+#     return base64.decodestring (enc.replace ('_', '/').replace ('-', '+'))
+
+# def urlsafe_b64encode (str):
+#     return base64.encodestring (str).replace ('+', '-').replace ('/', '_')
+
+
+class DAVError (Exception):
+    """ Base class for exceptions in this module. """
+    def __init__ (self, status=0, message='', body='', details=''):
+        self.status = status
+        self.message = message
+        self.body = body
+        self.details = details
+        Exception.__init__ (self, '%d:%s:%s%s' % (self.status, self.message,
+                                                  self.body, self.details))
+
+    def print_to_stderr (self):
+        """ print_to_stderr docstring """
+        print >> sys.stderr, str (self.status) + ' ' + self.message
+        print >> sys.stderr, str (self.details)
+
+
+class Timeout (Exception):
+    """ Timeout docstring """
+    def __init__ (self, arg=''):
+        Exception.__init__ (self, arg)
+
+
+def alarm_handler (signum, frame):
+    """ alarm_handler docstring """
+    raise Timeout ('caught alarm')
+
+
+class WebDAV:
+    """ WebDAV docstring """
+    def __init__ (self, url, proxy=None, retries_before_fail=6):
+        self.init_url = url
+        self.init_proxy = proxy
+        self.retries_before_fail = retries_before_fail
+        url_parsed = urlparse.urlsplit (url)
+
+        self.top_path = url_parsed[ 2 ]
+        # make sure top_path has a trailing /
+        if self.top_path == None or self.top_path == '':
+            self.top_path = '/'
+        elif len (self.top_path) > 1 and self.top_path[-1:] != '/':
+            self.top_path += '/'
+
+        if dav_debug:
+            syslog.syslog ('new WebDAV %s : %s' % (str (url), str (proxy)))
+
+        if proxy:
+            proxy_parsed = urlparse.urlsplit (proxy)
+            self.host_header = url_parsed[ 1 ]
+            host_and_port = proxy_parsed[ 1 ].split (':')
+            self.host = host_and_port[ 0 ]
+            if len (host_and_port) > 1:
+                self.port = int(host_and_port[ 1 ])
+            else:
+                self.port = 80
+        else: # no proxy
+            host_and_port = url_parsed[ 1 ].split (':')
+            self.host_header = None
+            self.host = host_and_port[ 0 ]
+            if len (host_and_port) > 1:
+                self.port = int(host_and_port[ 1 ])
+            else:
+                self.port = 80
+
+        self.connection = False
+        self.connect ()
+
+
+    def log (self, msg, depth=0):
+        """ log docstring """
+        if dav_debug and depth == 0:
+            host = str (self.init_url)
+            if host == 'http://int.tuco.lindenlab.com:80/asset/':
+                host = 'tuco'
+            if host == 'http://harriet.lindenlab.com/asset-keep/':
+                host = 'harriet/asset-keep'
+            if host == 'http://harriet.lindenlab.com/asset-flag/':
+                host = 'harriet/asset-flag'
+            if host == 'http://harriet.lindenlab.com/asset/':
+                host = 'harriet/asset'
+            if host == 'http://ozzy.lindenlab.com/asset/':
+                host = 'ozzy/asset'
+            if host == 'http://station11.lindenlab.com:12041/:':
+                host = 'station11:12041'
+            proxy = str (self.init_proxy)
+            if proxy == 'None':
+                proxy = ''
+            if proxy == 'http://int.tuco.lindenlab.com:3128/':
+                proxy = 'tuco'
+            syslog.syslog ('WebDAV (%s:%s) %s' % (host, proxy, str (msg)))
+
+
+    def connect (self):
+        """ connect docstring """
+        self.log ('connect')
+        self.connection = httplib.HTTPConnection (self.host, self.port)
+
+    def __err (self, response, details):
+        """ __err docstring """
+        raise DAVError (response.status, response.reason, response.read (),
+                        str (self.init_url) + ':' + \
+                        str (self.init_proxy) + ':' + str (details))
+
+    def request (self, method, path, body=None, headers=None,
+                 read_all=True, body_hook = None, recurse=0, allow_cache=True):
+        """ request docstring """
+        # self.log ('request %s %s' % (method, path))
+        if headers == None:
+            headers = {}
+        if not allow_cache:
+            headers['Pragma'] = 'no-cache'
+            headers['cache-control'] = 'no-cache'
+        try:
+            if method.lower () != 'purge':
+                if path.startswith ('/'):
+                    path = path[1:]
+                if self.host_header: # use proxy
+                    headers[ 'host' ] = self.host_header
+                    fullpath = 'http://%s%s%s' % (self.host_header,
+                                                  self.top_path, path)
+                else: # no proxy
+                    fullpath = self.top_path + path
+            else:
+                fullpath = path
+
+            self.connection.request (method, fullpath, body, headers)
+            if body_hook:
+                body_hook ()
+
+            # signal.signal (signal.SIGALRM, alarm_handler)
+            # try:
+            #     signal.alarm (120)
+            #     signal.alarm (0)
+            # except Timeout, e:
+            #     if recurse < 6:
+            #         return self.retry_request (method, path, body, headers,
+            #                                    read_all, body_hook, recurse)
+            #     else:
+            #         raise DAVError (0, 'timeout', self.host,
+            #                         (method, path, body, headers, recurse))
+
+            response = self.connection.getresponse ()
+
+            if read_all:
+                while len (response.read (1024)) > 0:
+                    pass
+            if (response.status == 500 or \
+                response.status == 503 or \
+                response.status == 403) and \
+                recurse < self.retries_before_fail:
+                return self.retry_request (method, path, body, headers,
+                                           read_all, body_hook, recurse)
+            return response
+        except (httplib.ResponseNotReady,
+                httplib.BadStatusLine,
+                socket.error):
+            # if the server hangs up on us (keepalive off, broken pipe),
+            # we need to reconnect and try again.
+            if recurse < self.retries_before_fail:
+                return self.retry_request (method, path, body, headers,
+                                           read_all, body_hook, recurse)
+            raise DAVError (0, 'reconnect failed', self.host,
+                            (method, path, body, headers, recurse))
+
+
+    def retry_request (self, method, path, body, headers,
+                       read_all, body_hook, recurse):
+        """ retry_request docstring """
+        time.sleep (10.0 * recurse)
+        self.connect ()
+        return self.request (method, path, body, headers,
+                             read_all, body_hook, recurse+1)
+
+
+
+    def propfind (self, path, body=None, depth=1):
+        """ propfind docstring """
+        # self.log ('propfind %s' % path)
+        headers = {'Content-Type':'text/xml; charset="utf-8"',
+                   'Depth':str(depth)}
+        response = self.request ('PROPFIND', path, body, headers, False)
+        if response.status == 207:
+            return response # Multi-Status
+        self.__err (response, ('PROPFIND', path, body, headers, 0))
+
+
+    def purge (self, path):
+        """ issue a squid purge command """
+        headers = {'Accept':'*/*'}
+        response = self.request ('PURGE', path, None, headers)
+        if response.status == 200 or response.status == 404:
+            # 200 if it was purge, 404 if it wasn't there.
+            return response
+        self.__err (response, ('PURGE', path, None, headers))
+
+
+    def get_file_size (self, path):
+        """
+        Use propfind to ask a webdav server what the size of
+        a file is.  If used on a directory (collection) return 0
+        """
+        self.log ('get_file_size %s' % path)
+        # "getcontentlength" property
+        # 8.1.1 Example - Retrieving Named Properties
+        # http://docs.python.org/lib/module-xml.dom.html
+        nsurl = 'http://apache.org/dav/props/'
+        doc = xml.dom.minidom.Document ()
+        propfind_element = doc.createElementNS (nsurl, "D:propfind")
+        propfind_element.setAttributeNS (nsurl, 'xmlns:D', 'DAV:')
+        doc.appendChild (propfind_element)
+        prop_element = doc.createElementNS (nsurl, "D:prop")
+        propfind_element.appendChild (prop_element)
+        con_len_element = doc.createElementNS (nsurl, "D:getcontentlength")
+        prop_element.appendChild (con_len_element)
+
+        response = self.propfind (path, doc.toxml ())
+        doc.unlink ()
+
+        resp_doc = xml.dom.minidom.parseString (response.read ())
+        cln = resp_doc.getElementsByTagNameNS ('DAV:','getcontentlength')[ 0 ]
+        try:
+            content_length = int (cln.childNodes[ 0 ].nodeValue)
+        except IndexError:
+            return 0
+        resp_doc.unlink ()
+        return content_length
+
+
+    def file_exists (self, path):
+        """
+        do an http head on the given file. return True if it succeeds
+        """
+        self.log ('file_exists %s' % path)
+        expect_gzip = path.endswith ('.gz')
+        response = self.request ('HEAD', path)
+        got_gzip = response.getheader ('Content-Encoding', '').strip ()
+        if got_gzip.lower () == 'x-gzip' and expect_gzip == False:
+            # the asset server fakes us out if we ask for the non-gzipped
+            # version of an asset, but the server has the gzipped version.
+            return False
+        return response.status == 200
+
+
+    def mkdir (self, path):
+        """ mkdir docstring """
+        self.log ('mkdir %s' % path)
+        headers = {}
+        response = self.request ('MKCOL', path, None, headers)
+        if response.status == 201:
+            return # success
+        if response.status == 405:
+            return # directory already existed?
+        self.__err (response, ('MKCOL', path, None, headers, 0))
+
+
+    def delete (self, path):
+        """ delete docstring """
+        self.log ('delete %s' % path)
+        headers = {'Depth':'infinity'} # collections require infinity
+        response = self.request ('DELETE', path, None, headers)
+        if response.status == 204:
+            return # no content
+        if response.status == 404:
+            return # hmm
+        self.__err (response, ('DELETE', path, None, headers, 0))
+
+
+    def list_directory (self, path, dir_filter=None, allow_cache=True,
+                        minimum_cache_time=False):
+        """
+        Request an http directory listing and parse the filenames out of lines
+        like: '<LI><A HREF="X"> X</A>'. If a filter function is provided,
+        only return filenames that the filter returns True for.
+
+        This is sort of grody, but it seems faster than other ways of getting
+        this information from an isilon.
+        """
+        self.log ('list_directory %s' % path)
+
+        def try_match (lline, before, after):
+            """ try_match docstring """
+            try:
+                blen = len (before)
+                asset_start_index = lline.index (before)
+                asset_end_index = lline.index (after, asset_start_index + blen)
+                asset = line[ asset_start_index + blen : asset_end_index ]
+
+                if not dir_filter or dir_filter (asset):
+                    return [ asset ]
+                return []
+            except ValueError:
+                return []
+
+        if len (path) > 0 and path[-1:] != '/':
+            path += '/'
+
+        response = self.request ('GET', path, None, {}, False,
+                                 allow_cache=allow_cache)
+
+        if allow_cache and minimum_cache_time: # XXX
+            print response.getheader ('Date')
+            # s = "2005-12-06T12:13:14"
+            # from datetime import datetime
+            # from time import strptime
+            # datetime(*strptime(s, "%Y-%m-%dT%H:%M:%S")[0:6])
+            # datetime.datetime(2005, 12, 6, 12, 13, 14)
+
+        if response.status != 200:
+            self.__err (response, ('GET', path, None, {}, 0))
+        assets = []
+        for line in response.read ().split ('\n'):
+            lline = line.lower ()
+            if lline.find ("parent directory") == -1:
+                # isilon file
+                assets += try_match (lline, '<li><a href="', '"> ')
+                # apache dir
+                assets += try_match (lline, 'alt="[dir]"> <a href="', '/">')
+                # apache file
+                assets += try_match (lline, 'alt="[ ]"> <a href="', '">')
+        return assets
+
+
+    def __tmp_filename (self, path_and_file):
+        """ __tmp_filename docstring """
+        head, tail = os.path.split (path_and_file)
+        if head != '':
+            return head + '/.' + tail + '.' + str (os.getpid ())
+        else:
+            return head + '.' + tail + '.' + str (os.getpid ())
+
+
+    def __put__ (self, filesize, body_hook, remotefile):
+        """ __put__ docstring """
+        headers = {'Content-Length' : str (filesize)}
+        remotefile_tmp = self.__tmp_filename (remotefile)
+        response = self.request ('PUT', remotefile_tmp, None,
+                                 headers, True, body_hook)
+        if not response.status in (201, 204): # created, no content
+            self.__err (response, ('PUT', remotefile, None, headers, 0))
+        if filesize != self.get_file_size (remotefile_tmp):
+            try:
+                self.delete (remotefile_tmp)
+            except:
+                pass
+            raise DAVError (0, 'tmp upload error', remotefile_tmp)
+        # move the file to its final location
+        try:
+            self.rename (remotefile_tmp, remotefile)
+        except DAVError, exc:
+            if exc.status == 403: # try to clean up the tmp file
+                try:
+                    self.delete (remotefile_tmp)
+                except:
+                    pass
+            raise
+        if filesize != self.get_file_size (remotefile):
+            raise DAVError (0, 'file upload error', str (remotefile_tmp))
+
+
+    def put_string (self, strng, remotefile):
+        """ put_string docstring """
+        self.log ('put_string %d -> %s' % (len (strng), remotefile))
+        filesize = len (strng)
+        def body_hook ():
+            """ body_hook docstring """
+            self.connection.send (strng)
+        self.__put__ (filesize, body_hook, remotefile)
+
+
+    def put_file (self, localfile, remotefile):
+        """
+        Send a local file to a remote webdav store. First, upload to
+        a temporary filename. Next make sure the file is the size we
+        expected. Next, move the file to its final location. Next,
+        check the file size at the final location.
+        """
+        self.log ('put_file %s -> %s' % (localfile, remotefile))
+        filesize = os.path.getsize (localfile)
+        def body_hook ():
+            """ body_hook docstring """
+            handle = open (localfile)
+            while True:
+                data = handle.read (1300)
+                if len (data) == 0:
+                    break
+                self.connection.send (data)
+            handle.close ()
+        self.__put__ (filesize, body_hook, remotefile)
+
+
+    def create_empty_file (self, remotefile):
+        """ create an empty file """
+        self.log ('touch_file %s' % (remotefile))
+        headers = {'Content-Length' : '0'}
+        response = self.request ('PUT', remotefile, None, headers)
+        if not response.status in (201, 204): # created, no content
+            self.__err (response, ('PUT', remotefile, None, headers, 0))
+        if self.get_file_size (remotefile) != 0:
+            raise DAVError (0, 'file upload error', str (remotefile))
+
+
+    def __get_file_setup (self, remotefile, check_size=True):
+        """ __get_file_setup docstring """
+        if check_size:
+            remotesize = self.get_file_size (remotefile)
+        response = self.request ('GET', remotefile, None, {}, False)
+        if response.status != 200:
+            self.__err (response, ('GET', remotefile, None, {}, 0))
+        try:
+            content_length = int (response.getheader ("Content-Length"))
+        except TypeError:
+            content_length = None
+        if check_size:
+            if content_length != remotesize:
+                raise DAVError (0, 'file DL size error', remotefile)
+        return (response, content_length)
+
+
+    def __get_file_read (self, writehandle, response, content_length):
+        """ __get_file_read docstring """
+        if content_length != None:
+            so_far_length = 0
+            while so_far_length < content_length:
+                data = response.read (content_length - so_far_length)
+                if len (data) == 0:
+                    raise DAVError (0, 'short file download')
+                so_far_length += len (data)
+                writehandle.write (data)
+            while len (response.read ()) > 0:
+                pass
+        else:
+            while True:
+                data = response.read ()
+                if (len (data) < 1):
+                    break
+                writehandle.write (data)
+
+
+    def get_file (self, remotefile, localfile, check_size=True):
+        """
+        Get a remote file from a webdav server. Download to a local
+        tmp file, then move into place. Sanity check file sizes as
+        we go.
+        """
+        self.log ('get_file %s -> %s' % (remotefile, localfile))
+        (response, content_length) = \
+                   self.__get_file_setup (remotefile, check_size)
+        localfile_tmp = self.__tmp_filename (localfile)
+        handle = open (localfile_tmp, 'w')
+        self.__get_file_read (handle, response, content_length)
+        handle.close ()
+        if check_size:
+            if content_length != os.path.getsize (localfile_tmp):
+                raise DAVError (0, 'file DL size error',
+                                remotefile+','+localfile)
+        os.rename (localfile_tmp, localfile)
+
+
+    def get_file_as_string (self, remotefile, check_size=True):
+        """
+        download a file from a webdav server and return it as a string.
+        """
+        self.log ('get_file_as_string %s' % remotefile)
+        (response, content_length) = \
+                   self.__get_file_setup (remotefile, check_size)
+        # (tmp_handle, tmp_filename) = tempfile.mkstemp ()
+        tmp_handle = os.tmpfile ()
+        self.__get_file_read (tmp_handle, response, content_length)
+        tmp_handle.seek (0)
+        ret = tmp_handle.read ()
+        tmp_handle.close ()
+        # os.unlink (tmp_filename)
+        return ret
+
+
+    def get_post_as_string (self, remotefile, body):
+        """
+        Do an http POST, send body, get response and return it.
+        """
+        self.log ('get_post_as_string %s' % remotefile)
+        # headers = {'Content-Type':'application/x-www-form-urlencoded'}
+        headers = {'Content-Type':'text/xml; charset="utf-8"'}
+        # b64body = urlsafe_b64encode (asset_url)
+        response = self.request ('POST', remotefile, body, headers, False)
+        if response.status != 200:
+            self.__err (response, ('POST', remotefile, body, headers, 0))
+        try:
+            content_length = int (response.getheader ('Content-Length'))
+        except TypeError:
+            content_length = None
+        tmp_handle = os.tmpfile ()
+        self.__get_file_read (tmp_handle, response, content_length)
+        tmp_handle.seek (0)
+        ret = tmp_handle.read ()
+        tmp_handle.close ()
+        return ret
+
+
+    def __destination_command (self, verb, remotesrc, dstdav, remotedst):
+        """
+        self and dstdav should point to the same http server.
+        """
+        if len (remotedst) > 0 and remotedst[ 0 ] == '/':
+            remotedst = remotedst[1:]
+        headers = {'Destination': 'http://%s:%d%s%s' % (dstdav.host,
+                                                        dstdav.port,
+                                                        dstdav.top_path,
+                                                        remotedst)}
+        response = self.request (verb, remotesrc, None, headers)
+        if response.status == 201:
+            return # created
+        if response.status == 204:
+            return # no content
+        self.__err (response, (verb, remotesrc, None, headers, 0))
+
+
+    def rename (self, remotesrc, remotedst):
+        """ rename a file on a webdav server """
+        self.log ('rename %s -> %s' % (remotesrc, remotedst))
+        self.__destination_command ('MOVE', remotesrc, self, remotedst)
+    def xrename (self, remotesrc, dstdav, remotedst):
+        """ rename a file on a webdav server """
+        self.log ('xrename %s -> %s' % (remotesrc, remotedst))
+        self.__destination_command ('MOVE', remotesrc, dstdav, remotedst)
+
+
+    def copy (self, remotesrc, remotedst):
+        """ copy a file on a webdav server """
+        self.log ('copy %s -> %s' % (remotesrc, remotedst))
+        self.__destination_command ('COPY', remotesrc, self, remotedst)
+    def xcopy (self, remotesrc, dstdav, remotedst):
+        """ copy a file on a webdav server """
+        self.log ('xcopy %s -> %s' % (remotesrc, remotedst))
+        self.__destination_command ('COPY', remotesrc, dstdav, remotedst)
+
+
+def put_string (data, url):
+    """
+    upload string s to a url
+    """
+    url_parsed = urlparse.urlsplit (url)
+    dav = WebDAV ('%s://%s/' % (url_parsed[ 0 ], url_parsed[ 1 ]))
+    dav.put_string (data, url_parsed[ 2 ])
+
+
+def get_string (url, check_size=True):
+    """
+    return the contents of a url as a string
+    """
+    url_parsed = urlparse.urlsplit (url)
+    dav = WebDAV ('%s://%s/' % (url_parsed[ 0 ], url_parsed[ 1 ]))
+    return dav.get_file_as_string (url_parsed[ 2 ], check_size)