Implemented HTTP retry for requests. Went in rather easily, which surprised me.

Added a retry queue, similar to the ready queue, to the policy object; it is sorted by retry time. Currently does five retries (after the initial try) delayed by 0.25, 0.5, 1, 2 and 5 seconds. Removed the retry logic from the lltexturefetch module. Upped the waiting time in the unit test for the retries. People won't like this but tough, need tests.
author: Monty Brandenberg <monty@lindenlab.com> 2012-06-08 20:21:54 -0400
committer: Monty Brandenberg <monty@lindenlab.com> 2012-06-08 20:21:54 -0400
commit: 28a04400b4160dd34166483ddcf0c12637bcc363 (patch)
tree: e43ca202df6ed9bc8cbcc4a97d354e4798b7aa62 /indra
parent: 1e3d05329f2e823191c7c91926bee5ec9e5dc4d7 (diff)
12 files changed, 402 insertions, 94 deletions
diff --git a/indra/llcorehttp/_httplibcurl.cpp b/indra/llcorehttp/_httplibcurl.cpp
index 5272c391e8..05b2c2be69 100644
--- a/indra/llcorehttp/_httplibcurl.cpp
+++ b/indra/llcorehttp/_httplibcurl.cpp
@@ -29,6 +29,7 @@
 #include "httpheaders.h"
 #include "bufferarray.h"
 #include "_httpoprequest.h"
+#include "_httppolicy.h"
 
 
 namespace LLCore
@@ -85,6 +86,8 @@ void HttpLibcurl::term()
 
 HttpService::ELoopSpeed HttpLibcurl::processTransport()
 {
+	HttpService::ELoopSpeed	ret(HttpService::REQUEST_SLEEP);
+
 	// Give libcurl some cycles to do I/O & callbacks
 	for (int policy_class(0); policy_class < HttpRequest::POLICY_CLASS_LIMIT; ++policy_class)
 	{
@@ -110,7 +113,8 @@ HttpService::ELoopSpeed HttpLibcurl::processTransport()
 				CURL * handle(msg->easy_handle);
 				CURLcode result(msg->data.result);
 
-				completeRequest(mMultiHandles[policy_class], handle, result);
+				HttpService::ELoopSpeed	speed(completeRequest(mMultiHandles[policy_class], handle, result));
+				ret = (std::min)(ret, speed);
 				handle = NULL;			// No longer valid on return
 			}
 			else if (CURLMSG_NONE == msg->msg)
@@ -127,7 +131,11 @@ HttpService::ELoopSpeed HttpLibcurl::processTransport()
 		}
 	}
 
-	return mActiveOps.empty() ? HttpService::REQUEST_SLEEP : HttpService::NORMAL;
+	if (! mActiveOps.empty())
+	{
+		ret = (std::min)(ret, HttpService::NORMAL);
+	}
+	return ret;
 }
 
 
@@ -153,8 +161,12 @@ void HttpLibcurl::addOp(HttpOpRequest * op)
 }
 
 
-void HttpLibcurl::completeRequest(CURLM * multi_handle, CURL * handle, CURLcode status)
+HttpService::ELoopSpeed HttpLibcurl::completeRequest(CURLM * multi_handle, CURL * handle, CURLcode status)
 {
+	static const HttpStatus cant_connect(HttpStatus::EXT_CURL_EASY, CURLE_COULDNT_CONNECT);
+	static const HttpStatus cant_res_proxy(HttpStatus::EXT_CURL_EASY, CURLE_COULDNT_RESOLVE_PROXY);
+	static const HttpStatus cant_res_host(HttpStatus::EXT_CURL_EASY, CURLE_COULDNT_RESOLVE_HOST);
+
 	HttpOpRequest * op(NULL);
 	curl_easy_getinfo(handle, CURLINFO_PRIVATE, &op);
 	// *FIXME:  check the pointer
@@ -190,10 +202,7 @@ void HttpLibcurl::completeRequest(CURLM * multi_handle, CURL * handle, CURLcode
 		int http_status(200);
 
 		curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &http_status);
-		op->mStatus = LLCore::HttpStatus(http_status,
-										 (http_status >= 200 && http_status <= 299
-										  ? HE_SUCCESS
-										  : HE_REPLY_ERROR));
+		op->mStatus = LLCore::HttpStatus(http_status);
 	}
 
 	// Detach from multi and recycle handle
@@ -201,9 +210,42 @@ void HttpLibcurl::completeRequest(CURLM * multi_handle, CURL * handle, CURLcode
 	curl_easy_cleanup(handle);
 	op->mCurlHandle = NULL;
 	
-	// Deliver to reply queue and release
+	// Retry or finalize
+	if (! op->mStatus)
+	{
+		// If this failed, we might want to retry.  Have to inspect
+		// the status a little more deeply for those reasons worth retrying...
+		if (op->mPolicyRetries < op->mPolicyRetryLimit &&
+			((op->mStatus.isHttpStatus() && op->mStatus.mType >= 499 && op->mStatus.mType <= 599) ||
+			 cant_connect == op->mStatus ||
+			 cant_res_proxy == op->mStatus ||
+			 cant_res_host == op->mStatus))
+		{
+			// Okay, worth a retry.  We include 499 in this test as
+			// it's the old 'who knows?' error from many grid services...
+			HttpPolicy & policy(mService->getPolicy());
+		
+			policy.retryOp(op);
+			return HttpService::NORMAL;			// Having pushed to retry, keep things running
+		}
+	}
+
+	// This op is done, finalize it delivering it to the reply queue...
+	if (! op->mStatus)
+	{
+		LL_WARNS("CoreHttp") << "URL op failed after " << op->mPolicyRetries
+							 << " retries.  Reason:  " << op->mStatus.toString()
+							 << LL_ENDL;
+	}
+	else if (op->mPolicyRetries)
+	{
+		LL_WARNS("CoreHttp") << "URL op succeeded after " << op->mPolicyRetries << " retries."
+							 << LL_ENDL;
+	}
+	
 	op->stageFromActive(mService);
 	op->release();
+	return HttpService::REQUEST_SLEEP;
 }
 
 
diff --git a/indra/llcorehttp/_httplibcurl.h b/indra/llcorehttp/_httplibcurl.h
index ec325c1946..fe628b9ab0 100644
--- a/indra/llcorehttp/_httplibcurl.h
+++ b/indra/llcorehttp/_httplibcurl.h
@@ -83,7 +83,9 @@ public:
 protected:
 	/// Invoked when libcurl has indicated a request has been processed
 	/// to completion and we need to move the request to a new state.
-	void completeRequest(CURLM * multi_handle, CURL * handle, CURLcode status);
+	HttpService::ELoopSpeed completeRequest(CURLM * multi_handle,
+											CURL * handle,
+											CURLcode status);
 	
 protected:
 	typedef std::set<HttpOpRequest *> active_set_t;
diff --git a/indra/llcorehttp/_httpoprequest.cpp b/indra/llcorehttp/_httpoprequest.cpp
index 4bdc4a5257..895629c514 100644
--- a/indra/llcorehttp/_httpoprequest.cpp
+++ b/indra/llcorehttp/_httpoprequest.cpp
@@ -99,8 +99,15 @@ HttpOpRequest::HttpOpRequest()
 	  mReplyBody(NULL),
 	  mReplyOffset(0),
 	  mReplyLength(0),
-	  mReplyHeaders(NULL)
-{}
+	  mReplyHeaders(NULL),
+	  mPolicyRetries(0),
+	  mPolicyRetryAt(HttpTime(0)),
+	  mPolicyRetryLimit(5)				// *FIXME:  Get from policy definitions
+{
+	// *NOTE:  As members are added, retry initialization/cleanup
+	// may need to be extended in @prepareRequest().
+}
+
 
 
 HttpOpRequest::~HttpOpRequest()
@@ -130,7 +137,6 @@ HttpOpRequest::~HttpOpRequest()
 	}
 
 	mCurlService = NULL;
-	
 
 	if (mCurlHeaders)
 	{
@@ -313,6 +319,30 @@ HttpStatus HttpOpRequest::setupPost(HttpRequest::policy_t policy_id,
 
 HttpStatus HttpOpRequest::prepareRequest(HttpService * service)
 {
+	// Scrub transport and result data for retried op case
+	mCurlActive = false;
+	mCurlHandle = NULL;
+	mCurlService = NULL;
+	if (mCurlHeaders)
+	{
+		curl_slist_free_all(mCurlHeaders);
+		mCurlHeaders = NULL;
+	}
+	mCurlBodyPos = 0;
+
+	if (mReplyBody)
+	{
+		mReplyBody->release();
+		mReplyBody = NULL;
+	}
+	mReplyOffset = 0;
+	mReplyLength = 0;
+	if (mReplyHeaders)
+	{
+		mReplyHeaders->release();
+		mReplyHeaders = NULL;
+	}
+	
 	// *FIXME:  better error handling later
 	HttpStatus status;
 
@@ -321,6 +351,7 @@ HttpStatus HttpOpRequest::prepareRequest(HttpService * service)
 	
 	mCurlHandle = curl_easy_init();
 	// curl_easy_setopt(mCurlHandle, CURLOPT_VERBOSE, 1);
+	curl_easy_setopt(mCurlHandle, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
 	curl_easy_setopt(mCurlHandle, CURLOPT_TIMEOUT, 30);
 	curl_easy_setopt(mCurlHandle, CURLOPT_CONNECTTIMEOUT, 30);
 	curl_easy_setopt(mCurlHandle, CURLOPT_NOSIGNAL, 1);
@@ -403,12 +434,7 @@ HttpStatus HttpOpRequest::prepareRequest(HttpService * service)
 		break;
 	}
 	
-	if (mReqHeaders)
-	{
-		mCurlHeaders = append_headers_to_slist(mReqHeaders, mCurlHeaders);
-	}
-	mCurlHeaders = curl_slist_append(mCurlHeaders, "Pragma:");
-	
+	// There's a CURLOPT for this now...
 	if ((mReqOffset || mReqLength) && HOR_GET == mReqMethod)
 	{
 		static const char * const fmt1("Range: bytes=%lu-%lu");
@@ -428,6 +454,13 @@ HttpStatus HttpOpRequest::prepareRequest(HttpService * service)
 		range_line[sizeof(range_line) - 1] = '\0';
 		mCurlHeaders = curl_slist_append(mCurlHeaders, range_line);
 	}
+
+	mCurlHeaders = curl_slist_append(mCurlHeaders, "Pragma:");
+	if (mReqHeaders)
+	{
+		// Caller's headers last to override
+		mCurlHeaders = append_headers_to_slist(mReqHeaders, mCurlHeaders);
+	}
 	curl_easy_setopt(mCurlHandle, CURLOPT_HTTPHEADER, mCurlHeaders);
 	
 	if (mProcFlags & (PF_SCAN_RANGE_HEADER | PF_SAVE_HEADERS))
diff --git a/indra/llcorehttp/_httpoprequest.h b/indra/llcorehttp/_httpoprequest.h
index 0cad4e8459..6dcf30ca0c 100644
--- a/indra/llcorehttp/_httpoprequest.h
+++ b/indra/llcorehttp/_httpoprequest.h
@@ -128,6 +128,11 @@ public:
 	off_t				mReplyOffset;
 	size_t				mReplyLength;
 	HttpHeaders *		mReplyHeaders;
+
+	// Policy data
+	int					mPolicyRetries;
+	HttpTime			mPolicyRetryAt;
+	const int			mPolicyRetryLimit;
 };  // end class HttpOpRequest
 
 
diff --git a/indra/llcorehttp/_httppolicy.cpp b/indra/llcorehttp/_httppolicy.cpp
index 51f5e487dc..1f4cd34a4b 100644
--- a/indra/llcorehttp/_httppolicy.cpp
+++ b/indra/llcorehttp/_httppolicy.cpp
@@ -24,39 +24,46 @@
  * $/LicenseInfo$
  */
 
+#include "linden_common.h"
+
 #include "_httppolicy.h"
 
 #include "_httpoprequest.h"
 #include "_httpservice.h"
 #include "_httplibcurl.h"
 
+#include "lltimer.h"
+
 
 namespace LLCore
 {
 
 HttpPolicy::HttpPolicy(HttpService * service)
 	: mService(service)
-{
-	for (int policy_class(0); policy_class < HttpRequest::POLICY_CLASS_LIMIT; ++policy_class)
-	{
-		mReadyInClass[policy_class] = 0;
-	}
-}
+{}
 
 
 HttpPolicy::~HttpPolicy()
 {
-	for (int policy_class(0); policy_class < HttpRequest::POLICY_CLASS_LIMIT; ++policy_class)
+	for (int policy_class(0); policy_class < LL_ARRAY_SIZE(mState); ++policy_class)
 	{
-		HttpReadyQueue & readyq(mReadyQueue[policy_class]);
+		HttpRetryQueue & retryq(mState[policy_class].mRetryQueue);
+		while (! retryq.empty())
+		{
+			HttpOpRequest * op(retryq.top());
 		
+			op->cancel();
+			op->release();
+			retryq.pop();
+		}
+
+		HttpReadyQueue & readyq(mState[policy_class].mReadyQueue);
 		while (! readyq.empty())
 		{
 			HttpOpRequest * op(readyq.top());
 		
 			op->cancel();
 			op->release();
-			mReadyInClass[policy_class]--;
 			readyq.pop();
 		}
 	}
@@ -68,27 +75,69 @@ void HttpPolicy::addOp(HttpOpRequest * op)
 {
 	const int policy_class(op->mReqPolicy);
 	
-	mReadyQueue[policy_class].push(op);
-	++mReadyInClass[policy_class];
+	op->mPolicyRetries = 0;
+	mState[policy_class].mReadyQueue.push(op);
+}
+
+
+void HttpPolicy::retryOp(HttpOpRequest * op)
+{
+	static const HttpTime retry_deltas[] =
+		{
+			 250000,			// 1st retry in 0.25 S, etc...
+			 500000,
+			1000000,
+			2000000,
+			5000000				// ... to every 5.0 S.
+		};
+	static const int delta_max(int(LL_ARRAY_SIZE(retry_deltas)) - 1);
+	
+	const HttpTime now(totalTime());
+	const int policy_class(op->mReqPolicy);
+	
+	const HttpTime delta(retry_deltas[llclamp(op->mPolicyRetries, 0, delta_max)]);
+	op->mPolicyRetryAt = now + delta;
+	++op->mPolicyRetries;
+	LL_WARNS("CoreHttp") << "URL op retry #" << op->mPolicyRetries
+						 << " being scheduled for " << delta << " uSecs from now."
+						 << LL_ENDL;
+	mState[policy_class].mRetryQueue.push(op);
 }
 
 
 HttpService::ELoopSpeed HttpPolicy::processReadyQueue()
 {
+	const HttpTime now(totalTime());
 	HttpService::ELoopSpeed result(HttpService::REQUEST_SLEEP);
 	HttpLibcurl & transport(mService->getTransport());
 	
-	for (int policy_class(0); policy_class < HttpRequest::POLICY_CLASS_LIMIT; ++policy_class)
+	for (int policy_class(0); policy_class < LL_ARRAY_SIZE(mState); ++policy_class)
 	{
-		HttpReadyQueue & readyq(mReadyQueue[policy_class]);
 		int active(transport.getActiveCountInClass(policy_class));
 		int needed(8 - active);
 
-		if (needed > 0 && mReadyInClass[policy_class] > 0)
+		HttpRetryQueue & retryq(mState[policy_class].mRetryQueue);
+		HttpReadyQueue & readyq(mState[policy_class].mReadyQueue);
+		
+		if (needed > 0)
 		{
-			// Scan ready queue for requests that match policy
-
-			while (! readyq.empty() && needed > 0 && mReadyInClass[policy_class] > 0)
+			// First see if we have any retries...
+			while (needed > 0 && ! retryq.empty())
+			{
+				HttpOpRequest * op(retryq.top());
+				if (op->mPolicyRetryAt > now)
+					break;
+			
+				retryq.pop();
+				
+				op->stageFromReady(mService);
+				op->release();
+					
+				--needed;
+			}
+		
+			// Now go on to the new requests...
+			while (needed > 0 && ! readyq.empty())
 			{
 				HttpOpRequest * op(readyq.top());
 				readyq.pop();
@@ -96,17 +145,16 @@ HttpService::ELoopSpeed HttpPolicy::processReadyQueue()
 				op->stageFromReady(mService);
 				op->release();
 					
-				--mReadyInClass[policy_class];
 				--needed;
 			}
 		}
-
-		if (! readyq.empty())
+				
+		if (! readyq.empty() || ! retryq.empty())
 		{
 			// If anything is ready, continue looping...
 			result = (std::min)(result, HttpService::NORMAL);
 		}
-	}
+	} // end foreach policy_class
 
 	return result;
 }
@@ -114,9 +162,9 @@ HttpService::ELoopSpeed HttpPolicy::processReadyQueue()
 
 bool HttpPolicy::changePriority(HttpHandle handle, HttpRequest::priority_t priority)
 {
-	for (int policy_class(0); policy_class < HttpRequest::POLICY_CLASS_LIMIT; ++policy_class)
+	for (int policy_class(0); policy_class < LL_ARRAY_SIZE(mState); ++policy_class)
 	{
-		HttpReadyQueue::container_type & c(mReadyQueue[policy_class].get_container());
+		HttpReadyQueue::container_type & c(mState[policy_class].mReadyQueue.get_container());
 	
 		// Scan ready queue for requests that match policy
 		for (HttpReadyQueue::container_type::iterator iter(c.begin()); c.end() != iter;)
@@ -126,9 +174,9 @@ bool HttpPolicy::changePriority(HttpHandle handle, HttpRequest::priority_t prior
 			if (static_cast<HttpHandle>(*cur) == handle)
 			{
 				HttpOpRequest * op(*cur);
-				c.erase(cur);							// All iterators are now invalidated
+				c.erase(cur);									// All iterators are now invalidated
 				op->mReqPriority = priority;
-				mReadyQueue[policy_class].push(op);		// Re-insert using adapter class
+				mState[policy_class].mReadyQueue.push(op);		// Re-insert using adapter class
 				return true;
 			}
 		}
diff --git a/indra/llcorehttp/_httppolicy.h b/indra/llcorehttp/_httppolicy.h
index 425079ec63..6f18264f3d 100644
--- a/indra/llcorehttp/_httppolicy.h
+++ b/indra/llcorehttp/_httppolicy.h
@@ -31,6 +31,7 @@
 #include "httprequest.h"
 #include "_httpservice.h"
 #include "_httpreadyqueue.h"
+#include "_httpretryqueue.h"
 #include "_httppolicyglobal.h"
 
 
@@ -67,6 +68,14 @@ public:
 	/// additional references will be added.)
 	void addOp(HttpOpRequest *);
 
+	/// Similar to addOp, used when a caller wants to retry a
+	/// request that has failed.  It's placed on a special retry
+	/// queue but ordered by retry time not priority.  Otherwise,
+	/// handling is the same and retried operations are considered
+	/// before new ones but that doesn't guarantee completion
+	/// order.
+	void retryOp(HttpOpRequest *);
+
 	// Shadows HttpService's method
 	bool changePriority(HttpHandle handle, HttpRequest::priority_t priority);
 
@@ -77,10 +86,14 @@ public:
 			return mGlobalOptions;
 		}
 	
-	
 protected:
-	int					mReadyInClass[HttpRequest::POLICY_CLASS_LIMIT];
-	HttpReadyQueue		mReadyQueue[HttpRequest::POLICY_CLASS_LIMIT];
+	struct State
+	{
+		HttpReadyQueue		mReadyQueue;
+		HttpRetryQueue		mRetryQueue;
+	};
+
+	State				mState[HttpRequest::POLICY_CLASS_LIMIT];
 	HttpService *		mService;				// Naked pointer, not refcounted, not owner
 	HttpPolicyGlobal	mGlobalOptions;
 	
diff --git a/indra/llcorehttp/_httpreadyqueue.h b/indra/llcorehttp/_httpreadyqueue.h
index 2cd96aefe3..87828834dc 100644
--- a/indra/llcorehttp/_httpreadyqueue.h
+++ b/indra/llcorehttp/_httpreadyqueue.h
@@ -36,8 +36,6 @@
 namespace LLCore
 {
 
-class HttpOpRequest;
-
 /// HttpReadyQueue provides a simple priority queue for HttpOpRequest objects.
 ///
 /// This uses the priority_queue adaptor class to provide the queue
diff --git a/indra/llcorehttp/_httpretryqueue.h b/indra/llcorehttp/_httpretryqueue.h
new file mode 100644
index 0000000000..745adec09d
--- /dev/null
+++ b/indra/llcorehttp/_httpretryqueue.h
@@ -0,0 +1,94 @@
+/**
+ * @file _httpretryqueue.h
+ * @brief Internal declaration for the operation retry queue
+ *
+ * $LicenseInfo:firstyear=2012&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2012, Linden Research, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
+ * $/LicenseInfo$
+ */
+
+#ifndef	_LLCORE_HTTP_RETRY_QUEUE_H_
+#define	_LLCORE_HTTP_RETRY_QUEUE_H_
+
+
+#include <queue>
+
+#include "_httpoprequest.h"
+
+
+namespace LLCore
+{
+
+/// HttpRetryQueue provides a simple priority queue for HttpOpRequest objects.
+///
+/// This uses the priority_queue adaptor class to provide the queue
+/// as well as the ordering scheme while allowing us access to the
+/// raw container if we follow a few simple rules.  One of the more
+/// important of those rules is that any iterator becomes invalid
+/// on element erasure.  So pay attention.
+///
+/// Threading:  not thread-safe.  Expected to be used entirely by
+/// a single thread, typically a worker thread of some sort.
+
+struct HttpOpRetryCompare
+{
+	// std::priority_queue keeps the "greatest" element at top(), so order
+	// later retry times as "less" to leave the earliest-due op on top.
+	bool operator()(const HttpOpRequest * lhs, const HttpOpRequest * rhs)
+		{ return lhs->mPolicyRetryAt > rhs->mPolicyRetryAt; }
+};
+
+	
+typedef std::priority_queue<HttpOpRequest *,
+							std::deque<HttpOpRequest *>,
+							LLCore::HttpOpRetryCompare> HttpRetryQueueBase;
+
+class HttpRetryQueue : public HttpRetryQueueBase
+{
+public:
+	HttpRetryQueue()
+		: HttpRetryQueueBase()
+		{}
+	
+	~HttpRetryQueue()
+		{}
+	
+protected:
+	HttpRetryQueue(const HttpRetryQueue &);		// Not defined
+	void operator=(const HttpRetryQueue &);		// Not defined
+
+public:
+	const container_type & get_container() const
+		{
+			return c;
+		}
+
+	container_type & get_container()
+		{
+			return c;
+		}
+
+}; // end class HttpRetryQueue
+
+
+}  // end namespace LLCore
+
+
+#endif	// _LLCORE_HTTP_RETRY_QUEUE_H_
diff --git a/indra/llcorehttp/httpcommon.h b/indra/llcorehttp/httpcommon.h
index fd2661b700..42b75edb41 100644
--- a/indra/llcorehttp/httpcommon.h
+++ b/indra/llcorehttp/httpcommon.h
@@ -114,6 +114,9 @@ namespace LLCore
 typedef void * HttpHandle;
 #define LLCORE_HTTP_HANDLE_INVALID		(NULL)
 
+/// For internal scheduling and metrics, we use a microsecond
+/// timebase compatible with the environment.
+typedef U64 HttpTime;
 
 /// Error codes defined by the library itself as distinct from
 /// libcurl (or any other transport provider).
@@ -180,6 +183,15 @@ struct HttpStatus
 		  mStatus(status)
 		{}
 	
+	HttpStatus(int http_status)
+		: mType(http_status),
+		  mStatus(http_status >= 200 && http_status <= 299
+				  ? HE_SUCCESS
+				  : HE_REPLY_ERROR)
+		{
+			llassert(http_status >= 100 && http_status <= 999);
+		}
+	
 	HttpStatus(const HttpStatus & rhs)
 		: mType(rhs.mType),
 		  mStatus(rhs.mStatus)
diff --git a/indra/llcorehttp/tests/test_httprequest.hpp b/indra/llcorehttp/tests/test_httprequest.hpp
index 0e9d7d8979..2d91b95347 100644
--- a/indra/llcorehttp/tests/test_httprequest.hpp
+++ b/indra/llcorehttp/tests/test_httprequest.hpp
@@ -381,7 +381,7 @@ void HttpRequestTestObjectType::test<5>()
 
 	// Run the notification pump.
 	int count(0);
-	int limit(20);
+	int limit(180);				// With retries, can take more than 10 seconds to give up
 	while (count++ < limit && mHandlerCalls < 1)
 	{
 		req->update(1000);
diff --git a/indra/newview/lltexturefetch.cpp b/indra/newview/lltexturefetch.cpp
index 92b847345d..4a46ea0e97 100644
--- a/indra/newview/lltexturefetch.cpp
+++ b/indra/newview/lltexturefetch.cpp
@@ -67,9 +67,53 @@
 // This is an attempt to document what's going on in here after-the-fact.
 // It's a sincere attempt to be accurate but there will be mistakes.
 //
+//
 // Purpose
 //
-// (What is this solving?)
+// What is this module trying to do?  It accepts requests to load textures
+// at a given priority and discard level and notifies the caller when done
+// (successfully or not).  Additional constraints are:
+//
+// * Support a local texture cache.  Don't hit network when possible
+//   to avoid it.
+// * Use UDP or HTTP as directed or as fallback.  HTTP is tried when
+//   not disabled and a URL is available.  UDP when a URL isn't
+//   available or HTTP attempts fail.
+// * Asynchronous (using threads).  Main thread is not to be blocked or
+//   burdened.
+// * High concurrency.  Many requests need to be in-flight and at various
+//   stages of completion.
+// * Tolerate frequent re-prioritizations of requests.  Priority is
+//   a reflection of a camera's viewpoint and as that viewpoint changes,
+//   objects and textures become more and less relevant and that is
+//   expressed at this level by priority changes and request cancelations.
+//
+// The caller interfaces that fall out of the above and shape the
+// implementation are:
+// * createRequest - Load j2c image via UDP or HTTP at given discard level and priority
+// * deleteRequest - Request removal of prior request
+// * getRequestFinished - Test if request is finished returning data to caller
+// * updateRequestPriority - Change priority of existing request
+// * getFetchState - Retrieve progress on existing request
+//
+// Everything else in here is mostly plumbing, metrics and debug.
+//
+//
+// The Work Queue
+//
+// The two central classes are LLTextureFetch and LLTextureFetchWorker.
+// LLTextureFetch combines threading with a priority queue of work
+// requests.  The priority queue is sorted by a U32 priority derived
+// from the F32 priority in the APIs.  The *only* work request that
+// receives service time by this thread is the highest priority
+// request.  All others wait until it is complete or a dynamic priority
+// change has re-ordered work.
+//
+// LLTextureFetchWorker implements the work request and is 1:1 with
+// texture fetch requests.  Embedded in each is a state machine that
+// walks it through the cache, HTTP, UDP, image decode and retry
+// steps of texture acquisition.
+//
 //
 // Threads
 //
@@ -83,6 +127,7 @@
 // 5.  Tid      Image decoder's worker thread
 // 6.  Thl      HTTP library's worker thread
 //
+//
 // Mutexes/Condition Variables
 //
 // 1.  Mt       Mutex defined for LLThread's condition variable (base class of
@@ -98,6 +143,7 @@
 //              LLTextureFetchWorker).  One per request.
 // 7.  Mw       LLTextureFetchWorker's mutex.  One per request.
 //
+//
 // Lock Ordering Rules
 //
 // Not an exhaustive list but shows the order of lock acquisition
@@ -105,6 +151,8 @@
 // acquiring 'B'.
 //
 // 1.    Mw < Mfnq
+// (there are many more...)
+//
 //
 // Method and Member Definitions
 //
@@ -124,7 +172,10 @@
 // comment can mean the member is unlocked or that I didn't bother
 // to do the archaeology.  In the case of LLTextureFetchWorker,
 // most data members added by the leaf class are actually covered
-// by the Mw lock.
+// by the Mw lock.  You may also see "// T<xxx>" which means that
+// the member's usage is restricted to one thread (except for
+// perhaps construction and destruction) and so explicit locking
+// isn't used.
 //
 // In code, a trailing comment like "// [-+]M<xxx>" indicates a
 // lock acquision or release point.
@@ -132,27 +183,54 @@
 //
 // Worker Lifecycle
 //
-// (Can't unilaterally delete, cleanup is two-phase, etc.)
+// The threading and responder model makes it very likely that
+// other components are holding on to a pointer to a worker request.
+// So, uncoordinated deletion of requests is a guarantee of memory
+// corruption in a short time.  So destroying a request involves
+// invocations of LLQueuedThread/LLWorkerThread's abort/stop
+// logic that removes workers and puts them on a delete queue for
+// 2-phase destruction.  That second phase is deferrable by calls
+// to deleteOK() which only allow final destruction (via dtor)
+// once deleteOK has determined that the request is in a safe
+// state.
+//
 //
 // Worker State Machine
 //
 // (ASCII art needed)
 //
+//
 // Priority Scheme
 //
 // [PRIORITY_LOW, PRIORITY_NORMAL)   - for WAIT_HTTP_RESOURCE state
 // [PRIORITY_NORMAL, PRIORITY_HIGH)  - waiting for external event
-// [PRIORITY_HIGH, PRIORITY_URGENT)  - rapidly transitioning through states,
+// [PRIORITY_HIGH, PRIORITY_URGENT)  - External event delivered,
+//                                     rapidly transitioning through states,
 //                                     no waiting allowed
 //
+// By itself, the above work queue model would fail the concurrency
+// and liveness requirements of the interface.  A high priority
+// request could find itself on the head and stalled for external
+// reasons (see VWR-28996).  So a few additional constraints are
+// required to keep things running:
+// * Anything that can make forward progress must be kept at a
+//   higher priority than anything that can't.
+// * On completion of external events, the associated request
+//   needs to be elevated beyond the normal range to handle
+//   any data delivery and release any external resource.
+//
+// This effort is made to keep higher-priority entities moving
+// forward in their state machines at every possible step of
+// processing.  It's not entirely proven that this produces the
+// experiential benefits promised.
 //
 
 //////////////////////////////////////////////////////////////////////////////
 
 // Tuning/Parameterization Constants
 
-static const S32 HTTP_REQUESTS_IN_QUEUE_HIGH_WATER = 40;
-static const S32 HTTP_REQUESTS_IN_QUEUE_LOW_WATER = 20;
+static const S32 HTTP_REQUESTS_IN_QUEUE_HIGH_WATER = 40;		// Maximum requests to have active in HTTP
+static const S32 HTTP_REQUESTS_IN_QUEUE_LOW_WATER = 20;			// Active level at which to refill
 
 
 //////////////////////////////////////////////////////////////////////////////
@@ -425,7 +503,6 @@ private:
 	BOOL mInLocalCache;
 	bool mCanUseHTTP ;
 	bool mCanUseNET ; //can get from asset server.
-	S32 mHTTPFailCount;
 	S32 mRetryAttempt;
 	S32 mActiveCount;
 	LLCore::HttpStatus mGetStatus;
@@ -745,7 +822,6 @@ LLTextureFetchWorker::LLTextureFetchWorker(LLTextureFetch* fetcher,
 	  mHaveAllData(FALSE),
 	  mInLocalCache(FALSE),
 	  mCanUseHTTP(true),
-	  mHTTPFailCount(0),
 	  mRetryAttempt(0),
 	  mActiveCount(0),
 	  mWorkMutex(NULL),
@@ -936,6 +1012,9 @@ void LLTextureFetchWorker::startWork(S32 param)
 // Threads:  Ttf
 bool LLTextureFetchWorker::doWork(S32 param)
 {
+	static const LLCore::HttpStatus http_not_found(HTTP_NOT_FOUND);
+	static const LLCore::HttpStatus http_service_unavail(HTTP_SERVICE_UNAVAILABLE);
+	
 	// Release waiters while we aren't holding the Mw lock.
 	mFetcher->releaseHttpWaiters();
 	
@@ -1286,7 +1365,6 @@ bool LLTextureFetchWorker::doWork(S32 param)
 		{
 			llwarns << "HTTP GET request failed for " << mID << llendl;
 			resetFormattedData();
-			++mHTTPFailCount;
 			return true; // failed
 		}
 
@@ -1313,10 +1391,8 @@ bool LLTextureFetchWorker::doWork(S32 param)
 			S32 cur_size = mFormattedImage.notNull() ? mFormattedImage->getDataSize() : 0;
 			if (mRequestedSize < 0)
 			{
-				S32 max_attempts;
-				if (mGetStatus == LLCore::HttpStatus(HTTP_NOT_FOUND, LLCore::HE_REPLY_ERROR))
+				if (http_not_found == mGetStatus)
 				{
-					mHTTPFailCount = max_attempts = 1; // Don't retry
 					llwarns << "Texture missing from server (404): " << mUrl << llendl;
 
 					// roll back to try UDP
@@ -1328,47 +1404,32 @@ bool LLTextureFetchWorker::doWork(S32 param)
 						return false;
 					}
 				}
-				else if (mGetStatus == LLCore::HttpStatus(HTTP_SERVICE_UNAVAILABLE, LLCore::HE_REPLY_ERROR))
+				else if (http_service_unavail == mGetStatus)
 				{
-					// *TODO: Should probably introduce a timer here to delay future HTTP requsts
-					// for a short time (~1s) to ease server load? Ideally the server would queue
-					// requests instead of returning 503... we already limit the number pending.
-					++mHTTPFailCount;
-					max_attempts = mHTTPFailCount+1; // Keep retrying
 					LL_INFOS_ONCE("Texture") << "Texture server busy (503): " << mUrl << LL_ENDL;
 				}
 				else
 				{
-					const S32 HTTP_MAX_RETRY_COUNT = 3;
-					max_attempts = HTTP_MAX_RETRY_COUNT + 1;
-					++mHTTPFailCount;
 					llinfos << "HTTP GET failed for: " << mUrl
 							<< " Status: " << mGetStatus.toHex()
 							<< " Reason: '" << mGetReason << "'"
-							<< " Attempt:" << mHTTPFailCount+1 << "/" << max_attempts << llendl;
+						// *FIXME:  Add retry info for reporting purposes...
+						// << " Attempt:" << mHTTPFailCount+1 << "/" << max_attempts
+							<< llendl;
 				}
 
-				if (mHTTPFailCount >= max_attempts)
-				{
-					if (cur_size > 0)
-					{
-						// Use available data
-						mLoadedDiscard = mFormattedImage->getDiscardLevel();
-						mState = DECODE_IMAGE;
-						return false; 
-					}
-					else
-					{
-						resetFormattedData();
-						mState = DONE;
-						return true; // failed
-					}
-				}
-				else
+				if (cur_size > 0)
 				{
-					mState = SEND_HTTP_REQ;
-					return false; // retry
+					// Use available data
+					mLoadedDiscard = mFormattedImage->getDiscardLevel();
+					mState = DECODE_IMAGE;
+					return false; 
 				}
+
+				// Fail harder
+				resetFormattedData();
+				mState = DONE;
+				return true; // failed
 			}
 			
 			if (! mHttpBufferArray || ! mHttpBufferArray->size())
@@ -1649,7 +1710,7 @@ void LLTextureFetchWorker::onCompleted(LLCore::HttpHandle handle, LLCore::HttpRe
 	}
 	else
 	{
-		static const LLCore::HttpStatus par_status(LLCore::HttpStatus(HTTP_PARTIAL_CONTENT, LLCore::HE_SUCCESS));
+		static const LLCore::HttpStatus par_status(HTTP_PARTIAL_CONTENT);
 
 		partial = (par_status == status);
 	}
diff --git a/indra/newview/lltexturefetch.h b/indra/newview/lltexturefetch.h
index 53b0f7885f..4ee13d171e 100644
--- a/indra/newview/lltexturefetch.h
+++ b/indra/newview/lltexturefetch.h
@@ -308,9 +308,9 @@ private:
 	// Interfaces and objects into the core http library used
 	// to make our HTTP requests.  These replace the various
 	// LLCurl interfaces used in the past.
-	LLCore::HttpRequest *		mHttpRequest;
-	LLCore::HttpOptions *		mHttpOptions;
-	LLCore::HttpHeaders *		mHttpHeaders;
+	LLCore::HttpRequest *		mHttpRequest;							// Ttf
+	LLCore::HttpOptions *		mHttpOptions;							// Ttf
+	LLCore::HttpHeaders *		mHttpHeaders;							// Ttf
 
 	typedef std::set<LLUUID> wait_http_res_queue_t;
 	wait_http_res_queue_t		mHttpWaitResource;						// Mfnq
author	Monty Brandenberg <monty@lindenlab.com>	2012-06-08 20:21:54 -0400
committer	Monty Brandenberg <monty@lindenlab.com>	2012-06-08 20:21:54 -0400
commit	28a04400b4160dd34166483ddcf0c12637bcc363 (patch)
tree	e43ca202df6ed9bc8cbcc4a97d354e4798b7aa62 /indra
parent	1e3d05329f2e823191c7c91926bee5ec9e5dc4d7 (diff)