6 files changed, 344 insertions, 484 deletions
diff --git a/indra/llmath/llcamera.cpp b/indra/llmath/llcamera.cpp
index 9034182072..18d704dd0f 100644
--- a/indra/llmath/llcamera.cpp
+++ b/indra/llmath/llcamera.cpp
@@ -311,104 +311,6 @@ int LLCamera::sphereInFrustumQuick(const LLVector3 &sphere_center, const F32 rad
 	return 0;	
 }
 
-// HACK: This version is still around because the version below doesn't work
-// unless the agent planes are initialized.
-// Return 1 if sphere is in frustum, 2 if fully in frustum, otherwise 0.
-// NOTE: 'center' is in absolute frame.
-int LLCamera::sphereInFrustumOld(const LLVector3 &sphere_center, const F32 radius) const 
-{
-	// Returns 1 if sphere is in frustum, 0 if not.
-	// modified so that default view frust is along X with Z vertical
-	F32 x, y, z, rightDist, leftDist, topDist, bottomDist;
-
-	// Subtract the view position 
-	//LLVector3 relative_center;
-	//relative_center = sphere_center - getOrigin();
-	LLVector3 rel_center(sphere_center);
-	rel_center -= mOrigin;
-
-	bool all_in = TRUE;
-
-	// Transform relative_center.x to camera frame
-	x = mXAxis * rel_center;
-	if (x < MIN_NEAR_PLANE - radius)
-	{
-		return 0;
-	}
-	else if (x < MIN_NEAR_PLANE + radius)
-	{
-		all_in = FALSE;
-	}
-
-	if (x > mFarPlane + radius)
-	{
-		return 0;
-	}
-	else if (x > mFarPlane - radius)
-	{
-		all_in = FALSE;
-	}
-
-	// Transform relative_center.y to camera frame
-	y = mYAxis * rel_center;
-
-	// distance to plane is the dot product of (x, y, 0) * plane_normal
-	rightDist = x * mLocalPlanes[PLANE_RIGHT][VX] + y * mLocalPlanes[PLANE_RIGHT][VY];
-	if (rightDist < -radius)
-	{
-		return 0;
-	}
-	else if (rightDist < radius)
-	{
-		all_in = FALSE;
-	}
-
-	leftDist = x * mLocalPlanes[PLANE_LEFT][VX] + y * mLocalPlanes[PLANE_LEFT][VY];
-	if (leftDist < -radius)
-	{
-		return 0;
-	}
-	else if (leftDist < radius)
-	{
-		all_in = FALSE;
-	}
-
-	// Transform relative_center.y to camera frame
-	z = mZAxis * rel_center;
-
-	topDist = x * mLocalPlanes[PLANE_TOP][VX] + z * mLocalPlanes[PLANE_TOP][VZ];
-	if (topDist < -radius)
-	{
-		return 0;
-	}
-	else if (topDist < radius)
-	{
-		all_in = FALSE;
-	}
-
-	bottomDist = x * mLocalPlanes[PLANE_BOTTOM][VX] + z * mLocalPlanes[PLANE_BOTTOM][VZ];
-	if (bottomDist < -radius)
-	{
-		return 0;
-	}
-	else if (bottomDist < radius)
-	{
-		all_in = FALSE;
-	}
-
-	if (all_in)
-	{
-		return 2;
-	}
-
-	return 1;
-}
-
-
-// HACK: This (presumably faster) version only currently works if you set up the
-// frustum planes using GL.  At some point we should get those planes through another
-// mechanism, and then we can get rid of the "old" version above.
-
 // Return 1 if sphere is in frustum, 2 if fully in frustum, otherwise 0.
 // NOTE: 'center' is in absolute frame.
 int LLCamera::sphereInFrustum(const LLVector3 &sphere_center, const F32 radius) const 
@@ -463,65 +365,6 @@ F32 LLCamera::heightInPixels(const LLVector3 &center, F32 radius ) const
 	}
 }
 
-// If pos is visible, return the distance from pos to the camera.
-// Use fudge distance to scale rad against top/bot/left/right planes
-// Otherwise, return -distance
-F32 LLCamera::visibleDistance(const LLVector3 &pos, F32 rad, F32 fudgedist, U32 planemask) const
-{
-	if (mFixedDistance > 0)
-	{
-		return mFixedDistance;
-	}
-	LLVector3 dvec = pos - mOrigin;
-	// Check visibility
-	F32 dist = dvec.magVec();
-	if (dist > rad)
-	{
- 		F32 dp,tdist;
- 		dp = dvec * mXAxis;
-  		if (dp < -rad)
-  			return -dist;
-
-		rad *= fudgedist;
-		LLVector3 tvec(pos);
-		for (int p=0; p<PLANE_NUM; p++)
-		{
-			if (!(planemask & (1<<p)))
-				continue;
-			tdist = -(mWorldPlanes[p].dist(tvec));
-			if (tdist > rad)
-				return -dist;
-		}
-	}
-	return dist;
-}
-
-// Like visibleDistance, except uses mHorizPlanes[], which are left and right
-//  planes perpindicular to (0,0,1) in world space
-F32 LLCamera::visibleHorizDistance(const LLVector3 &pos, F32 rad, F32 fudgedist, U32 planemask) const
-{
-	if (mFixedDistance > 0)
-	{
-		return mFixedDistance;
-	}
-	LLVector3 dvec = pos - mOrigin;
-	// Check visibility
-	F32 dist = dvec.magVec();
-	if (dist > rad)
-	{
-		rad *= fudgedist;
-		LLVector3 tvec(pos);
-		for (int p=0; p<HORIZ_PLANE_NUM; p++)
-		{
-			if (!(planemask & (1<<p)))
-				continue;
-			F32 tdist = -(mHorizPlanes[p].dist(tvec));
-			if (tdist > rad)
-				return -dist;
-		}
-	}
-	return dist;
-}
 
 // ---------------- friends and operators ----------------  
 
@@ -536,18 +379,6 @@ std::ostream& operator<<(std::ostream &s, const LLCamera &C)
 	s << "  Aspect = " << C.getAspect() << "\n";
 	s << "  NearPlane   = " << C.mNearPlane << "\n";
 	s << "  FarPlane    = " << C.mFarPlane << "\n";
-	s << "  TopPlane    = " << C.mLocalPlanes[LLCamera::PLANE_TOP][VX] << "  " 
-							<< C.mLocalPlanes[LLCamera::PLANE_TOP][VY] << "  " 
-							<< C.mLocalPlanes[LLCamera::PLANE_TOP][VZ] << "\n";
-	s << "  BottomPlane = " << C.mLocalPlanes[LLCamera::PLANE_BOTTOM][VX] << "  " 
-							<< C.mLocalPlanes[LLCamera::PLANE_BOTTOM][VY] << "  " 
-							<< C.mLocalPlanes[LLCamera::PLANE_BOTTOM][VZ] << "\n";
-	s << "  LeftPlane   = " << C.mLocalPlanes[LLCamera::PLANE_LEFT][VX] << "  " 
-							<< C.mLocalPlanes[LLCamera::PLANE_LEFT][VY] << "  " 
-							<< C.mLocalPlanes[LLCamera::PLANE_LEFT][VZ] << "\n";
-	s << "  RightPlane  = " << C.mLocalPlanes[LLCamera::PLANE_RIGHT][VX] << "  " 
-							<< C.mLocalPlanes[LLCamera::PLANE_RIGHT][VY] << "  " 
-							<< C.mLocalPlanes[LLCamera::PLANE_RIGHT][VZ] << "\n";
 	s << "}";
 	return s;
 }
@@ -675,26 +506,6 @@ void LLCamera::calcRegionFrustumPlanes(const LLVector3& shift, F32 far_clip_dist
 
 void LLCamera::calculateFrustumPlanes(F32 left, F32 right, F32 top, F32 bottom)
 {
-	LLVector3 a, b, c;
-
-	// For each plane we need to define 3 points (LLVector3's) in camera view space.  
-	// The order in which we pass the points to planeFromPoints() matters, because the 
-	// plane normal has a degeneracy of 2; we want it pointing _into_ the frustum. 
-
-	a.setVec(0.0f, 0.0f, 0.0f);
-	b.setVec(mFarPlane, right, top);
-	c.setVec(mFarPlane, right, bottom);
-	mLocalPlanes[PLANE_RIGHT].setVec(a, b, c);
-
-	c.setVec(mFarPlane, left, top);
-	mLocalPlanes[PLANE_TOP].setVec(a, c, b);
-
-	b.setVec(mFarPlane, left, bottom);
-	mLocalPlanes[PLANE_LEFT].setVec(a, b, c);
-
-	c.setVec(mFarPlane, right, bottom);
-	mLocalPlanes[PLANE_BOTTOM].setVec( a, c, b); 
-
 	//calculate center and radius squared of frustum in world absolute coordinates
 	static LLVector3 const X_AXIS(1.f, 0.f, 0.f);
 	mFrustCenter = X_AXIS*mFarPlane*0.5f;
@@ -718,39 +529,6 @@ void LLCamera::calculateFrustumPlanesFromWindow(F32 x1, F32 y1, F32 x2, F32 y2)
 	calculateFrustumPlanes(left, right, top, bottom);
 }
 
-void LLCamera::calculateWorldFrustumPlanes() 
-{
-	F32 d;
-	LLVector3 center = mOrigin - mXAxis*mNearPlane;
-	mWorldPlanePos = center;
-	LLVector3 pnorm;	
-	for (int p = 0; p < PLANE_NUM; p++)
-	{
-		mLocalPlanes[p].getVector3(pnorm);
-		LLVector3 norm = rotateToAbsolute(pnorm);
-		norm.normVec();
-		d = -(center * norm);
-		mWorldPlanes[p] = LLPlane(norm, d);
-	}
-	// horizontal planes, perpindicular to (0,0,1);
-	LLVector3 zaxis(0, 0, 1.0f);
-	F32 yaw = getYaw();
-	{
-		LLVector3 tnorm;
-		mLocalPlanes[PLANE_LEFT].getVector3(tnorm);
-		tnorm.rotVec(yaw, zaxis);
-		d = -(mOrigin * tnorm);
-		mHorizPlanes[HORIZ_PLANE_LEFT] = LLPlane(tnorm, d);
-	}
-	{
-		LLVector3 tnorm;
-		mLocalPlanes[PLANE_RIGHT].getVector3(tnorm);
-		tnorm.rotVec(yaw, zaxis);
-		d = -(mOrigin * tnorm);
-		mHorizPlanes[HORIZ_PLANE_RIGHT] = LLPlane(tnorm, d);
-	}
-}
-
 // NOTE: this is the OpenGL matrix that will transform the default OpenGL view 
 // (-Z=at, Y=up) to the default view of the LLCamera class (X=at, Z=up):
 // 
diff --git a/indra/llmath/llcamera.h b/indra/llmath/llcamera.h
index d0afa0e88f..27eaa614c9 100644
--- a/indra/llmath/llcamera.h
+++ b/indra/llmath/llcamera.h
@@ -131,14 +131,10 @@ private:
 	S32 mViewHeightInPixels;	// for ViewHeightInPixels() only
 	F32 mNearPlane;
 	F32 mFarPlane;
-	LL_ALIGN_16(LLPlane mLocalPlanes[PLANE_NUM]);
 	F32 mFixedDistance;			// Always return this distance, unless < 0
 	LLVector3 mFrustCenter;		// center of frustum and radius squared for ultra-quick exclusion test
 	F32 mFrustRadiusSquared;
 	
-	LL_ALIGN_16(LLPlane mWorldPlanes[PLANE_NUM]);
-	LL_ALIGN_16(LLPlane mHorizPlanes[HORIZ_PLANE_NUM]);
-
 	U32 mPlaneCount;  //defaults to 6, if setUserClipPlane is called, uses user supplied clip plane in
 
 	LLVector3 mWorldPlanePos;		// Position of World Planes (may be offset from camera)
@@ -184,7 +180,6 @@ public:
 		return atan2f(mXAxis[VZ], xylen);
 	}
 
-	const LLPlane& getWorldPlane(S32 index) const	{ return mWorldPlanes[index]; }
 	const LLVector3& getWorldPlanePos() const		{ return mWorldPlanePos; }
 	
 	// Copy mView, mAspect, mNearPlane, and mFarPlane to buffer.
@@ -200,7 +195,6 @@ public:
 
 	// Returns 1 if partly in, 2 if fully in.
 	// NOTE: 'center' is in absolute frame.
-	S32 sphereInFrustumOld(const LLVector3 &center, const F32 radius) const;
 	S32 sphereInFrustum(const LLVector3 &center, const F32 radius) const;
 	S32 pointInFrustum(const LLVector3 &point) const { return sphereInFrustum(point, 0.0f); }
 	S32 sphereInFrustumFull(const LLVector3 &center, const F32 radius) const { return sphereInFrustum(center, radius); }
@@ -217,8 +211,6 @@ public:
 	F32 heightInPixels(const LLVector3 &center, F32 radius ) const;
 
 	// return the distance from pos to camera if visible (-distance if not visible)
-	F32 visibleDistance(const LLVector3 &pos, F32 rad, F32 fudgescale = 1.0f, U32 planemask = PLANE_ALL_MASK) const;
-	F32 visibleHorizDistance(const LLVector3 &pos, F32 rad, F32 fudgescale = 1.0f, U32 planemask = HORIZ_PLANE_ALL_MASK) const;
 	void setFixedDistance(F32 distance) { mFixedDistance = distance; }
 	
 	friend std::ostream& operator<<(std::ostream &s, const LLCamera &C);
@@ -227,7 +219,6 @@ protected:
 	void calculateFrustumPlanes();
 	void calculateFrustumPlanes(F32 left, F32 right, F32 top, F32 bottom);
 	void calculateFrustumPlanesFromWindow(F32 x1, F32 y1, F32 x2, F32 y2);
-	void calculateWorldFrustumPlanes();
 } LL_ALIGN_POSTFIX(16);
 
 
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index 40f7b1e9fb..91e463cc32 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -32,6 +32,7 @@
 #include <stdint.h>
 #endif
 #include <cmath>
+#include <unordered_map>
 
 #include "llerror.h"
 
@@ -52,6 +53,11 @@
 #include "llmeshoptimizer.h"
 #include "lltimer.h"
 
+#include "mikktspace/mikktspace.h"
+#include "mikktspace/mikktspace.c" // insert mikktspace implementation into llvolume object file
+
+#include "meshoptimizer/meshoptimizer.h"
+
 #define DEBUG_SILHOUETTE_BINORMALS 0
 #define DEBUG_SILHOUETTE_NORMALS 0 // TomY: Use this to display normals using the silhouette
 #define DEBUG_SILHOUETTE_EDGE_MAP 0 // DaveP: Use this to display edge map using the silhouette
@@ -2093,7 +2099,9 @@ void LLVolume::regen()
 
 void LLVolume::genTangents(S32 face)
 {
-	mVolumeFaces[face].createTangents();
+    // generate legacy tangents for the specified face
+    llassert(!isMeshAssetLoaded() || mVolumeFaces[face].mTangents != nullptr); // if this is a complete mesh asset, we should already have tangents
+    mVolumeFaces[face].createTangents();
 }
 
 LLVolume::~LLVolume()
@@ -2433,11 +2441,10 @@ bool LLVolume::unpackVolumeFacesInternal(const LLSD& mdl)
 
 			LLSD::Binary pos = mdl[i]["Position"];
 			LLSD::Binary norm = mdl[i]["Normal"];
+            LLSD::Binary tangent = mdl[i]["Tangent"];
 			LLSD::Binary tc = mdl[i]["TexCoord0"];
 			LLSD::Binary idx = mdl[i]["TriangleList"];
 
-			
-
 			//copy out indices
             S32 num_indices = idx.size() / 2;
             const S32 indices_to_discard = num_indices % 3;
@@ -2492,6 +2499,16 @@ bool LLVolume::unpackVolumeFacesInternal(const LLSD& mdl)
 			min_tc.setValue(mdl[i]["TexCoord0Domain"]["Min"]);
 			max_tc.setValue(mdl[i]["TexCoord0Domain"]["Max"]);
 
+            //unpack normalized scale/translation
+            if (mdl[i].has("NormalizedScale"))
+            {
+                face.mNormalizedScale.setValue(mdl[i]["NormalizedScale"]);
+            }
+            else
+            {
+                face.mNormalizedScale.set(1, 1, 1);
+            }
+            
 			LLVector4a pos_range;
 			pos_range.setSub(max_pos, min_pos);
 			LLVector2 tc_range2 = max_tc - min_tc;
@@ -2542,6 +2559,34 @@ bool LLVolume::unpackVolumeFacesInternal(const LLSD& mdl)
 				}
 			}
 
+#if 0 // keep this code for now in case we decide to add support for on-the-wire tangents
+            {
+                if (!tangent.empty())
+                {
+                    face.allocateTangents(face.mNumVertices);
+                    U16* t = (U16*)&(tangent[0]);
+
+                    // NOTE: tangents coming from the asset may not be mikkt space, but they should always be used by the GLTF shaders to 
+                    // maintain compliance with the GLTF spec
+                    LLVector4a* t_out = face.mTangents; 
+
+                    for (U32 j = 0; j < num_verts; ++j)
+                    {
+                        t_out->set((F32)t[0], (F32)t[1], (F32)t[2], (F32) t[3]);
+                        t_out->div(65535.f);
+                        t_out->mul(2.f);
+                        t_out->sub(1.f);
+
+                        F32* tp = t_out->getF32ptr();
+                        tp[3] = tp[3] < 0.f ? -1.f : 1.f;
+
+                        t_out++;
+                        t += 4;
+                    }
+                }
+            }
+#endif
+
 			{
 				if (!tc.empty())
 				{
@@ -2745,7 +2790,7 @@ bool LLVolume::unpackVolumeFacesInternal(const LLSD& mdl)
 		}
 	}
 
-	if (!cacheOptimize())
+	if (!cacheOptimize(true))
 	{
 		// Out of memory?
 		LL_WARNS() << "Failed to optimize!" << LL_ENDL;
@@ -2786,11 +2831,11 @@ void LLVolume::copyVolumeFaces(const LLVolume* volume)
 	mSculptLevel = 0;
 }
 
-bool LLVolume::cacheOptimize()
+bool LLVolume::cacheOptimize(bool gen_tangents)
 {
 	for (S32 i = 0; i < mVolumeFaces.size(); ++i)
 	{
-		if (!mVolumeFaces[i].cacheOptimize())
+		if (!mVolumeFaces[i].cacheOptimize(gen_tangents))
 		{
 			return false;
 		}
@@ -3306,12 +3351,12 @@ BOOL LLVolume::isFlat(S32 face)
 
 bool LLVolumeParams::isSculpt() const
 {
-	return mSculptID.notNull();
+    return (mSculptType & LL_SCULPT_TYPE_MASK) != LL_SCULPT_TYPE_NONE;
 }
 
 bool LLVolumeParams::isMeshSculpt() const
 {
-	return isSculpt() && ((mSculptType & LL_SCULPT_TYPE_MASK) == LL_SCULPT_TYPE_MESH);
+	return (mSculptType & LL_SCULPT_TYPE_MASK) == LL_SCULPT_TYPE_MESH;
 }
 
 bool LLVolumeParams::operator==(const LLVolumeParams &params) const
@@ -3726,6 +3771,7 @@ bool LLVolumeParams::validate(U8 prof_curve, F32 prof_begin, F32 prof_end, F32 h
 void LLVolume::getLoDTriangleCounts(const LLVolumeParams& params, S32* counts)
 { //attempt to approximate the number of triangles that will result from generating a volume LoD set for the 
 	//supplied LLVolumeParams -- inaccurate, but a close enough approximation for determining streaming cost
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VOLUME;
 	F32 detail[] = {1.f, 1.5f, 2.5f, 4.f};	
 	for (S32 i = 0; i < 4; i++)
 	{
@@ -4073,7 +4119,7 @@ S32 LLVolume::lineSegmentIntersect(const LLVector4a& start, const LLVector4a& en
 		{
 			if (tangent_out != NULL) // if the caller wants tangents, we may need to generate them
 			{
-				genTangents(i);
+                genTangents(i);
 			}
 
 			if (isUnique())
@@ -4861,6 +4907,7 @@ LLVolumeFace& LLVolumeFace::operator=(const LLVolumeFace& src)
     }
 
 	mOptimized = src.mOptimized;
+    mNormalizedScale = src.mNormalizedScale;
 
 	//delete 
 	return *this;
@@ -5383,256 +5430,218 @@ public:
 	}
 };
 
+// data structures for tangent generation
 
-bool LLVolumeFace::cacheOptimize()
-{ //optimize for vertex cache according to Forsyth method: 
-  // http://home.comcast.net/~tom_forsyth/papers/fast_vert_cache_opt.html
-	
-	llassert(!mOptimized);
-	mOptimized = TRUE;
+struct MikktData
+{
+    LLVolumeFace* face;
+    std::vector<LLVector3> p;
+    std::vector<LLVector3> n;
+    std::vector<LLVector2> tc;
+    std::vector<LLVector4> w;
+    std::vector<LLVector4> t;
 
-	LLVCacheLRU cache;
-	
-	if (mNumVertices < 3 || mNumIndices < 3)
-	{ //nothing to do
-		return true;
-	}
+    MikktData(LLVolumeFace* f)
+        : face(f)
+    {
+        U32 count = face->mNumIndices;
 
-	//mapping of vertices to triangles and indices
-	std::vector<LLVCacheVertexData> vertex_data;
+        p.resize(count);
+        n.resize(count);
+        tc.resize(count);
+        t.resize(count);
 
-	//mapping of triangles do vertices
-	std::vector<LLVCacheTriangleData> triangle_data;
+        if (face->mWeights)
+        {
+            w.resize(count);
+        }
 
-	try
-	{
-		triangle_data.resize(mNumIndices / 3);
-		vertex_data.resize(mNumVertices);
 
-        for (U32 i = 0; i < mNumIndices; i++)
-        { //populate vertex data and triangle data arrays
-            U16 idx = mIndices[i];
-            U32 tri_idx = i / 3;
+        LLVector3 inv_scale(1.f / face->mNormalizedScale.mV[0], 1.f / face->mNormalizedScale.mV[1], 1.f / face->mNormalizedScale.mV[2]);
+        
 
-            if (idx >= mNumVertices)
+        for (int i = 0; i < face->mNumIndices; ++i)
+        {
+            U32 idx = face->mIndices[i];
+            // validate (and if needed clamp) the index BEFORE it is used to read any vertex stream
+            if (idx >= face->mNumVertices)
             {
                 // invalid index
                 // replace with a valid index to avoid crashes
-                idx = mNumVertices - 1;
-                mIndices[i] = idx;
+                idx = face->mNumVertices - 1;
+                face->mIndices[i] = idx;
 
                 // Needs better logging
                 LL_DEBUGS_ONCE("LLVOLUME") << "Invalid index, substituting" << LL_ENDL;
             }
 
-            vertex_data[idx].mTriangles.push_back(&(triangle_data[tri_idx]));
-            vertex_data[idx].mIdx = idx;
-            triangle_data[tri_idx].mVertex[i % 3] = &(vertex_data[idx]);
+            p[i].set(face->mPositions[idx].getF32ptr());
+            p[i].scaleVec(face->mNormalizedScale); //put mesh in original coordinate frame when reconstructing tangents
+            n[i].set(face->mNormals[idx].getF32ptr());
+            n[i].scaleVec(inv_scale);
+            n[i].normalize();
+            tc[i].set(face->mTexCoords[idx]);
+
+            if (face->mWeights)
+            {
+                w[i].set(face->mWeights[idx].getF32ptr());
+            }
         }
     }
-    catch (std::bad_alloc&)
-    {
-        // resize or push_back failed
-        LL_WARNS("LLVOLUME") << "Resize for " << mNumVertices << " vertices failed" << LL_ENDL;
-        return false;
-    }
+};
 
-	/*F32 pre_acmr = 1.f;
-	//measure cache misses from before rebuild
-	{
-		LLVCacheFIFO test_cache;
-		for (U32 i = 0; i < mNumIndices; ++i)
-		{
-			test_cache.addVertex(&vertex_data[mIndices[i]]);
-		}
 
-		for (U32 i = 0; i < mNumVertices; i++)
-		{
-			vertex_data[i].mCacheTag = -1;
-		}
+bool LLVolumeFace::cacheOptimize(bool gen_tangents)
+{ //optimize index buffer for vertex cache using meshoptimizer; optionally generates MikkTSpace tangents first
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VOLUME;
+	llassert(!mOptimized);
+	mOptimized = TRUE;
 
-		pre_acmr = (F32) test_cache.mMisses/(mNumIndices/3);
-	}*/
+    if (gen_tangents && mNormals && mTexCoords)
+    { // generate mikkt space tangents before cache optimizing since the index buffer may change
+        // a bit of a hack to do this here, but this function gets called exactly once for the lifetime of a mesh
+        // and is executed on a background thread
+        SMikkTSpaceInterface ms;
 
-	for (U32 i = 0; i < mNumVertices; i++)
-	{ //initialize score values (no cache -- might try a fifo cache here)
-		LLVCacheVertexData& data = vertex_data[i];
+        ms.m_getNumFaces = [](const SMikkTSpaceContext* pContext)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            LLVolumeFace* face = data->face;
+            return face->mNumIndices / 3;
+        };
 
-		data.mScore = find_vertex_score(data);
-		data.mActiveTriangles = data.mTriangles.size();
+        ms.m_getNumVerticesOfFace = [](const SMikkTSpaceContext* pContext, const int iFace)
+        {
+            return 3;
+        };
 
-		for (U32 j = 0; j < data.mActiveTriangles; ++j)
-		{
-			data.mTriangles[j]->mScore += data.mScore;
-		}
-	}
+        ms.m_getPosition = [](const SMikkTSpaceContext* pContext, float fvPosOut[], const int iFace, const int iVert)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            F32* v = data->p[iFace * 3 + iVert].mV;
+            fvPosOut[0] = v[0];
+            fvPosOut[1] = v[1];
+            fvPosOut[2] = v[2];
+        };
+
+        ms.m_getNormal = [](const SMikkTSpaceContext* pContext, float fvNormOut[], const int iFace, const int iVert)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            F32* n = data->n[iFace * 3 + iVert].mV;
+            fvNormOut[0] = n[0];
+            fvNormOut[1] = n[1];
+            fvNormOut[2] = n[2];
+        };
+
+        ms.m_getTexCoord = [](const SMikkTSpaceContext* pContext, float fvTexcOut[], const int iFace, const int iVert)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            F32* tc = data->tc[iFace * 3 + iVert].mV;
+            fvTexcOut[0] = tc[0];
+            fvTexcOut[1] = tc[1];
+        };
 
-	//sort triangle data by score
-	std::sort(triangle_data.begin(), triangle_data.end());
+        ms.m_setTSpaceBasic = [](const SMikkTSpaceContext* pContext, const float fvTangent[], const float fSign, const int iFace, const int iVert)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            S32 i = iFace * 3 + iVert;
+            
+            data->t[i].set(fvTangent);
+            data->t[i].mV[3] = fSign;
+        };
 
-	std::vector<U16> new_indices;
+        ms.m_setTSpace = nullptr;
 
-	LLVCacheTriangleData* tri;
+        MikktData data(this);
 
-	//prime pump by adding first triangle to cache;
-	tri = &(triangle_data[0]);
-	cache.addTriangle(tri);
-	new_indices.push_back(tri->mVertex[0]->mIdx);
-	new_indices.push_back(tri->mVertex[1]->mIdx);
-	new_indices.push_back(tri->mVertex[2]->mIdx);
-	tri->complete();
+        SMikkTSpaceContext ctx = { &ms, &data };
 
-	U32 breaks = 0;
-	for (U32 i = 1; i < mNumIndices/3; ++i)
-	{
-		cache.updateScores();
-		tri = cache.mBestTriangle;
-		if (!tri)
-		{
-			breaks++;
-			for (U32 j = 0; j < triangle_data.size(); ++j)
-			{
-				if (triangle_data[j].mActive)
-				{
-					tri = &(triangle_data[j]);
-					break;
-				}
-			}
-		}	
-		
-		cache.addTriangle(tri);
-		new_indices.push_back(tri->mVertex[0]->mIdx);
-		new_indices.push_back(tri->mVertex[1]->mIdx);
-		new_indices.push_back(tri->mVertex[2]->mIdx);
-		tri->complete();
-	}
+        genTangSpaceDefault(&ctx);
 
-	for (U32 i = 0; i < mNumIndices; ++i)
-	{
-		mIndices[i] = new_indices[i];
-	}
+        //re-weld
+        meshopt_Stream mos[] =
+        {
+            { &data.p[0], sizeof(LLVector3), sizeof(LLVector3) },
+            { &data.n[0], sizeof(LLVector3), sizeof(LLVector3) },
+            { &data.t[0], sizeof(LLVector4), sizeof(LLVector4) },
+            { &data.tc[0], sizeof(LLVector2), sizeof(LLVector2) },
+            { data.w.empty() ? nullptr : &data.w[0], sizeof(LLVector4), sizeof(LLVector4) }
+        };
 
-	/*F32 post_acmr = 1.f;
-	//measure cache misses from after rebuild
-	{
-		LLVCacheFIFO test_cache;
-		for (U32 i = 0; i < mNumVertices; i++)
-		{
-			vertex_data[i].mCacheTag = -1;
-		}
+        std::vector<U32> remap;
+        remap.resize(data.p.size());
 
-		for (U32 i = 0; i < mNumIndices; ++i)
-		{
-			test_cache.addVertex(&vertex_data[mIndices[i]]);
-		}
-		
-		post_acmr = (F32) test_cache.mMisses/(mNumIndices/3);
-	}*/
+        U32 stream_count = data.w.empty() ? 4 : 5;
 
-	//optimize for pre-TnL cache
-	
-	//allocate space for new buffer
-	S32 num_verts = mNumVertices;
-	S32 size = ((num_verts*sizeof(LLVector2)) + 0xF) & ~0xF;
-	LLVector4a* pos = (LLVector4a*) ll_aligned_malloc<64>(sizeof(LLVector4a)*2*num_verts+size);
-	if (pos == NULL)
-	{
-		LL_WARNS("LLVOLUME") << "Allocation of positions vector[" << sizeof(LLVector4a) * 2 * num_verts + size  << "] failed. " << LL_ENDL;
-		return false;
-	}
-	LLVector4a* norm = pos + num_verts;
-	LLVector2* tc = (LLVector2*) (norm + num_verts);
+        U32 vert_count = meshopt_generateVertexRemapMulti(&remap[0], nullptr, data.p.size(), data.p.size(), mos, stream_count);
 
-	LLVector4a* wght = NULL;
-	if (mWeights)
-	{
-		wght = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		if (wght == NULL)
-		{
-			ll_aligned_free<64>(pos);
-			LL_WARNS("LLVOLUME") << "Allocation of weights[" << sizeof(LLVector4a) * num_verts << "] failed" << LL_ENDL;
-			return false;
-		}
-	}
+        if (vert_count < 65535)
+        {
+            std::vector<U32> indices;
+            indices.resize(mNumIndices);
 
-	LLVector4a* binorm = NULL;
-	if (mTangents)
-	{
-		binorm = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		if (binorm == NULL)
-		{
-			ll_aligned_free<64>(pos);
-			ll_aligned_free_16(wght);
-			LL_WARNS("LLVOLUME") << "Allocation of binormals[" << sizeof(LLVector4a)*num_verts << "] failed" << LL_ENDL;
-			return false;
-		}
-	}
+            //copy results back into volume
+            resizeVertices(vert_count);
 
-	//allocate mapping of old indices to new indices
-	std::vector<S32> new_idx;
+            if (!data.w.empty())
+            {
+                allocateWeights(vert_count);
+            }
 
-	try
-	{
-		new_idx.resize(mNumVertices, -1);
-	}
-	catch (std::bad_alloc&)
-	{
-		ll_aligned_free<64>(pos);
-		ll_aligned_free_16(wght);
-		ll_aligned_free_16(binorm);
-		LL_WARNS("LLVOLUME") << "Resize failed: " << mNumVertices << LL_ENDL;
-		return false;
-	}
+            allocateTangents(mNumVertices);
 
-	S32 cur_idx = 0;
-	for (U32 i = 0; i < mNumIndices; ++i)
-	{
-		U16 idx = mIndices[i];
-		if (new_idx[idx] == -1)
-		{ //this vertex hasn't been added yet
-			new_idx[idx] = cur_idx;
+            for (int i = 0; i < mNumIndices; ++i)
+            {
+                U32 src_idx = i;
+                U32 dst_idx = remap[i];
+                mIndices[i] = dst_idx;
 
-			//copy vertex data
-			pos[cur_idx] = mPositions[idx];
-			norm[cur_idx] = mNormals[idx];
-			tc[cur_idx] = mTexCoords[idx];
-			if (mWeights)
-			{
-				wght[cur_idx] = mWeights[idx];
-			}
-			if (mTangents)
-			{
-				binorm[cur_idx] = mTangents[idx];
-			}
+                mPositions[dst_idx].load3(data.p[src_idx].mV);
+                mNormals[dst_idx].load3(data.n[src_idx].mV);
+                mTexCoords[dst_idx] = data.tc[src_idx];
 
-			cur_idx++;
-		}
-	}
+                mTangents[dst_idx].loadua(data.t[src_idx].mV);
 
-	for (U32 i = 0; i < mNumIndices; ++i)
-	{
-		mIndices[i] = new_idx[mIndices[i]];
-	}
-	
-	ll_aligned_free<64>(mPositions);
-	// DO NOT free mNormals and mTexCoords as they are part of mPositions buffer
-	ll_aligned_free_16(mWeights);
-	ll_aligned_free_16(mTangents);
-#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
-    ll_aligned_free_16(mJointIndices);
-    ll_aligned_free_16(mJustWeights);
-    mJustWeights = NULL;
-    mJointIndices = NULL; // filled in later as necessary by skinning code for acceleration
-#endif
+                if (mWeights)
+                {
+                    mWeights[dst_idx].loadua(data.w[src_idx].mV);
+                }
+            }
 
-	mPositions = pos;
-	mNormals = norm;
-	mTexCoords = tc;
-	mWeights = wght;    
-	mTangents = binorm;
+            // put back in normalized coordinate frame (only this branch wrote de-normalized MikktData copies into the face)
+            LLVector4a inv_scale(1.f/mNormalizedScale.mV[0], 1.f / mNormalizedScale.mV[1], 1.f / mNormalizedScale.mV[2]);
+            LLVector4a scale;
+            scale.load3(mNormalizedScale.mV);
+            scale.getF32ptr()[3] = 1.f;
+
+            for (int i = 0; i < mNumVertices; ++i)
+            {
+                mPositions[i].mul(inv_scale);
+                mNormals[i].mul(scale);
+                mNormals[i].normalize3();
+                F32 w = mTangents[i].getF32ptr()[3];
+                mTangents[i].mul(scale);
+                mTangents[i].normalize3();
+                mTangents[i].getF32ptr()[3] = w;
+            }
+        }
+        else
+        {
+            // blew past the max vertex size limit, use legacy tangent generation which never adds verts
+            createTangents();
+        }
+    }
 
-	//std::string result = llformat("ACMR pre/post: %.3f/%.3f  --  %d triangles %d breaks", pre_acmr, post_acmr, mNumIndices/3, breaks);
-	//LL_INFOS() << result << LL_ENDL;
+    // cache optimize index buffer
+
+    // meshopt needs scratch space, do some pointer shuffling to avoid an extra index buffer copy
+    U16* src_indices = mIndices;
+    mIndices = nullptr;
+    resizeIndices(mNumIndices);
+
+    meshopt_optimizeVertexCache<U16>(mIndices, src_indices, mNumIndices, mNumVertices);
+    
+    ll_aligned_free_16(src_indices);
 
 	return true;
 }
@@ -6442,35 +6451,31 @@ void CalculateTangentArray(U32 vertexCount, const LLVector4a *vertex, const LLVe
 
 void LLVolumeFace::createTangents()
 {
-	LL_PROFILE_ZONE_SCOPED_CATEGORY_VOLUME
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VOLUME;
 
-	if (!mTangents)
-	{
-		allocateTangents(mNumVertices);
-
-		//generate tangents
-		//LLVector4a* pos = mPositions;
-		//LLVector2* tc = (LLVector2*) mTexCoords;
-		LLVector4a* binorm = (LLVector4a*) mTangents;
+    if (!mTangents)
+    {
+        allocateTangents(mNumVertices);
+        
+        //generate tangents
+        LLVector4a* ptr = (LLVector4a*)mTangents;
 
-		LLVector4a* end = mTangents+mNumVertices;
-		while (binorm < end)
-		{
-			(*binorm++).clear();
-		}
+        LLVector4a* end = mTangents + mNumVertices;
+        while (ptr < end)
+        {
+            (*ptr++).clear();
+        }
 
-		binorm = mTangents;
+        CalculateTangentArray(mNumVertices, mPositions, mNormals, mTexCoords, mNumIndices / 3, mIndices, mTangents);
 
-		CalculateTangentArray(mNumVertices, mPositions, mNormals, mTexCoords, mNumIndices/3, mIndices, mTangents);
+        //normalize normals
+        for (U32 i = 0; i < mNumVertices; i++)
+        {
+            //bump map/planar projection code requires normals to be normalized
+            mNormals[i].normalize3fast();
+        }
+    }
 
-		//normalize tangents
-		for (U32 i = 0; i < mNumVertices; i++) 
-		{
-			//binorm[i].normalize3fast();
-			//bump map/planar projection code requires normals to be normalized
-			mNormals[i].normalize3fast();
-		}
-	}
 }
 
 void LLVolumeFace::resizeVertices(S32 num_verts)
diff --git a/indra/llmath/llvolume.h b/indra/llmath/llvolume.h
index 1509241623..ad6a669531 100644
--- a/indra/llmath/llvolume.h
+++ b/indra/llmath/llvolume.h
@@ -908,7 +908,7 @@ public:
     void remap();
 
 	void optimize(F32 angle_cutoff = 2.f);
-	bool cacheOptimize();
+	bool cacheOptimize(bool gen_tangents = false);
 
 	void createOctree(F32 scaler = 0.25f, const LLVector4a& center = LLVector4a(0,0,0), const LLVector4a& size = LLVector4a(0.5f,0.5f,0.5f));
     void destroyOctree();
@@ -960,10 +960,6 @@ public:
     // indexes for mPositions/mNormals/mTexCoords
 	U16* mIndices;
 
-	// vertex buffer filled in by LLFace to cache this volume face geometry in vram 
-	// (declared as a LLPointer to LLRefCount to avoid dependency on LLVertexBuffer)
-	mutable LLPointer<LLRefCount> mVertexBuffer; 
-
 	std::vector<S32>	mEdge;
 
 	//list of skin weights for rigged volumes
@@ -985,6 +981,11 @@ public:
 	//whether or not face has been cache optimized
 	BOOL mOptimized;
 
+    // if this is a mesh asset, scale and translation that were applied
+    // when encoding the source mesh into a unit cube
+    // used for regenerating tangents
+    LLVector3 mNormalizedScale = LLVector3(1,1,1);
+
 private:
     LLOctreeNode<LLVolumeTriangle, LLVolumeTriangle*>* mOctree;
     LLVolumeTriangle* mOctreeTriangles;
@@ -1033,7 +1034,7 @@ public:
 	void setDirty() { mPathp->setDirty(); mProfilep->setDirty(); }
 
 	void regen();
-	void genTangents(S32 face);
+    void genTangents(S32 face);
 
 	BOOL isConvex() const;
 	BOOL isCap(S32 face);
@@ -1087,7 +1088,10 @@ public:
 	void copyVolumeFaces(const LLVolume* volume);
 	void copyFacesTo(std::vector<LLVolumeFace> &faces) const;
 	void copyFacesFrom(const std::vector<LLVolumeFace> &faces);
-	bool cacheOptimize();
+
+    // use meshoptimizer to optimize index buffer for vertex shader cache
+    //  gen_tangents - if true, generate MikkTSpace tangents if needed before optimizing index buffer
+	bool cacheOptimize(bool gen_tangents = false);
 
 private:
 	void sculptGenerateMapVertices(U16 sculpt_width, U16 sculpt_height, S8 sculpt_components, const U8* sculpt_data, U8 sculpt_type);
diff --git a/indra/llmath/v3color.h b/indra/llmath/v3color.h
index 43a632408c..d925f56e97 100644
--- a/indra/llmath/v3color.h
+++ b/indra/llmath/v3color.h
@@ -33,6 +33,7 @@ class LLVector4;
 #include "llerror.h"
 #include "llmath.h"
 #include "llsd.h"
+#include "v3math.h"  // needed for linearColor3v implementation below
 #include <string.h>
 
 //  LLColor3 = |r g b|
@@ -87,6 +88,16 @@ public:
 	const LLColor3&	set(F32 x, F32 y, F32 z);	// Sets LLColor3 to (x, y, z)
 	const LLColor3&	set(const LLColor3 &vec);	// Sets LLColor3 to vec
 	const LLColor3&	set(const F32 *vec);		// Sets LLColor3 to vec
+    
+    // set from a vector of unknown type and size
+    // may leave some data unmodified
+    template<typename T>
+    const LLColor3& set(const std::vector<T>& v);
+
+    // write to a vector of unknown type and size
+    // may leave some data unmodified
+    template<typename T>
+    void write(std::vector<T>& v) const;
 
 	F32		magVec() const;				// deprecated
 	F32		magVecSquared() const;		// deprecated
@@ -484,13 +495,45 @@ inline const LLColor3 srgbColor3(const LLColor3 &a) {
 	return srgbColor;
 }
 
-inline const LLColor3 linearColor3(const LLColor3 &a) {
+inline const LLColor3 linearColor3p(const F32* v) { // reads v[0], v[1], v[2]; the caller must supply at least three floats
     LLColor3 linearColor;
-    linearColor.mV[0] = sRGBtoLinear(a.mV[0]);
-    linearColor.mV[1] = sRGBtoLinear(a.mV[1]);
-    linearColor.mV[2] = sRGBtoLinear(a.mV[2]);
+    linearColor.mV[0] = sRGBtoLinear(v[0]); // each channel is passed through sRGBtoLinear independently
+    linearColor.mV[1] = sRGBtoLinear(v[1]);
+    linearColor.mV[2] = sRGBtoLinear(v[2]);
 
     return linearColor;
 }
 
+template<class T> // T is any type exposing an mV member that linearColor3p() accepts
+inline const LLColor3 linearColor3(const T& a) {
+    return linearColor3p(a.mV); // delegates to linearColor3p, which runs sRGBtoLinear on mV[0..2]
+}
+
+template<class T> // T is any type exposing an mV member that linearColor3p() accepts
+inline const LLVector3 linearColor3v(const T& a) {
+    return LLVector3(linearColor3p(a.mV).mV); // converts via linearColor3p, then repackages the three channels as an LLVector3
+}
+
+template<typename T>
+const LLColor3& LLColor3::set(const std::vector<T>& v)
+{
+    // copy at most 3 leading elements; channels beyond v.size() keep their current value
+    for (S32 i = 0, count = llmin((S32)v.size(), 3); i < count; ++i)
+    {
+        mV[i] = static_cast<F32>(v[i]); // explicit, so a wider T (e.g. F64) is not narrowed implicitly
+    }
+    return *this;
+}
+
+// Copies this color's channels into the front of v (any element type, any size).
+// Never resizes v: only min(v.size(), 3) elements are written, the rest stay unmodified.
+template<typename T>
+void LLColor3::write(std::vector<T>& v) const
+{
+    for (S32 channel = 0, count = llmin((S32)v.size(), 3); channel < count; ++channel)
+    {
+        v[channel] = mV[channel];
+    }
+}
+
 #endif
diff --git a/indra/llmath/v4color.h b/indra/llmath/v4color.h
index 175edf1471..daa61594fb 100644
--- a/indra/llmath/v4color.h
+++ b/indra/llmath/v4color.h
@@ -88,8 +88,18 @@ class LLColor4
 		const LLColor4&	set(const LLColor3 &vec);	// Sets LLColor4 to LLColor3 vec (no change in alpha)
 		const LLColor4&	set(const LLColor3 &vec, F32 a);	// Sets LLColor4 to LLColor3 vec, with alpha specified
 		const LLColor4&	set(const F32 *vec);			// Sets LLColor4 to vec
-		const LLColor4&	set(const LLColor4U& color4u); // Sets LLColor4 to color4u, rescaled.
+        const LLColor4&	set(const F64 *vec);			// Sets LLColor4 to (double)vec
+        const LLColor4&	set(const LLColor4U& color4u); // Sets LLColor4 to color4u, rescaled.
 
+        // set from a vector of unknown type and size
+        // may leave some data unmodified
+        template<typename T> 
+        const LLColor4& set(const std::vector<T>& v);
+
+        // write to a vector of unknown type and size
+        // may leave some data unmodified
+        template<typename T>
+        void write(std::vector<T>& v) const;
 
 		const LLColor4&    setAlpha(F32 a);
 
@@ -334,6 +344,15 @@ inline const LLColor4&	LLColor4::set(const F32 *vec)
 	return (*this);
 }
 
+inline const LLColor4&	LLColor4::set(const F64 *vec) // sets every component from a double-precision array
+{
+    mV[VX] = static_cast<F32>(vec[VX]); // each F64 input is explicitly converted down to F32
+    mV[VY] = static_cast<F32>(vec[VY]);
+    mV[VZ] = static_cast<F32>(vec[VZ]);
+    mV[VW] = static_cast<F32>(vec[VW]); // vec must be readable at indices VX, VY, VZ and VW
+    return (*this);
+}
+
 // deprecated
 inline const LLColor4&	LLColor4::setVec(F32 x, F32 y, F32 z)
 {
@@ -680,5 +699,25 @@ inline const LLColor4 linearColor4(const LLColor4 &a)
     return linearColor;
 }
 
+template<typename T>
+const LLColor4& LLColor4::set(const std::vector<T>& v)
+{
+    // copy at most 4 leading elements; channels beyond v.size() keep their current value
+    for (S32 i = 0, count = llmin((S32)v.size(), 4); i < count; ++i)
+    {
+        mV[i] = static_cast<F32>(v[i]); // explicit, like set(const F64*), so a wider T is not narrowed implicitly
+    }
+    return *this;
+}
+
+template<typename T>
+void LLColor4::write(std::vector<T>& v) const
+{   // copies min(v.size(), 4) channels into the front of v; v is never resized
+    for (S32 channel = 0, count = llmin((S32)v.size(), 4); channel < count; ++channel)
+    {
+        v[channel] = mV[channel];
+    }
+}
+
 #endif