10 files changed, 304 insertions, 555 deletions
diff --git a/indra/cmake/LLMath.cmake b/indra/cmake/LLMath.cmake
index 3cbb7ad561..513ff9f81d 100644
--- a/indra/cmake/LLMath.cmake
+++ b/indra/cmake/LLMath.cmake
@@ -2,6 +2,7 @@
 
 include(Variables)
 include(Mikktspace)
+include(MESHOPTIMIZER)
 
 set(LLMATH_INCLUDE_DIRS
     ${LIBS_OPEN_DIR}/llmath
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index 539db9d0e1..563a325f03 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -56,6 +56,8 @@
 #include "mikktspace/mikktspace.h"
 #include "mikktspace/mikktspace.c" // insert mikktspace implementation into llvolume object file
 
+#include "meshoptimizer/meshoptimizer.h"
+
 #define DEBUG_SILHOUETTE_BINORMALS 0
 #define DEBUG_SILHOUETTE_NORMALS 0 // TomY: Use this to display normals using the silhouette
 #define DEBUG_SILHOUETTE_EDGE_MAP 0 // DaveP: Use this to display edge map using the silhouette
@@ -2102,7 +2104,12 @@ void LLVolume::regen()
 
 void LLVolume::genTangents(S32 face, bool mikktspace)
 {
-	mVolumeFaces[face].createTangents(mikktspace);
+    // generate legacy tangents for the specified face
+    // if mikktspace is true, only generate tangents if mikktspace tangents are not present (handles the case for non-mesh prims)
+    if (!mikktspace || mVolumeFaces[face].mMikktSpaceTangents == nullptr)
+    {
+        mVolumeFaces[face].createTangents();
+    }
 }
 
 LLVolume::~LLVolume()
@@ -2424,11 +2431,10 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 
 			LLSD::Binary pos = mdl[i]["Position"];
 			LLSD::Binary norm = mdl[i]["Normal"];
+            LLSD::Binary tangent = mdl[i]["Tangent"];
 			LLSD::Binary tc = mdl[i]["TexCoord0"];
 			LLSD::Binary idx = mdl[i]["TriangleList"];
 
-			
-
 			//copy out indices
             S32 num_indices = idx.size() / 2;
             face.resizeIndices(num_indices);
@@ -2527,6 +2533,33 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 				}
 			}
 
+            {
+                if (!tangent.empty())
+                {
+                    face.allocateTangents(face.mNumVertices, true);
+                    U16* t = (U16*)&(tangent[0]);
+
+                    // store incoming tangents in mMikktSpaceTangents
+                    // NOTE: tangents coming from the asset may not be mikkt space, but they should always be used by the GLTF shaders to
+                    // maintain compliance with the GLTF spec
+                    LLVector4a* t_out = face.mMikktSpaceTangents; 
+
+                    for (U32 j = 0; j < num_verts; ++j)
+                    {
+                        t_out->set((F32)t[0], (F32)t[1], (F32)t[2], (F32) t[3]);
+                        t_out->div(65535.f);
+                        t_out->mul(2.f);
+                        t_out->sub(1.f);
+
+                        F32* tp = t_out->getF32ptr();
+                        tp[3] = tp[3] < 0.f ? -1.f : 1.f;
+
+                        t_out++;
+                        t += 4;
+                    }
+                }
+            }
+
 			{
 				if (!tc.empty())
 				{
@@ -5373,251 +5406,197 @@ public:
 	}
 };
 
+// data structures for tangent generation
 
-bool LLVolumeFace::cacheOptimize()
-{ //optimize for vertex cache according to Forsyth method: 
-  // http://home.comcast.net/~tom_forsyth/papers/fast_vert_cache_opt.html
-	
-	llassert(!mOptimized);
-	mOptimized = TRUE;
-
-	LLVCacheLRU cache;
-	
-	if (mNumVertices < 3 || mNumIndices < 3)
-	{ //nothing to do
-		return true;
-	}
+struct MikktData
+{
+    LLVolumeFace* face;
+    std::vector<LLVector3> p;
+    std::vector<LLVector3> n;
+    std::vector<LLVector2> tc;
+    std::vector<LLVector4> w;
+    std::vector<LLVector4> t;
+
+    MikktData(LLVolumeFace* f)
+        : face(f)
+    {
+        U32 count = face->mNumIndices;
 
-	//mapping of vertices to triangles and indices
-	std::vector<LLVCacheVertexData> vertex_data;
+        p.resize(count);
+        n.resize(count);
+        tc.resize(count);
+        t.resize(count);
 
-	//mapping of triangles do vertices
-	std::vector<LLVCacheTriangleData> triangle_data;
+        if (face->mWeights)
+        {
+            w.resize(count);
+        }
 
-	try
-	{
-		triangle_data.resize(mNumIndices / 3);
-		vertex_data.resize(mNumVertices);
+        for (int i = 0; i < face->mNumIndices; ++i)
+        {
+            U32 idx = face->mIndices[i];
 
-        for (U32 i = 0; i < mNumIndices; i++)
-        { //populate vertex data and triangle data arrays
-            U16 idx = mIndices[i];
-            U32 tri_idx = i / 3;
+            p[i].set(face->mPositions[idx].getF32ptr());
+            n[i].set(face->mNormals[idx].getF32ptr());
+            tc[i].set(face->mTexCoords[idx]);
 
-            vertex_data[idx].mTriangles.push_back(&(triangle_data[tri_idx]));
-            vertex_data[idx].mIdx = idx;
-            triangle_data[tri_idx].mVertex[i % 3] = &(vertex_data[idx]);
+            if (face->mWeights)
+            {
+                w[i].set(face->mWeights[idx].getF32ptr());
+            }
         }
     }
-    catch (std::bad_alloc&)
-    {
-        // resize or push_back failed
-        LL_WARNS("LLVOLUME") << "Resize for " << mNumVertices << " vertices failed" << LL_ENDL;
+};
+
+
+bool LLVolumeFace::cacheOptimize()
+{ // optimize the index buffer for the vertex cache (via meshoptimizer), generating mikkt space tangents first if they are missing
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VOLUME;
+	llassert(!mOptimized);
+	mOptimized = TRUE;
+
+    if (!mNormals || !mTexCoords)
+    { // can't perform this operation without normals and texture coordinates
         return false;
     }
 
-	/*F32 pre_acmr = 1.f;
-	//measure cache misses from before rebuild
-	{
-		LLVCacheFIFO test_cache;
-		for (U32 i = 0; i < mNumIndices; ++i)
-		{
-			test_cache.addVertex(&vertex_data[mIndices[i]]);
-		}
+    if (mMikktSpaceTangents == nullptr)
+    { // make sure to generate mikkt space tangents for cache optimizing since the index buffer may change
+        allocateTangents(mNumVertices, true);
 
-		for (U32 i = 0; i < mNumVertices; i++)
-		{
-			vertex_data[i].mCacheTag = -1;
-		}
+        SMikkTSpaceInterface ms;
 
-		pre_acmr = (F32) test_cache.mMisses/(mNumIndices/3);
-	}*/
+        ms.m_getNumFaces = [](const SMikkTSpaceContext* pContext)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            LLVolumeFace* face = data->face;
+            return face->mNumIndices / 3;
+        };
 
-	for (U32 i = 0; i < mNumVertices; i++)
-	{ //initialize score values (no cache -- might try a fifo cache here)
-		LLVCacheVertexData& data = vertex_data[i];
+        ms.m_getNumVerticesOfFace = [](const SMikkTSpaceContext* pContext, const int iFace)
+        {
+            return 3;
+        };
 
-		data.mScore = find_vertex_score(data);
-		data.mActiveTriangles = data.mTriangles.size();
+        ms.m_getPosition = [](const SMikkTSpaceContext* pContext, float fvPosOut[], const int iFace, const int iVert)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            LLVolumeFace* face = data->face;
+            S32 idx = face->mIndices[iFace * 3 + iVert];
+            auto& vert = face->mPositions[idx];
+            F32* v = vert.getF32ptr();
+            fvPosOut[0] = v[0];
+            fvPosOut[1] = v[1];
+            fvPosOut[2] = v[2];
+        };
+
+        ms.m_getNormal = [](const SMikkTSpaceContext* pContext, float fvNormOut[], const int iFace, const int iVert)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            LLVolumeFace* face = data->face;
+            S32 idx = face->mIndices[iFace * 3 + iVert];
+            auto& norm = face->mNormals[idx];
+            F32* n = norm.getF32ptr();
+            fvNormOut[0] = n[0];
+            fvNormOut[1] = n[1];
+            fvNormOut[2] = n[2];
+        };
+
+        ms.m_getTexCoord = [](const SMikkTSpaceContext* pContext, float fvTexcOut[], const int iFace, const int iVert)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            LLVolumeFace* face = data->face;
+            S32 idx = face->mIndices[iFace * 3 + iVert];
+            auto& tc = face->mTexCoords[idx];
+            fvTexcOut[0] = tc.mV[0];
+            fvTexcOut[1] = tc.mV[1];
+        };
+
+        ms.m_setTSpaceBasic = [](const SMikkTSpaceContext* pContext, const float fvTangent[], const float fSign, const int iFace, const int iVert)
+        {
+            MikktData* data = (MikktData*)pContext->m_pUserData;
+            LLVolumeFace* face = data->face;
+            S32 i = iFace * 3 + iVert;
+            S32 idx = face->mIndices[i];
 
-		for (U32 j = 0; j < data.mActiveTriangles; ++j)
-		{
-			data.mTriangles[j]->mScore += data.mScore;
-		}
-	}
+            LLVector3 p(face->mPositions[idx].getF32ptr());
+            LLVector3 n(face->mNormals[idx].getF32ptr());
+            LLVector3 t(fvTangent);
 
-	//sort triangle data by score
-	std::sort(triangle_data.begin(), triangle_data.end());
+            // assert that this tangent hasn't already been set
+            llassert(data->t[i].magVec() < 0.1f);
 
-	std::vector<U16> new_indices;
+            data->t[i].set(fvTangent);
+            data->t[i].mV[3] = fSign;
+        };
 
-	LLVCacheTriangleData* tri;
+        ms.m_setTSpace = nullptr;
 
-	//prime pump by adding first triangle to cache;
-	tri = &(triangle_data[0]);
-	cache.addTriangle(tri);
-	new_indices.push_back(tri->mVertex[0]->mIdx);
-	new_indices.push_back(tri->mVertex[1]->mIdx);
-	new_indices.push_back(tri->mVertex[2]->mIdx);
-	tri->complete();
+        MikktData data(this);
 
-	U32 breaks = 0;
-	for (U32 i = 1; i < mNumIndices/3; ++i)
-	{
-		cache.updateScores();
-		tri = cache.mBestTriangle;
-		if (!tri)
-		{
-			breaks++;
-			for (U32 j = 0; j < triangle_data.size(); ++j)
-			{
-				if (triangle_data[j].mActive)
-				{
-					tri = &(triangle_data[j]);
-					break;
-				}
-			}
-		}	
-		
-		cache.addTriangle(tri);
-		new_indices.push_back(tri->mVertex[0]->mIdx);
-		new_indices.push_back(tri->mVertex[1]->mIdx);
-		new_indices.push_back(tri->mVertex[2]->mIdx);
-		tri->complete();
-	}
+        SMikkTSpaceContext ctx = { &ms, &data };
 
-	for (U32 i = 0; i < mNumIndices; ++i)
-	{
-		mIndices[i] = new_indices[i];
-	}
+        genTangSpaceDefault(&ctx);
 
-	/*F32 post_acmr = 1.f;
-	//measure cache misses from after rebuild
-	{
-		LLVCacheFIFO test_cache;
-		for (U32 i = 0; i < mNumVertices; i++)
-		{
-			vertex_data[i].mCacheTag = -1;
-		}
+        //re-weld
+        meshopt_Stream mos[] =
+        {
+            { &data.p[0], sizeof(LLVector3), sizeof(LLVector3) },
+            { &data.n[0], sizeof(LLVector3), sizeof(LLVector3) },
+            { &data.t[0], sizeof(LLVector4), sizeof(LLVector4) },
+            { &data.tc[0], sizeof(LLVector2), sizeof(LLVector2) },
+            { data.w.empty() ? nullptr : &data.w[0], sizeof(LLVector4), sizeof(LLVector4) }
+        };
 
-		for (U32 i = 0; i < mNumIndices; ++i)
-		{
-			test_cache.addVertex(&vertex_data[mIndices[i]]);
-		}
-		
-		post_acmr = (F32) test_cache.mMisses/(mNumIndices/3);
-	}*/
+        std::vector<U32> remap;
+        remap.resize(data.p.size());
 
-	//optimize for pre-TnL cache
-	
-	//allocate space for new buffer
-	S32 num_verts = mNumVertices;
-	S32 size = ((num_verts*sizeof(LLVector2)) + 0xF) & ~0xF;
-	LLVector4a* pos = (LLVector4a*) ll_aligned_malloc<64>(sizeof(LLVector4a)*2*num_verts+size);
-	if (pos == NULL)
-	{
-		LL_WARNS("LLVOLUME") << "Allocation of positions vector[" << sizeof(LLVector4a) * 2 * num_verts + size  << "] failed. " << LL_ENDL;
-		return false;
-	}
-	LLVector4a* norm = pos + num_verts;
-	LLVector2* tc = (LLVector2*) (norm + num_verts);
+        U32 stream_count = data.w.empty() ? 4 : 5;
 
-	LLVector4a* wght = NULL;
-	if (mWeights)
-	{
-		wght = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		if (wght == NULL)
-		{
-			ll_aligned_free<64>(pos);
-			LL_WARNS("LLVOLUME") << "Allocation of weights[" << sizeof(LLVector4a) * num_verts << "] failed" << LL_ENDL;
-			return false;
-		}
-	}
+        U32 vert_count = meshopt_generateVertexRemapMulti(&remap[0], nullptr, data.p.size(), data.p.size(), mos, stream_count);
 
-    llassert(mTangents == nullptr); // cache optimize called too late, tangents already generated
-    llassert(mMikktSpaceTangents == nullptr);
+        std::vector<U32> indices;
+        indices.resize(mNumIndices);
 
-    // =====================================================================================
-    // DEPRECATED -- cacheOptimize should always be called before tangents are generated
-    // =====================================================================================
-	LLVector4a* binorm = NULL;
-	if (mTangents)
-	{
-		binorm = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		if (binorm == NULL)
-		{
-			ll_aligned_free<64>(pos);
-			ll_aligned_free_16(wght);
-			LL_WARNS("LLVOLUME") << "Allocation of binormals[" << sizeof(LLVector4a)*num_verts << "] failed" << LL_ENDL;
-			return false;
-		}
-	}
-    // =====================================================================================
+        //copy results back into volume
+        resizeVertices(vert_count);
 
-    //allocate mapping of old indices to new indices
-	std::vector<S32> new_idx;
-    try
-	{
-		new_idx.resize(mNumVertices, -1);
-	}
-	catch (std::bad_alloc&)
-	{
-		ll_aligned_free<64>(pos);
-		ll_aligned_free_16(wght);
-		ll_aligned_free_16(binorm);
-		LL_WARNS("LLVOLUME") << "Resize failed: " << mNumVertices << LL_ENDL;
-		return false;
-	}
+        if (!data.w.empty())
+        {
+            allocateWeights(vert_count);
+        }
 
-	S32 cur_idx = 0;
-	for (U32 i = 0; i < mNumIndices; ++i)
-	{
-		U16 idx = mIndices[i];
-		if (new_idx[idx] == -1)
-		{ //this vertex hasn't been added yet
-			new_idx[idx] = cur_idx;
+        allocateTangents(mNumVertices, true);
 
-			//copy vertex data
-			pos[cur_idx] = mPositions[idx];
-			norm[cur_idx] = mNormals[idx];
-			tc[cur_idx] = mTexCoords[idx];
-			if (mWeights)
-			{
-				wght[cur_idx] = mWeights[idx];
-			}
-			if (mTangents)
-			{
-				binorm[cur_idx] = mTangents[idx];
-			}
+        for (int i = 0; i < mNumIndices; ++i)
+        {
+            U32 src_idx = i;
+            U32 dst_idx = remap[i];
+            mIndices[i] = dst_idx;
 
-			cur_idx++;
-		}
-	}
+            mPositions[dst_idx].load3(data.p[src_idx].mV);
+            mNormals[dst_idx].load3(data.n[src_idx].mV);
+            mTexCoords[dst_idx] = data.tc[src_idx];
 
-	for (U32 i = 0; i < mNumIndices; ++i)
-	{
-		mIndices[i] = new_idx[mIndices[i]];
-	}
-	
-	ll_aligned_free<64>(mPositions);
-	// DO NOT free mNormals and mTexCoords as they are part of mPositions buffer
-	ll_aligned_free_16(mWeights);
-	ll_aligned_free_16(mTangents);
-#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
-    ll_aligned_free_16(mJointIndices);
-    ll_aligned_free_16(mJustWeights);
-    mJustWeights = NULL;
-    mJointIndices = NULL; // filled in later as necessary by skinning code for acceleration
-#endif
+            mMikktSpaceTangents[dst_idx].loadua(data.t[src_idx].mV);
 
-	mPositions = pos;
-	mNormals = norm;
-	mTexCoords = tc;
-	mWeights = wght;    
-	mTangents = binorm;
+            if (mWeights)
+            {
+                mWeights[dst_idx].loadua(data.w[src_idx].mV);
+            }
+        }
+    }
 
-	//std::string result = llformat("ACMR pre/post: %.3f/%.3f  --  %d triangles %d breaks", pre_acmr, post_acmr, mNumIndices/3, breaks);
-	//LL_INFOS() << result << LL_ENDL;
+    // cache optimize index buffer
+
+    // meshopt needs scratch space, do some pointer shuffling to avoid an extra index buffer copy
+    U16* src_indices = mIndices;
+    mIndices = nullptr;
+    resizeIndices(mNumIndices);
+
+    meshopt_optimizeVertexCache<U16>(mIndices, src_indices, mNumIndices, mNumVertices);
+    
+    ll_aligned_free_16(src_indices);
 
 	return true;
 }
@@ -6407,209 +6386,25 @@ BOOL LLVolumeFace::createCap(LLVolume* volume, BOOL partial_build)
 void CalculateTangentArray(U32 vertexCount, const LLVector4a *vertex, const LLVector4a *normal,
         const LLVector2 *texcoord, U32 triangleCount, const U16* index_array, LLVector4a *tangent);
 
-
-// data structures for tangent generation
-
-// key for summing tangents
-// We will blend tangents wherever a common position and normal is found
-struct MikktKey
-{
-    // Position
-    LLVector3 p;
-    // Normal
-    LLVector3 n;
-
-    bool operator==(const MikktKey& rhs) const { return p == rhs.p && n == rhs.n; }
-};
-
-// sum of tangents and list of signs and index array indices for a given position and normal combination
-// sign must be kept separate from summed tangent because a single position and normal may have a different
-// tangent facing where UV seams exist
-struct MikktTangent
-{
-    // tangent vector
-    LLVector3 t;
-    // signs
-    std::vector<F32> s;
-    // indices (in index array)
-    std::vector<S32> i;
-};
-
-// hash function for MikktTangent
-namespace boost
-{
-    template <>
-    struct hash<LLVector3>
-    {
-        std::size_t operator()(LLVector3 const& k) const
-        {
-            size_t seed = 0;
-            boost::hash_combine(seed, k.mV[0]);
-            boost::hash_combine(seed, k.mV[1]);
-            boost::hash_combine(seed, k.mV[2]);
-            return seed;
-        }
-    };
-
-    template <>
-    struct hash<MikktKey>
-    {
-        std::size_t operator()(MikktKey const& k) const
-        {
-            size_t seed = 0;
-            boost::hash_combine(seed, k.p);
-            boost::hash_combine(seed, k.n);
-            return seed;
-        }
-    };
-}
-
-// boost adapter
-namespace std
-{
-    template<>
-    struct hash<MikktKey>
-    {
-        std::size_t operator()(MikktKey const& k) const
-        {
-            return boost::hash<MikktKey>()(k);
-        }
-    };
-}
-
-struct MikktData
-{
-    LLVolumeFace* face;
-    std::unordered_map<MikktKey, MikktTangent > tangents;
-};
-
-
-void LLVolumeFace::createTangents(bool mikktspace)
+void LLVolumeFace::createTangents()
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_VOLUME;
 
-    auto& tangents = mikktspace ? mMikktSpaceTangents : mTangents;
-
-    if (!tangents)
+    
+    if (!mTangents)
     {
-        allocateTangents(mNumVertices, mikktspace);
+        allocateTangents(mNumVertices);
+        
+        //generate tangents
+        LLVector4a* ptr = (LLVector4a*)mTangents;
 
-        if (mikktspace)
+        LLVector4a* end = mTangents + mNumVertices;
+        while (ptr < end)
         {
-            LL_PROFILE_ZONE_NAMED_CATEGORY_VOLUME("mikktspace");
-            SMikkTSpaceInterface ms;
-
-            ms.m_getNumFaces = [](const SMikkTSpaceContext* pContext)
-            {
-                MikktData* data = (MikktData*)pContext->m_pUserData;
-                LLVolumeFace* face = data->face;
-                return face->mNumIndices / 3;
-            };
-
-            ms.m_getNumVerticesOfFace = [](const SMikkTSpaceContext* pContext, const int iFace)
-            {
-                return 3;
-            };
-
-            ms.m_getPosition = [](const SMikkTSpaceContext* pContext, float fvPosOut[], const int iFace, const int iVert)
-            {
-                MikktData* data = (MikktData*)pContext->m_pUserData;
-                LLVolumeFace* face = data->face;
-                S32 idx = face->mIndices[iFace * 3 + iVert];
-                auto& vert = face->mPositions[idx];
-                F32* v = vert.getF32ptr();
-                fvPosOut[0] = v[0];
-                fvPosOut[1] = v[1];
-                fvPosOut[2] = v[2];
-            };
-
-            ms.m_getNormal = [](const SMikkTSpaceContext* pContext, float fvNormOut[], const int iFace, const int iVert)
-            {
-                MikktData* data = (MikktData*)pContext->m_pUserData;
-                LLVolumeFace* face = data->face;
-                S32 idx = face->mIndices[iFace * 3 + iVert];
-                auto& norm = face->mNormals[idx];
-                F32* n = norm.getF32ptr();
-                fvNormOut[0] = n[0];
-                fvNormOut[1] = n[1];
-                fvNormOut[2] = n[2];
-            };
-
-            ms.m_getTexCoord = [](const SMikkTSpaceContext* pContext, float fvTexcOut[], const int iFace, const int iVert)
-            {
-                MikktData* data = (MikktData*)pContext->m_pUserData;
-                LLVolumeFace* face = data->face;
-                S32 idx = face->mIndices[iFace * 3 + iVert];
-                auto& tc = face->mTexCoords[idx];
-                fvTexcOut[0] = tc.mV[0];
-                fvTexcOut[1] = tc.mV[1];
-            };
-
-            ms.m_setTSpaceBasic = [](const SMikkTSpaceContext* pContext, const float fvTangent[], const float fSign, const int iFace, const int iVert)
-            {
-                MikktData* data = (MikktData*)pContext->m_pUserData;
-                LLVolumeFace* face = data->face;
-                S32 i = iFace * 3 + iVert;
-                S32 idx = face->mIndices[i];
-                
-                LLVector3 p(face->mPositions[idx].getF32ptr());
-                LLVector3 n(face->mNormals[idx].getF32ptr());
-                LLVector3 t(fvTangent);
-
-                MikktKey key = { p, n };
-
-                MikktTangent& mt = data->tangents[key];
-                mt.t += t;
-                mt.s.push_back(fSign);
-                mt.i.push_back(i);
-            };
-
-            ms.m_setTSpace = nullptr;
-
-            MikktData data;
-            data.face = this;
-
-            SMikkTSpaceContext ctx = { &ms, &data };
-
-            genTangSpaceDefault(&ctx);
-
-            for (U32 i = 0; i < mNumVertices; ++i)
-            {
-                MikktKey key = { LLVector3(mPositions[i].getF32ptr()), LLVector3(mNormals[i].getF32ptr()) };
-                MikktTangent& t = data.tangents[key];
-
-                //set tangent
-                mMikktSpaceTangents[i].load3(t.t.mV);
-                mMikktSpaceTangents[i].normalize3fast();
-
-                //set sign
-                F32 sign = 0.f;
-                for (int j = 0; j < t.i.size(); ++j)
-                {
-                    if (mIndices[t.i[j]] == i)
-                    {
-                        sign = t.s[j];
-                        break;
-                    }
-                }
-
-                llassert(sign != 0.f);
-                mMikktSpaceTangents[i].getF32ptr()[3] = sign;
-            }
+            (*ptr++).clear();
         }
-        else
-        {
-            //generate tangents
-            LLVector4a* ptr = (LLVector4a*)tangents;
-
-            LLVector4a* end = mTangents + mNumVertices;
-            while (ptr < end)
-            {
-                (*ptr++).clear();
-            }
 
-            CalculateTangentArray(mNumVertices, mPositions, mNormals, mTexCoords, mNumIndices / 3, mIndices, tangents);
-        }
+        CalculateTangentArray(mNumVertices, mPositions, mNormals, mTexCoords, mNumIndices / 3, mIndices, mTangents);
 
         //normalize normals
         for (U32 i = 0; i < mNumVertices; i++)
@@ -6618,6 +6413,7 @@ void LLVolumeFace::createTangents(bool mikktspace)
             mNormals[i].normalize3fast();
         }
     }
+
 }
 
 void LLVolumeFace::resizeVertices(S32 num_verts)
diff --git a/indra/llmath/llvolume.h b/indra/llmath/llvolume.h
index 8c604c5d1a..f1feaade58 100644
--- a/indra/llmath/llvolume.h
+++ b/indra/llmath/llvolume.h
@@ -870,7 +870,7 @@ private:
 public:
 
 	BOOL create(LLVolume* volume, BOOL partial_build = FALSE);
-	void createTangents(bool mikktspace = false);
+	void createTangents();
 	
 	void resizeVertices(S32 num_verts);
 	void allocateTangents(S32 num_verts, bool mikktspace = false);
diff --git a/indra/llprimitive/lldaeloader.cpp b/indra/llprimitive/lldaeloader.cpp
index 50f4a4306e..9470146ce4 100644
--- a/indra/llprimitive/lldaeloader.cpp
+++ b/indra/llprimitive/lldaeloader.cpp
@@ -2551,6 +2551,9 @@ bool LLDAELoader::loadModelsFromDomMesh(domMesh* mesh, std::vector<LLModel*>& mo
 	LLVolume::face_list_t remainder;
 	do 
 	{
+        // generate tangents and cache optimize before normalizing
+        ret->preprocessVolumeFaces();
+
 		// Insure we do this once with the whole gang and not per-model
 		//
 		if (!normalized && !mNoNormalize)
@@ -2561,10 +2564,11 @@ bool LLDAELoader::loadModelsFromDomMesh(domMesh* mesh, std::vector<LLModel*>& mo
 
 		ret->trimVolumeFacesToSize(LL_SCULPT_MESH_MAX_FACES, &remainder);
 
-		if (!mNoOptimize)
-		{
-			ret->remapVolumeFaces();
-		}
+        // removal of unused/redundant vertices after normalizing is currently disabled (see the commented-out remapVolumeFaces call below)
+		//if (!mNoOptimize)
+		//{
+		//	ret->remapVolumeFaces();
+		//}
 
 		volume_faces = remainder.size();
 
diff --git a/indra/llprimitive/llmodel.cpp b/indra/llprimitive/llmodel.cpp
index 285c5f656b..1ce287d773 100644
--- a/indra/llprimitive/llmodel.cpp
+++ b/indra/llprimitive/llmodel.cpp
@@ -187,6 +187,15 @@ void LLModel::trimVolumeFacesToSize(U32 new_count, LLVolume::face_list_t* remain
 	}
 }
 
+// generate mikkt space tangents and cache optimize
+void LLModel::preprocessVolumeFaces()
+{
+    for (auto& face : mVolumeFaces)
+    {
+        face.cacheOptimize();
+    }
+}
+
 // Shrink the model to fit
 // on a 1x1x1 cube centered at the origin.
 // The positions and extents
@@ -296,6 +305,7 @@ void LLModel::normalizeVolumeFaces()
 			// the positions to fit within the unit cube.
 			LLVector4a* pos = (LLVector4a*) face.mPositions;
 			LLVector4a* norm = (LLVector4a*) face.mNormals;
+            LLVector4a* t = (LLVector4a*)face.mMikktSpaceTangents;
 
 			for (U32 j = 0; j < face.mNumVertices; ++j)
 			{
@@ -306,6 +316,14 @@ void LLModel::normalizeVolumeFaces()
 					norm[j].mul(inv_scale);
 					norm[j].normalize3();
 				}
+
+                if (t)
+                {
+                    F32 w = t[j].getF32ptr()[3];
+                    t[j].mul(inv_scale);
+                    t[j].normalize3();
+                    t[j].getF32ptr()[3] = w;
+                }
 			}
 		}
 
@@ -726,10 +744,12 @@ LLSD LLModel::writeModel(
 				LLSD::Binary verts(face.mNumVertices*3*2);
 				LLSD::Binary tc(face.mNumVertices*2*2);
 				LLSD::Binary normals(face.mNumVertices*3*2);
+                LLSD::Binary tangents(face.mNumVertices * 4 * 2);
 				LLSD::Binary indices(face.mNumIndices*2);
 
 				U32 vert_idx = 0;
 				U32 norm_idx = 0;
+                U32 tan_idx = 0;
 				U32 tc_idx = 0;
 			
 				LLVector2* ftc = (LLVector2*) face.mTexCoords;
@@ -782,6 +802,22 @@ LLSD LLModel::writeModel(
 							normals[norm_idx++] = buff[1];
 						}
 					}
+
+                    if (face.mMikktSpaceTangents)
+                    { //tangents
+                        F32* tangent = face.mMikktSpaceTangents[j].getF32ptr();
+
+                        for (U32 k = 0; k < 4; ++k)
+                        { //for each component
+                            //convert to 16-bit normalized
+                            U16 val = (U16)((tangent[k] + 1.f) * 0.5f * 65535);
+                            U8* buff = (U8*)&val;
+
+                            //write to binary buffer
+                            tangents[tan_idx++] = buff[0];
+                            tangents[tan_idx++] = buff[1];
+                        }
+                    }
 					
 					//texcoord
 					if (face.mTexCoords)
@@ -819,6 +855,11 @@ LLSD LLModel::writeModel(
 					mdl[model_names[idx]][i]["Normal"] = normals;
 				}
 
+                if (face.mMikktSpaceTangents)
+                {
+                    mdl[model_names[idx]][i]["Tangent"] = tangents;
+                }
+
 				if (face.mTexCoords)
 				{
 					mdl[model_names[idx]][i]["TexCoord0Domain"]["Min"] = min_tc.getValue();
diff --git a/indra/llprimitive/llmodel.h b/indra/llprimitive/llmodel.h
index 354ceb26b7..ea97851ce8 100644
--- a/indra/llprimitive/llmodel.h
+++ b/indra/llprimitive/llmodel.h
@@ -182,6 +182,7 @@ public:
 	void addFace(const LLVolumeFace& face);
 
 	void sortVolumeFacesByMaterialName();
+    void preprocessVolumeFaces();
 	void normalizeVolumeFaces();
 	void trimVolumeFacesToSize(U32 new_count = LL_SCULPT_MESH_MAX_FACES, LLVolume::face_list_t* remainder = NULL);
     void remapVolumeFaces();
diff --git a/indra/newview/app_settings/shaders/class1/deferred/pbropaqueF.glsl b/indra/newview/app_settings/shaders/class1/deferred/pbropaqueF.glsl
index 69019667de..f0f5208f52 100644
--- a/indra/newview/app_settings/shaders/class1/deferred/pbropaqueF.glsl
+++ b/indra/newview/app_settings/shaders/class1/deferred/pbropaqueF.glsl
@@ -25,61 +25,34 @@
 
 /*[EXTRA_CODE_HERE]*/
 
-#define DEBUG_PBR_LIGHT_TYPE 0 // Output Diffuse=0.75, Emissive=0, ORM=0,0,0
-
-#define DEBUG_BASIC         0
-#define DEBUG_VERTEX        0
-#define DEBUG_NORMAL_MAP    0 // Output packed normal map "as is" to diffuse
-#define DEBUG_NORMAL_OUT    0 // Output unpacked normal to diffuse
-#define DEBUG_ORM           0 // Output Occlusion Roughness Metal "as is" to diffuse
-#define DEBUG_POSITION      0
-
 uniform sampler2D diffuseMap;  //always in sRGB space
 
 uniform float metallicFactor;
 uniform float roughnessFactor;
 uniform vec3 emissiveColor;
+uniform sampler2D bumpMap;
+uniform sampler2D emissiveMap;
+uniform sampler2D specularMap; // Packed: Occlusion, Metal, Roughness
 
-#ifdef HAS_NORMAL_MAP
-    uniform sampler2D bumpMap;
-    VARYING vec3 vary_tangent;
-    flat in float vary_sign;
-#endif
-
-#ifdef HAS_EMISSIVE_MAP
-    uniform sampler2D emissiveMap;
-#endif
-
-#ifdef HAS_SPECULAR_MAP
-    uniform sampler2D specularMap; // Packed: Occlusion, Metal, Roughness
-#endif
-
-uniform samplerCube environmentMap;
-uniform mat3        env_mat;
-
-#ifdef DEFINE_GL_FRAGCOLOR
 out vec4 frag_data[4];
-#else
-#define frag_data gl_FragData
-#endif
 
 VARYING vec3 vary_position;
 VARYING vec4 vertex_color;
-VARYING vec2 vary_texcoord0;
-#ifdef HAS_NORMAL_MAP
 VARYING vec3 vary_normal;
-VARYING vec2 vary_texcoord1;
-#endif
+VARYING vec3 vary_tangent;
+flat in float vary_sign;
 
-#ifdef HAS_SPECULAR_MAP
-    VARYING vec2 vary_texcoord2;
-#endif
+VARYING vec2 vary_texcoord0;
+VARYING vec2 vary_texcoord1;
+VARYING vec2 vary_texcoord2;
 
 uniform float minimum_alpha; // PBR alphaMode: MASK, See: mAlphaCutoff, setAlphaCutoff()
 
 vec2 encode_normal(vec3 n);
 vec3 linear_to_srgb(vec3 c);
 
+uniform mat3 normal_matrix;
+
 void main()
 {
 // IF .mFeatures.mIndexedTextureChannels = LLGLSLShader::sIndexedTextureChannels;
@@ -94,11 +67,11 @@ void main()
     vec3 col = vertex_color.rgb * albedo.rgb;
 
     // from mikktspace.com
-    vec4 vNt = texture2D(bumpMap, vary_texcoord1.xy)*2.0-1.0;
+    vec3 vNt = texture2D(bumpMap, vary_texcoord1.xy).xyz*2.0-1.0;
     float sign = vary_sign;
     vec3 vN = vary_normal;
     vec3 vT = vary_tangent.xyz;
-
+    
     vec3 vB = sign * cross(vN, vT);
     vec3 tnorm = normalize( vNt.x * vT + vNt.y * vB + vNt.z * vN );
 
@@ -107,49 +80,22 @@ void main()
     //   occlusion 1.0
     //   roughness 0.0
     //   metal     0.0
-#ifdef HAS_SPECULAR_MAP
     vec3 spec = texture2D(specularMap, vary_texcoord2.xy).rgb;
-#else
-    vec3 spec = vec3(1,0,0);
-#endif
     
     spec.g *= roughnessFactor;
     spec.b *= metallicFactor;
 
     vec3 emissive = emissiveColor;
-#ifdef HAS_EMISSIVE_MAP
     emissive *= texture2D(emissiveMap, vary_texcoord0.xy).rgb;
-#endif
-
-#if DEBUG_PBR_LIGHT_TYPE
-    col.rgb  = vec3(0.75);
-    emissive = vec3(0);
-    spec.rgb = vec3(0);
-#endif
-#if DEBUG_BASIC
-    col.rgb = vec3( 1, 0, 1 );
-#endif
-#if DEBUG_VERTEX
-    col.rgb = vertex_color.rgb;
-#endif
-#if DEBUG_NORMAL_MAP
-    col.rgb = texture2D(bumpMap, vary_texcoord1.xy).rgb;
-#endif
-#if DEBUG_NORMAL_OUT
-    col.rgb = vary_normal;
-#endif
-#if DEBUG_ORM
-    col.rgb = linear_to_srgb(spec);
-#endif
-#if DEBUG_POSITION
-    col.rgb = vary_position.xyz;
-#endif
 
     tnorm *= gl_FrontFacing ? 1.0 : -1.0;
 
+    //spec.rgb = vec3(1,1,0);
     //col = vec3(0,0,0);
     //emissive = vary_tangent.xyz*0.5+0.5;
-    //emissive = vec3(vary_sign*0.5+0.5);
+    //emissive = vec3(sign*0.5+0.5);
+    //emissive = vNt * 0.5 + 0.5;
+    //emissive = tnorm*0.5+0.5;
     // See: C++: addDeferredAttachments(), GLSL: softenLightF
     frag_data[0] = vec4(col, 0.0);                                                   // Diffuse
     frag_data[1] = vec4(emissive, vertex_color.a);                                   // PBR sRGB Emissive
diff --git a/indra/newview/app_settings/shaders/class1/deferred/pbropaqueV.glsl b/indra/newview/app_settings/shaders/class1/deferred/pbropaqueV.glsl
index e17d91af38..5573c02a60 100644
--- a/indra/newview/app_settings/shaders/class1/deferred/pbropaqueV.glsl
+++ b/indra/newview/app_settings/shaders/class1/deferred/pbropaqueV.glsl
@@ -37,41 +37,24 @@ uniform mat3 normal_matrix;
 uniform mat4 modelview_projection_matrix;
 #endif
 
-#if (DIFFUSE_ALPHA_MODE == DIFFUSE_ALPHA_MODE_BLEND)
-
-#if !defined(HAS_SKIN)
-uniform mat4 modelview_matrix;
-#endif
-
-VARYING vec3 vary_position;
-
-#endif
-
 uniform mat4 texture_matrix0;
 
 ATTRIBUTE vec3 position;
 ATTRIBUTE vec4 diffuse_color;
 ATTRIBUTE vec3 normal;
-ATTRIBUTE vec2 texcoord0;
-
-
-#ifdef HAS_NORMAL_MAP
 ATTRIBUTE vec4 tangent;
+ATTRIBUTE vec2 texcoord0;
 ATTRIBUTE vec2 texcoord1;
+ATTRIBUTE vec2 texcoord2;
 
+VARYING vec2 vary_texcoord0;
 VARYING vec2 vary_texcoord1;
-#endif
-
-#ifdef HAS_SPECULAR_MAP
-ATTRIBUTE vec2 texcoord2;
 VARYING vec2 vary_texcoord2;
-#endif
  
 VARYING vec4 vertex_color;
-VARYING vec2 vary_texcoord0;
+
 VARYING vec3 vary_tangent;
 flat out float vary_sign;
-
 VARYING vec3 vary_normal;
 
 void main()
@@ -83,64 +66,27 @@ void main()
 
 	vec3 pos = (mat*vec4(position.xyz,1.0)).xyz;
 
-#if (DIFFUSE_ALPHA_MODE == DIFFUSE_ALPHA_MODE_BLEND)
-	vary_position = pos;
-#endif
-
 	gl_Position = projection_matrix*vec4(pos,1.0);
 
 #else
 	//transform vertex
 	gl_Position = modelview_projection_matrix * vec4(position.xyz, 1.0); 
-
 #endif
 	
 	vary_texcoord0 = (texture_matrix0 * vec4(texcoord0,0,1)).xy;
-	
-#ifdef HAS_NORMAL_MAP
 	vary_texcoord1 = (texture_matrix0 * vec4(texcoord1,0,1)).xy;
-#endif
-
-#ifdef HAS_SPECULAR_MAP
 	vary_texcoord2 = (texture_matrix0 * vec4(texcoord2,0,1)).xy;
-#endif
-
 #ifdef HAS_SKIN
-	vec3 n = normalize((mat*vec4(normal.xyz+position.xyz,1.0)).xyz-pos.xyz);
-#ifdef HAS_NORMAL_MAP
-	vec3 t = normalize((mat*vec4(tangent.xyz+position.xyz,1.0)).xyz-pos.xyz);
-	vec3 b = cross(n, t)*tangent.w;
-	
-	//vary_mat0 = vec3(t.x, b.x, n.x);
-	//vary_mat1 = vec3(t.y, b.y, n.y);
-	//vary_mat2 = vec3(t.z, b.z, n.z);
-#else //HAS_NORMAL_MAP
-vary_normal  = n;
-#endif //HAS_NORMAL_MAP
+	vec3 n = (mat*vec4(normal.xyz+position.xyz,1.0)).xyz-pos.xyz;
+	vec3 t = (mat*vec4(tangent.xyz+position.xyz,1.0)).xyz-pos.xyz;
 #else //HAS_SKIN
-	vec3 n = normalize(normal_matrix * normal);
-#ifdef HAS_NORMAL_MAP
-	vec3 t = normalize(normal_matrix * tangent.xyz);
-    vary_tangent = t;
-    vary_sign = tangent.w;
-    vary_normal = n;
+	vec3 n = normal_matrix * normal;
+	vec3 t = normal_matrix * tangent.xyz;
+#endif
 
-	//vec3 b = cross(n,t)*tangent.w;
-	//vec3 t = cross(b,n) * binormal.w;
-	
-	//vary_mat0 = vec3(t.x, b.x, n.x);
-	//vary_mat1 = vec3(t.y, b.y, n.y);
-	//vary_mat2 = vec3(t.z, b.z, n.z);
-#else //HAS_NORMAL_MAP
-	vary_normal = n;
-#endif //HAS_NORMAL_MAP
-#endif //HAS_SKIN
+    vary_tangent = normalize(t);
+    vary_sign = tangent.w;
+    vary_normal = normalize(n);
 	
 	vertex_color = diffuse_color;
-
-#if (DIFFUSE_ALPHA_MODE == DIFFUSE_ALPHA_MODE_BLEND)
-#if !defined(HAS_SKIN)
-	vary_position = (modelview_matrix*vec4(position.xyz, 1.0)).xyz;
-#endif
-#endif
 }
diff --git a/indra/newview/llface.cpp b/indra/newview/llface.cpp
index 33d6a205e7..3f69815e38 100644
--- a/indra/newview/llface.cpp
+++ b/indra/newview/llface.cpp
@@ -2177,6 +2177,11 @@ BOOL LLFace::getGeometryVolume(const LLVolume& volume,
 			mask.setElement<3>();
 
             LLVector4a* tbuff = mikktspace ? vf.mMikktSpaceTangents : vf.mTangents;
+            if (tbuff == nullptr)
+            { // non-mesh prims will not have mikktspace tangents
+                tbuff = vf.mTangents;
+            }
+
 			LLVector4a* src = tbuff;
 			LLVector4a* end = tbuff+num_vertices;
 
@@ -2184,7 +2189,6 @@ BOOL LLFace::getGeometryVolume(const LLVolume& volume,
 			{
 				LLVector4a tangent_out;
 				mat_normal.rotate(*src, tangent_out);
-				tangent_out.normalize3fast();
 				tangent_out.setSelectWithMask(mask, *src, tangent_out);
 				tangent_out.store4a(tangents);
 				
diff --git a/indra/newview/llmodelpreview.cpp b/indra/newview/llmodelpreview.cpp
index c3fbada9db..2c0f0ae443 100644
--- a/indra/newview/llmodelpreview.cpp
+++ b/indra/newview/llmodelpreview.cpp
@@ -1308,9 +1308,10 @@ F32 LLModelPreview::genMeshOptimizerPerModel(LLModel *base_model, LLModel *targe
 
     // extra space for normals and text coords
     S32 tc_bytes_size = ((size_vertices * sizeof(LLVector2)) + 0xF) & ~0xF;
-    LLVector4a* combined_positions = (LLVector4a*)ll_aligned_malloc<64>(sizeof(LLVector4a) * 2 * size_vertices + tc_bytes_size);
+    LLVector4a* combined_positions = (LLVector4a*)ll_aligned_malloc<64>(sizeof(LLVector4a) * 3 * size_vertices + tc_bytes_size);
     LLVector4a* combined_normals = combined_positions + size_vertices;
-    LLVector2* combined_tex_coords = (LLVector2*)(combined_normals + size_vertices);
+    LLVector4a* combined_tangents = combined_normals + size_vertices;
+    LLVector2* combined_tex_coords = (LLVector2*)(combined_tangents + size_vertices);
 
     // copy indices and vertices into new buffers
     S32 combined_positions_shift = 0;
@@ -1320,6 +1321,9 @@ F32 LLModelPreview::genMeshOptimizerPerModel(LLModel *base_model, LLModel *targe
     {
         const LLVolumeFace &face = base_model->getVolumeFace(face_idx);
 
+        // ensure tangents have been generated or loaded
+        llassert(face.mMikktSpaceTangents);
+
         // Vertices
         S32 copy_bytes = face.mNumVertices * sizeof(LLVector4a);
         LLVector4a::memcpyNonAliased16((F32*)(combined_positions + combined_positions_shift), (F32*)face.mPositions, copy_bytes);
@@ -1327,6 +1331,9 @@ F32 LLModelPreview::genMeshOptimizerPerModel(LLModel *base_model, LLModel *targe
         // Normals
         LLVector4a::memcpyNonAliased16((F32*)(combined_normals + combined_positions_shift), (F32*)face.mNormals, copy_bytes);
 
+        // Tangents
+        LLVector4a::memcpyNonAliased16((F32*)(combined_tangents + combined_positions_shift), (F32*)face.mMikktSpaceTangents, copy_bytes);
+
         // Tex coords
         copy_bytes = face.mNumVertices * sizeof(LLVector2);
         memcpy((void*)(combined_tex_coords + combined_positions_shift), (void*)face.mTexCoords, copy_bytes);
@@ -1428,9 +1435,10 @@ F32 LLModelPreview::genMeshOptimizerPerModel(LLModel *base_model, LLModel *targe
 
     // IV. Repack back into individual faces
 
-    LLVector4a* buffer_positions = (LLVector4a*)ll_aligned_malloc<64>(sizeof(LLVector4a) * 2 * size_vertices + tc_bytes_size);
+    LLVector4a* buffer_positions = (LLVector4a*)ll_aligned_malloc<64>(sizeof(LLVector4a) * 3 * size_vertices + tc_bytes_size);
     LLVector4a* buffer_normals = buffer_positions + size_vertices;
-    LLVector2* buffer_tex_coords = (LLVector2*)(buffer_normals + size_vertices);
+    LLVector4a* buffer_tangents = buffer_normals + size_vertices;
+    LLVector2* buffer_tex_coords = (LLVector2*)(buffer_tangents + size_vertices);
     S32 buffer_idx_size = (size_indices * sizeof(U16) + 0xF) & ~0xF;
     U16* buffer_indices = (U16*)ll_aligned_malloc_16(buffer_idx_size);
     S32* old_to_new_positions_map = new S32[size_vertices];
@@ -1511,6 +1519,7 @@ F32 LLModelPreview::genMeshOptimizerPerModel(LLModel *base_model, LLModel *targe
                     // Copy vertice, normals, tcs
                     buffer_positions[buf_positions_copied] = combined_positions[idx];
                     buffer_normals[buf_positions_copied] = combined_normals[idx];
+                    buffer_tangents[buf_positions_copied] = combined_tangents[idx];
                     buffer_tex_coords[buf_positions_copied] = combined_tex_coords[idx];
 
                     old_to_new_positions_map[idx] = buf_positions_copied;
@@ -1549,12 +1558,13 @@ F32 LLModelPreview::genMeshOptimizerPerModel(LLModel *base_model, LLModel *targe
         {
             new_face.resizeIndices(buf_indices_copied);
             new_face.resizeVertices(buf_positions_copied);
-
+            new_face.allocateTangents(buf_positions_copied, true);
             S32 idx_size = (buf_indices_copied * sizeof(U16) + 0xF) & ~0xF;
             LLVector4a::memcpyNonAliased16((F32*)new_face.mIndices, (F32*)buffer_indices, idx_size);
 
             LLVector4a::memcpyNonAliased16((F32*)new_face.mPositions, (F32*)buffer_positions, buf_positions_copied * sizeof(LLVector4a));
             LLVector4a::memcpyNonAliased16((F32*)new_face.mNormals, (F32*)buffer_normals, buf_positions_copied * sizeof(LLVector4a));
+            LLVector4a::memcpyNonAliased16((F32*)new_face.mMikktSpaceTangents, (F32*)buffer_tangents, buf_positions_copied * sizeof(LLVector4a));
 
             U32 tex_size = (buf_positions_copied * sizeof(LLVector2) + 0xF)&~0xF;
             LLVector4a::memcpyNonAliased16((F32*)new_face.mTexCoords, (F32*)buffer_tex_coords, tex_size);