diff options
author | Dave Parks <davep@lindenlab.com> | 2010-05-21 04:49:12 -0500 |
---|---|---|
committer | Dave Parks <davep@lindenlab.com> | 2010-05-21 04:49:12 -0500 |
commit | 05a23f8dbaa45c64bcf6c55dd09a468ba2b1f144 (patch) | |
tree | 2e4ad0ae03f1c48e1a325501ebdb96bd843fde67 | |
parent | bf5f215fbc29102cfd8b5418f29ea0ed6edd14ee (diff) |
Vectorized memcpy.
16-byte aligned vertex buffers.
(almost) fully vectorized avatar vertex buffer updating --- index buffers still need to be vectorized
-rw-r--r-- | indra/llrender/llvertexbuffer.cpp | 195 | ||||
-rw-r--r-- | indra/llrender/llvertexbuffer.h | 7 | ||||
-rw-r--r-- | indra/newview/lldrawpoolavatar.cpp | 4 | ||||
-rw-r--r-- | indra/newview/llpolymesh.cpp | 25 | ||||
-rw-r--r-- | indra/newview/llviewerjointmesh.cpp | 169 |
5 files changed, 273 insertions, 127 deletions
diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp index 7fa47cd171..a50eb7211c 100644 --- a/indra/llrender/llvertexbuffer.cpp +++ b/indra/llrender/llvertexbuffer.cpp @@ -39,6 +39,7 @@ #include "llglheaders.h" #include "llmemtype.h" #include "llrender.h" +#include "llvector4a.h" //============================================================================ @@ -66,6 +67,27 @@ S32 LLVertexBuffer::sWeight4Loc = -1; std::vector<U32> LLVertexBuffer::sDeleteList; +#define LL_ALIGNED_VB 1 + +#if LL_ALIGNED_VB + +S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] = +{ + sizeof(LLVector4), // TYPE_VERTEX, + sizeof(LLVector4), // TYPE_NORMAL, + sizeof(LLVector2), // TYPE_TEXCOORD0, + sizeof(LLVector2), // TYPE_TEXCOORD1, + sizeof(LLVector2), // TYPE_TEXCOORD2, + sizeof(LLVector2), // TYPE_TEXCOORD3, + sizeof(LLColor4U), // TYPE_COLOR, + sizeof(LLVector4), // TYPE_BINORMAL, + sizeof(F32), // TYPE_WEIGHT, + sizeof(LLVector4), // TYPE_WEIGHT4, + sizeof(LLVector4), // TYPE_CLOTHWEIGHT, +}; + +#else + S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] = { sizeof(LLVector3), // TYPE_VERTEX, @@ -81,6 +103,8 @@ S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] = sizeof(LLVector4), // TYPE_CLOTHWEIGHT, }; +#endif + U32 LLVertexBuffer::sGLMode[LLRender::NUM_MODES] = { GL_TRIANGLES, @@ -428,11 +452,41 @@ LLVertexBuffer::LLVertexBuffer(U32 typemask, S32 usage) : mTypeMask = typemask; mStride = stride; + mAlignedOffset = 0; + sCount++; } +#if LL_ALIGNED_VB +//static +S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets, S32 num_vertices) +{ + S32 offset = 0; + for (S32 i=0; i<TYPE_MAX; i++) + { + U32 mask = 1<<i; + if (typemask & mask) + { + if (offsets) + { + offsets[i] = offset; + offset += LLVertexBuffer::sTypeOffsets[i]*num_vertices; + offset = (offset + 0xF) & ~0xF; + } + } + } + + return offset+16; +} + +S32 LLVertexBuffer::getSize() const +{ + return mStride; +} + +#else //static -S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets) +S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets, S32 num_vertices) { S32 stride = 0; for (S32 i=0; i<TYPE_MAX; i++) @@ -451,6 +505,12 @@ S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets) return stride; } +S32 LLVertexBuffer::getSize() const +{ + return mNumVerts*mStride; +} + +#endif // protected, use unref() //virtual LLVertexBuffer::~LLVertexBuffer() @@ -560,7 +620,7 @@ void LLVertexBuffer::createGLBuffer() { static int gl_buffer_idx = 0; mGLBuffer = ++gl_buffer_idx; - mMappedData = new U8[size]; + mMappedData = (U8*) _mm_malloc(size, 16); memset(mMappedData, 0, size); } } @@ -612,7 +672,7 @@ void LLVertexBuffer::destroyGLBuffer() } else { - delete [] mMappedData; + _mm_free(mMappedData); mMappedData = NULL; mEmpty = TRUE; } @@ -664,7 +724,7 @@ void LLVertexBuffer::updateNumVerts(S32 nverts) } mRequestedNumVerts = nverts; - + if (!mDynamicSize) { mNumVerts = nverts; @@ -679,6 +739,9 @@ void LLVertexBuffer::updateNumVerts(S32 nverts) } mNumVerts = nverts; } +#if LL_ALIGNED_VB + mStride = calcStride(mTypeMask, mOffsets, mNumVerts); +#endif } @@ -886,7 +949,11 @@ U8* LLVertexBuffer::mapBuffer(S32 access) setBuffer(0); mLocked = TRUE; stop_glerror(); - mMappedData = (U8*) glMapBufferARB(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB); + + U8* src = (U8*) glMapBufferARB(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB); + mMappedData = LL_NEXT_ALIGNED_ADDRESS<U8>(src); + mAlignedOffset = mMappedData - src; + stop_glerror(); } { @@ -975,6 +1042,45 @@ void LLVertexBuffer::unmapBuffer() //---------------------------------------------------------------------------- +#if LL_ALIGNED_VB + +template <class T,S32 type> struct VertexBufferStrider +{ + typedef LLStrider<T> strider_t; + static bool get(LLVertexBuffer& vbo, + strider_t& strider, + S32 index) + { + if (vbo.mapBuffer() == NULL) + { + llwarns << "mapBuffer failed!" << llendl; + return FALSE; + } + + if (type == LLVertexBuffer::TYPE_INDEX) + { + S32 stride = sizeof(T); + strider = (T*)(vbo.getMappedIndices() + index*stride); + strider.setStride(0); + return TRUE; + } + else if (vbo.hasDataType(type)) + { + S32 stride = LLVertexBuffer::sTypeOffsets[type]; + strider = (T*)(vbo.getMappedData() + vbo.getOffset(type)+index*stride); + strider.setStride(stride); + return TRUE; + } + else + { + llerrs << "VertexBufferStrider could not find valid vertex data." << llendl; + } + return FALSE; + } +}; + +#else + template <class T,S32 type> struct VertexBufferStrider { typedef LLStrider<T> strider_t; @@ -1010,6 +1116,7 @@ template <class T,S32 type> struct VertexBufferStrider } }; +#endif bool LLVertexBuffer::getVertexStrider(LLStrider<LLVector3>& strider, S32 index) { @@ -1272,6 +1379,82 @@ void LLVertexBuffer::setBuffer(U32 data_mask) } } +#if LL_ALIGNED_VB + +// virtual (default) +void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const +{ + LLMemType mt2(LLMemType::MTYPE_VERTEX_SETUP_VERTEX_BUFFER); + stop_glerror(); + U8* base = useVBOs() ? (U8*) mAlignedOffset : mMappedData; + + if ((data_mask & mTypeMask) != data_mask) + { + llerrs << "LLVertexBuffer::setupVertexBuffer missing required components for supplied data mask." << llendl; + } + + + if (data_mask & MAP_NORMAL) + { + glNormalPointer(GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_NORMAL], (void*)(base + mOffsets[TYPE_NORMAL])); + } + if (data_mask & MAP_TEXCOORD3) + { + glClientActiveTextureARB(GL_TEXTURE3_ARB); + glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD3], (void*)(base + mOffsets[TYPE_TEXCOORD3])); + glClientActiveTextureARB(GL_TEXTURE0_ARB); + } + if (data_mask & MAP_TEXCOORD2) + { + glClientActiveTextureARB(GL_TEXTURE2_ARB); + glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD2], (void*)(base + mOffsets[TYPE_TEXCOORD2])); + glClientActiveTextureARB(GL_TEXTURE0_ARB); + } + if (data_mask & MAP_TEXCOORD1) + { + glClientActiveTextureARB(GL_TEXTURE1_ARB); + glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD1], (void*)(base + mOffsets[TYPE_TEXCOORD1])); + glClientActiveTextureARB(GL_TEXTURE0_ARB); + } + if (data_mask & MAP_BINORMAL) + { + glClientActiveTextureARB(GL_TEXTURE2_ARB); + glTexCoordPointer(3,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_BINORMAL], (void*)(base + mOffsets[TYPE_BINORMAL])); + glClientActiveTextureARB(GL_TEXTURE0_ARB); + } + if (data_mask & MAP_TEXCOORD0) + { + glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD0], (void*)(base + mOffsets[TYPE_TEXCOORD0])); + } + if (data_mask & MAP_COLOR) + { + glColorPointer(4, GL_UNSIGNED_BYTE, LLVertexBuffer::sTypeOffsets[TYPE_COLOR], (void*)(base + mOffsets[TYPE_COLOR])); + } + + if (data_mask & MAP_WEIGHT) + { + glVertexAttribPointerARB(1, 1, GL_FLOAT, FALSE, LLVertexBuffer::sTypeOffsets[TYPE_WEIGHT], (void*)(base + mOffsets[TYPE_WEIGHT])); + } + + if (data_mask & MAP_WEIGHT4 && sWeight4Loc != -1) + { + glVertexAttribPointerARB(sWeight4Loc, 4, GL_FLOAT, FALSE, LLVertexBuffer::sTypeOffsets[TYPE_WEIGHT4], (void*)(base+mOffsets[TYPE_WEIGHT4])); + } + + if (data_mask & MAP_CLOTHWEIGHT) + { + glVertexAttribPointerARB(4, 4, GL_FLOAT, TRUE, LLVertexBuffer::sTypeOffsets[TYPE_CLOTHWEIGHT], (void*)(base + mOffsets[TYPE_CLOTHWEIGHT])); + } + if (data_mask & MAP_VERTEX) + { + glVertexPointer(3,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_VERTEX], (void*)(base + 0)); + } + + llglassertok(); +} + +#else + // virtual (default) void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const { @@ -1344,6 +1527,8 @@ void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const llglassertok(); } +#endif + void LLVertexBuffer::markDirty(U32 vert_index, U32 vert_count, U32 indices_index, U32 indices_count) { // TODO: use GL_APPLE_flush_buffer_range here diff --git a/indra/llrender/llvertexbuffer.h b/indra/llrender/llvertexbuffer.h index d1700aa54a..03799af978 100644 --- a/indra/llrender/llvertexbuffer.h +++ b/indra/llrender/llvertexbuffer.h @@ -98,7 +98,7 @@ public: //if offsets is not NULL, its contents will be filled //with the offset of each vertex component in the buffer, // indexed by the following enum - static S32 calcStride(const U32& typemask, S32* offsets = NULL); + static S32 calcStride(const U32& typemask, S32* offsets = NULL, S32 num_vertices = 0); enum { TYPE_VERTEX, @@ -192,7 +192,7 @@ public: S32 getStride() const { return mStride; } U32 getTypeMask() const { return mTypeMask; } BOOL hasDataType(S32 type) const { return ((1 << type) & getTypeMask()) ? TRUE : FALSE; } - S32 getSize() const { return mNumVerts*mStride; } + S32 getSize() const; S32 getIndicesSize() const { return mNumIndices * sizeof(U16); } U8* getMappedData() const { return mMappedData; } U8* getMappedIndices() const { return mMappedIndexData; } @@ -213,6 +213,7 @@ protected: S32 mRequestedNumVerts; // Number of vertices requested S32 mRequestedNumIndices; // Number of indices requested + ptrdiff_t mAlignedOffset; S32 mStride; U32 mTypeMask; S32 mUsage; // GL usage @@ -227,7 +228,7 @@ protected: S32 mOffsets[TYPE_MAX]; BOOL mResized; // if TRUE, client buffer has been resized and GL buffer has not BOOL mDynamicSize; // if TRUE, buffer has been resized at least once (and should be padded) - + class DirtyRegion { public: diff --git a/indra/newview/lldrawpoolavatar.cpp b/indra/newview/lldrawpoolavatar.cpp index d1f4be71f5..1e9053239d 100644 --- a/indra/newview/lldrawpoolavatar.cpp +++ b/indra/newview/lldrawpoolavatar.cpp @@ -1542,7 +1542,7 @@ LLVertexBufferAvatar::LLVertexBufferAvatar() void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const { - if (sRenderingSkinned) +/* if (sRenderingSkinned) { U8* base = useVBOs() ? NULL : mMappedData; @@ -1562,7 +1562,7 @@ void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const set_vertex_clothing_weights(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::AVATAR_CLOTHING], mStride, (LLVector4*)(base + mOffsets[TYPE_CLOTHWEIGHT])); } } - else + else*/ { LLVertexBuffer::setupVertexBuffer(data_mask); } diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp index b8bdbfb2f8..98c0191397 100644 --- a/indra/newview/llpolymesh.cpp +++ b/indra/newview/llpolymesh.cpp @@ -140,7 +140,7 @@ void LLPolyMeshSharedData::freeMeshData() delete [] mDetailTexCoords; mDetailTexCoords = NULL; - delete [] mWeights; + _mm_free(mWeights); mWeights = NULL; } @@ -230,7 +230,7 @@ BOOL LLPolyMeshSharedData::allocateVertexData( U32 numVertices ) mBaseBinormals = new LLVector3[ numVertices ]; mTexCoords = new LLVector2[ numVertices ]; mDetailTexCoords = new LLVector2[ numVertices ]; - mWeights = new F32[ numVertices ]; + mWeights = (F32*) _mm_malloc((numVertices*sizeof(F32)+0xF) & ~0xF, 16); for (i = 0; i < numVertices; i++) { mWeights[i] = 0.f; @@ -717,13 +717,20 @@ LLPolyMesh::LLPolyMesh(LLPolyMeshSharedData *shared_data, LLPolyMesh *reference_ //use aligned vertex data to make LLPolyMesh SSE friendly mVertexData = (F32*) _mm_malloc(nfloats*4, 16); int offset = 0; - mCoords = (LLVector4*)(mVertexData + offset); offset += 4*nverts; - mNormals = (LLVector4*)(mVertexData + offset); offset += 4*nverts; - mScaledNormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; - mBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; - mScaledBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; - mTexCoords = (LLVector2*)(mVertexData + offset); offset += 2*nverts; - mClothingWeights = (LLVector4*)(mVertexData + offset); offset += 4*nverts; + + //all members must be 16-byte aligned except the last 3 + mCoords = (LLVector4*)(mVertexData + offset); offset += 4*nverts; + mNormals = (LLVector4*)(mVertexData + offset); offset += 4*nverts; + mClothingWeights = (LLVector4*)(mVertexData + offset); offset += 4*nverts; + mTexCoords = (LLVector2*)(mVertexData + offset); offset += 2*nverts; + + // these members don't need to be 16-byte aligned, but the first one might be + // read during an aligned memcpy of mTexCoords + mScaledNormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; + mBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; + mScaledBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; + + #else mCoords = new LLVector3[mSharedData->mNumVertices]; mNormals = new LLVector3[mSharedData->mNumVertices]; diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp index 236ad98d68..a7e7bfadd6 100644 --- a/indra/newview/llviewerjointmesh.cpp +++ b/indra/newview/llviewerjointmesh.cpp @@ -655,6 +655,9 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy) //----------------------------------------------------------------------------- void LLViewerJointMesh::updateFaceSizes(U32 &num_vertices, U32& num_indices, F32 pixel_area) { + //bump num_vertices to next multiple of 4 + num_vertices = (num_vertices + 0x3) & ~0x3; + // Do a pre-alloc pass to determine sizes of data. if (mMesh && mValid) { @@ -677,6 +680,8 @@ static LLFastTimer::DeclareTimer FTM_AVATAR_FACE("Avatar Face"); void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_wind, bool terse_update) { + //IF THIS FUNCTION BREAKS, SEE LLPOLYMESH CONSTRUCTOR AND CHECK ALIGNMENT OF INPUT ARRAYS + mFace = face; if (mFace->mVertexBuffer.isNull()) @@ -684,6 +689,16 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w return; } + LLDrawPool *poolp = mFace->getPool(); + BOOL hardware_skinning = (poolp && poolp->getVertexShaderLevel() > 0) ? TRUE : FALSE; + + if (!hardware_skinning && terse_update) + { //no need to do terse updates if we're doing software vertex skinning + // since mMesh is being copied into mVertexBuffer every frame + return; + } + + LLFastTimer t(FTM_AVATAR_FACE); LLStrider<LLVector3> verticesp; @@ -696,108 +711,52 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w // Copy data into the faces from the polymesh data. if (mMesh && mValid) { - if (mMesh->getNumVertices()) + const U32 num_verts = mMesh->getNumVertices(); + + if (num_verts) { - stop_glerror(); face->getGeometryAvatar(verticesp, normalsp, tex_coordsp, vertex_weightsp, clothing_weightsp); - stop_glerror(); face->mVertexBuffer->getIndexStrider(indicesp); - stop_glerror(); verticesp += mMesh->mFaceVertexOffset; - tex_coordsp += mMesh->mFaceVertexOffset; normalsp += mMesh->mFaceVertexOffset; - vertex_weightsp += mMesh->mFaceVertexOffset; - clothing_weightsp += mMesh->mFaceVertexOffset; - - const U32* __restrict coords = (U32*) mMesh->getCoords(); - const U32* __restrict tex_coords = (U32*) mMesh->getTexCoords(); - const U32* __restrict normals = (U32*) mMesh->getNormals(); - const U32* __restrict weights = (U32*) mMesh->getWeights(); - const U32* __restrict cloth_weights = (U32*) mMesh->getClothingWeights(); - - const U32 num_verts = mMesh->getNumVertices(); - - U32 i = 0; - - const U32 skip = verticesp.getSkip()/sizeof(U32); + + F32* v = (F32*) verticesp.get(); + F32* n = (F32*) normalsp.get(); + + U32 words = num_verts*4; - U32* __restrict v = (U32*) verticesp.get(); - U32* __restrict n = (U32*) normalsp.get(); + LLVector4a::memcpyNonAliased16(v, (F32*) mMesh->getCoords(), words); + LLVector4a::memcpyNonAliased16(n, (F32*) mMesh->getNormals(), words); + - if (terse_update) + if (!terse_update) { - for (S32 i = num_verts; i > 0; --i) - { - //morph target application only, only update positions and normals - v[0] = coords[0]; - v[1] = coords[1]; - v[2] = coords[2]; - coords += 4; - v += skip; - } + vertex_weightsp += mMesh->mFaceVertexOffset; + clothing_weightsp += mMesh->mFaceVertexOffset; + tex_coordsp += mMesh->mFaceVertexOffset; + + F32* tc = (F32*) tex_coordsp.get(); + F32* vw = (F32*) vertex_weightsp.get(); + F32* cw = (F32*) clothing_weightsp.get(); - for (S32 i = num_verts; i > 0; --i) - { - n[0] = normals[0]; - n[1] = normals[1]; - n[2] = normals[2]; - normals += 4; - n += skip; - } + LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), num_verts*2); + LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), num_verts); + LLVector4a::memcpyNonAliased16(cw, (F32*) mMesh->getClothingWeights(), num_verts*4); } - else - { - - U32* __restrict tc = (U32*) tex_coordsp.get(); - U32* __restrict vw = (U32*) vertex_weightsp.get(); - U32* __restrict cw = (U32*) clothing_weightsp.get(); - - do - { - v[0] = coords[0]; - v[1] = coords[1]; - v[2] = coords[2]; - coords += 4; - v += skip; - - tc[0] = *(tex_coords++); - tc[1] = *(tex_coords++); - tc += skip; - - n[0] = normals[0]; - n[1] = normals[1]; - n[2] = normals[2]; - normals += 4; - n += skip; - - vw[0] = *(weights++); - vw += skip; - - cw[0] = *(cloth_weights++); - cw[1] = *(cloth_weights++); - cw[2] = *(cloth_weights++); - cw[3] = *(cloth_weights++); - cw += skip; - } - while (++i < num_verts); - - const U32 idx_count = mMesh->getNumFaces()*3; - indicesp += mMesh->mFaceIndexOffset; + const U32 idx_count = mMesh->getNumFaces()*3; - U16* __restrict idx = indicesp.get(); - S32* __restrict src_idx = (S32*) mMesh->getFaces(); + indicesp += mMesh->mFaceIndexOffset; - i = 0; + U16* __restrict idx = indicesp.get(); + S32* __restrict src_idx = (S32*) mMesh->getFaces(); - const S32 offset = (S32) mMesh->mFaceVertexOffset; + const S32 offset = (S32) mMesh->mFaceVertexOffset; - do - { - *(idx++) = *(src_idx++)+offset; - } - while (++i < idx_count); + for (S32 i = 0; i < idx_count; ++i) + { + *(idx++) = *(src_idx++)+offset; } } } @@ -824,50 +783,44 @@ void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh) buffer->getVertexStrider(o_vertices, 0); buffer->getNormalStrider(o_normals, 0); - //F32 last_weight = F32_MAX; - LLMatrix4a gBlendMat; + F32* __restrict vert = o_vertices[0].mV; + F32* __restrict norm = o_normals[0].mV; const F32* __restrict weights = mMesh->getWeights(); const LLVector4a* __restrict coords = (LLVector4a*) mMesh->getCoords(); const LLVector4a* __restrict normals = (LLVector4a*) mMesh->getNormals(); + U32 offset = mMesh->mFaceVertexOffset*4; + vert += offset; + norm += offset; + for (U32 index = 0; index < mMesh->getNumVertices(); index++) { - U32 bidx = index + mMesh->mFaceVertexOffset; - - // blend by first matrix - F32 w = weights[index]; - - //LLVector4a coord; - //coord.load4a(coords[index].mV); + // equivalent to joint = floorf(weights[index]); + S32 joint = _mm_cvtt_ss2si(_mm_load_ss(weights+index));
+ F32 w = weights[index] - joint; - //LLVector4a norm; - //norm.load4a(normals[index].mV); + LLMatrix4a gBlendMat; - S32 joint = llfloor(w); - w -= joint; - - if (w > 0.f) + if (w != 0.f) { - // Try to keep all the accesses to the matrix data as close - // together as possible. This function is a hot spot on the - // Mac. JC + // blend between matrices and apply gBlendMat.setLerp(gJointMatAligned[joint+0], gJointMatAligned[joint+1], w); LLVector4a res; gBlendMat.affineTransform(coords[index], res); - o_vertices[bidx].setVec(res[0], res[1], res[2]); + res.store4a(vert+index*4); gBlendMat.rotate(normals[index], res); - o_normals[bidx].setVec(res[0], res[1], res[2]); + res.store4a(norm+index*4); } else { // No lerp required in this case. LLVector4a res; gJointMatAligned[joint].affineTransform(coords[index], res); - o_vertices[bidx].setVec(res[0], res[1], res[2]); + res.store4a(vert+index*4); gJointMatAligned[joint].rotate(normals[index], res); - o_normals[bidx].setVec(res[0], res[1], res[2]); + res.store4a(norm+index*4); } } |