From 05a23f8dbaa45c64bcf6c55dd09a468ba2b1f144 Mon Sep 17 00:00:00 2001 From: Dave Parks Date: Fri, 21 May 2010 04:49:12 -0500 Subject: Vectorized memcpy. 16-byte aligned vertex buffers. (almost) fully vectorized avatar vertex buffer updating --- index buffers still need to be vectorized --- indra/newview/lldrawpoolavatar.cpp | 4 +- indra/newview/llpolymesh.cpp | 25 ++++-- indra/newview/llviewerjointmesh.cpp | 169 +++++++++++++----------------------- 3 files changed, 79 insertions(+), 119 deletions(-) (limited to 'indra/newview') diff --git a/indra/newview/lldrawpoolavatar.cpp b/indra/newview/lldrawpoolavatar.cpp index d1f4be71f5..1e9053239d 100644 --- a/indra/newview/lldrawpoolavatar.cpp +++ b/indra/newview/lldrawpoolavatar.cpp @@ -1542,7 +1542,7 @@ LLVertexBufferAvatar::LLVertexBufferAvatar() void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const { - if (sRenderingSkinned) +/* if (sRenderingSkinned) { U8* base = useVBOs() ? NULL : mMappedData; @@ -1562,7 +1562,7 @@ void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const set_vertex_clothing_weights(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::AVATAR_CLOTHING], mStride, (LLVector4*)(base + mOffsets[TYPE_CLOTHWEIGHT])); } } - else + else*/ { LLVertexBuffer::setupVertexBuffer(data_mask); } diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp index b8bdbfb2f8..98c0191397 100644 --- a/indra/newview/llpolymesh.cpp +++ b/indra/newview/llpolymesh.cpp @@ -140,7 +140,7 @@ void LLPolyMeshSharedData::freeMeshData() delete [] mDetailTexCoords; mDetailTexCoords = NULL; - delete [] mWeights; + _mm_free(mWeights); mWeights = NULL; } @@ -230,7 +230,7 @@ BOOL LLPolyMeshSharedData::allocateVertexData( U32 numVertices ) mBaseBinormals = new LLVector3[ numVertices ]; mTexCoords = new LLVector2[ numVertices ]; mDetailTexCoords = new LLVector2[ numVertices ]; - mWeights = new F32[ numVertices ]; + mWeights = (F32*) _mm_malloc((numVertices*sizeof(F32)+0xF) & ~0xF, 16); for (i = 0; i < numVertices; i++) { mWeights[i] = 0.f; @@ -717,13 +717,20 @@ LLPolyMesh::LLPolyMesh(LLPolyMeshSharedData *shared_data, LLPolyMesh *reference_ //use aligned vertex data to make LLPolyMesh SSE friendly mVertexData = (F32*) _mm_malloc(nfloats*4, 16); int offset = 0; - mCoords = (LLVector4*)(mVertexData + offset); offset += 4*nverts; - mNormals = (LLVector4*)(mVertexData + offset); offset += 4*nverts; - mScaledNormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; - mBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; - mScaledBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; - mTexCoords = (LLVector2*)(mVertexData + offset); offset += 2*nverts; - mClothingWeights = (LLVector4*)(mVertexData + offset); offset += 4*nverts; + + //all members must be 16-byte aligned except the last 3 + mCoords = (LLVector4*)(mVertexData + offset); offset += 4*nverts; + mNormals = (LLVector4*)(mVertexData + offset); offset += 4*nverts; + mClothingWeights = (LLVector4*)(mVertexData + offset); offset += 4*nverts; + mTexCoords = (LLVector2*)(mVertexData + offset); offset += 2*nverts; + + // these members don't need to be 16-byte aligned, but the first one might be + // read during an aligned memcpy of mTexCoords + mScaledNormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; + mBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; + mScaledBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts; + + #else mCoords = new LLVector3[mSharedData->mNumVertices]; mNormals = new LLVector3[mSharedData->mNumVertices]; diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp index 236ad98d68..a7e7bfadd6 100644 --- a/indra/newview/llviewerjointmesh.cpp +++ b/indra/newview/llviewerjointmesh.cpp @@ -655,6 +655,9 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy) //----------------------------------------------------------------------------- void LLViewerJointMesh::updateFaceSizes(U32 &num_vertices, U32& num_indices, F32 pixel_area) { + //bump num_vertices to next multiple of 4 + num_vertices = (num_vertices + 0x3) & ~0x3; + // Do a pre-alloc pass to determine sizes of data. if (mMesh && mValid) { @@ -677,6 +680,8 @@ static LLFastTimer::DeclareTimer FTM_AVATAR_FACE("Avatar Face"); void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_wind, bool terse_update) { + //IF THIS FUNCTION BREAKS, SEE LLPOLYMESH CONSTRUCTOR AND CHECK ALIGNMENT OF INPUT ARRAYS + mFace = face; if (mFace->mVertexBuffer.isNull()) @@ -684,6 +689,16 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w return; } + LLDrawPool *poolp = mFace->getPool(); + BOOL hardware_skinning = (poolp && poolp->getVertexShaderLevel() > 0) ? TRUE : FALSE; + + if (!hardware_skinning && terse_update) + { //no need to do terse updates if we're doing software vertex skinning + // since mMesh is being copied into mVertexBuffer every frame + return; + } + + LLFastTimer t(FTM_AVATAR_FACE); LLStrider verticesp; @@ -696,108 +711,52 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w // Copy data into the faces from the polymesh data. if (mMesh && mValid) { - if (mMesh->getNumVertices()) + const U32 num_verts = mMesh->getNumVertices(); + + if (num_verts) { - stop_glerror(); face->getGeometryAvatar(verticesp, normalsp, tex_coordsp, vertex_weightsp, clothing_weightsp); - stop_glerror(); face->mVertexBuffer->getIndexStrider(indicesp); - stop_glerror(); verticesp += mMesh->mFaceVertexOffset; - tex_coordsp += mMesh->mFaceVertexOffset; normalsp += mMesh->mFaceVertexOffset; - vertex_weightsp += mMesh->mFaceVertexOffset; - clothing_weightsp += mMesh->mFaceVertexOffset; - - const U32* __restrict coords = (U32*) mMesh->getCoords(); - const U32* __restrict tex_coords = (U32*) mMesh->getTexCoords(); - const U32* __restrict normals = (U32*) mMesh->getNormals(); - const U32* __restrict weights = (U32*) mMesh->getWeights(); - const U32* __restrict cloth_weights = (U32*) mMesh->getClothingWeights(); - - const U32 num_verts = mMesh->getNumVertices(); - - U32 i = 0; - - const U32 skip = verticesp.getSkip()/sizeof(U32); + + F32* v = (F32*) verticesp.get(); + F32* n = (F32*) normalsp.get(); + + U32 words = num_verts*4; - U32* __restrict v = (U32*) verticesp.get(); - U32* __restrict n = (U32*) normalsp.get(); + LLVector4a::memcpyNonAliased16(v, (F32*) mMesh->getCoords(), words); + LLVector4a::memcpyNonAliased16(n, (F32*) mMesh->getNormals(), words); + - if (terse_update) + if (!terse_update) { - for (S32 i = num_verts; i > 0; --i) - { - //morph target application only, only update positions and normals - v[0] = coords[0]; - v[1] = coords[1]; - v[2] = coords[2]; - coords += 4; - v += skip; - } + vertex_weightsp += mMesh->mFaceVertexOffset; + clothing_weightsp += mMesh->mFaceVertexOffset; + tex_coordsp += mMesh->mFaceVertexOffset; + + F32* tc = (F32*) tex_coordsp.get(); + F32* vw = (F32*) vertex_weightsp.get(); + F32* cw = (F32*) clothing_weightsp.get(); - for (S32 i = num_verts; i > 0; --i) - { - n[0] = normals[0]; - n[1] = normals[1]; - n[2] = normals[2]; - normals += 4; - n += skip; - } + LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), num_verts*2); + LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), num_verts); + LLVector4a::memcpyNonAliased16(cw, (F32*) mMesh->getClothingWeights(), num_verts*4); } - else - { - - U32* __restrict tc = (U32*) tex_coordsp.get(); - U32* __restrict vw = (U32*) vertex_weightsp.get(); - U32* __restrict cw = (U32*) clothing_weightsp.get(); - - do - { - v[0] = coords[0]; - v[1] = coords[1]; - v[2] = coords[2]; - coords += 4; - v += skip; - - tc[0] = *(tex_coords++); - tc[1] = *(tex_coords++); - tc += skip; - - n[0] = normals[0]; - n[1] = normals[1]; - n[2] = normals[2]; - normals += 4; - n += skip; - - vw[0] = *(weights++); - vw += skip; - - cw[0] = *(cloth_weights++); - cw[1] = *(cloth_weights++); - cw[2] = *(cloth_weights++); - cw[3] = *(cloth_weights++); - cw += skip; - } - while (++i < num_verts); - - const U32 idx_count = mMesh->getNumFaces()*3; - indicesp += mMesh->mFaceIndexOffset; + const U32 idx_count = mMesh->getNumFaces()*3; - U16* __restrict idx = indicesp.get(); - S32* __restrict src_idx = (S32*) mMesh->getFaces(); + indicesp += mMesh->mFaceIndexOffset; - i = 0; + U16* __restrict idx = indicesp.get(); + S32* __restrict src_idx = (S32*) mMesh->getFaces(); - const S32 offset = (S32) mMesh->mFaceVertexOffset; + const S32 offset = (S32) mMesh->mFaceVertexOffset; - do - { - *(idx++) = *(src_idx++)+offset; - } - while (++i < idx_count); + for (S32 i = 0; i < idx_count; ++i) + { + *(idx++) = *(src_idx++)+offset; } } } @@ -824,50 +783,44 @@ void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh) buffer->getVertexStrider(o_vertices, 0); buffer->getNormalStrider(o_normals, 0); - //F32 last_weight = F32_MAX; - LLMatrix4a gBlendMat; + F32* __restrict vert = o_vertices[0].mV; + F32* __restrict norm = o_normals[0].mV; const F32* __restrict weights = mMesh->getWeights(); const LLVector4a* __restrict coords = (LLVector4a*) mMesh->getCoords(); const LLVector4a* __restrict normals = (LLVector4a*) mMesh->getNormals(); + U32 offset = mMesh->mFaceVertexOffset*4; + vert += offset; + norm += offset; + for (U32 index = 0; index < mMesh->getNumVertices(); index++) { - U32 bidx = index + mMesh->mFaceVertexOffset; - - // blend by first matrix - F32 w = weights[index]; - - //LLVector4a coord; - //coord.load4a(coords[index].mV); + // equivalent to joint = floorf(weights[index]); + S32 joint = _mm_cvtt_ss2si(_mm_load_ss(weights+index)); + F32 w = weights[index] - joint; - //LLVector4a norm; - //norm.load4a(normals[index].mV); + LLMatrix4a gBlendMat; - S32 joint = llfloor(w); - w -= joint; - - if (w > 0.f) + if (w != 0.f) { - // Try to keep all the accesses to the matrix data as close - // together as possible. This function is a hot spot on the - // Mac. JC + // blend between matrices and apply gBlendMat.setLerp(gJointMatAligned[joint+0], gJointMatAligned[joint+1], w); LLVector4a res; gBlendMat.affineTransform(coords[index], res); - o_vertices[bidx].setVec(res[0], res[1], res[2]); + res.store4a(vert+index*4); gBlendMat.rotate(normals[index], res); - o_normals[bidx].setVec(res[0], res[1], res[2]); + res.store4a(norm+index*4); } else { // No lerp required in this case. LLVector4a res; gJointMatAligned[joint].affineTransform(coords[index], res); - o_vertices[bidx].setVec(res[0], res[1], res[2]); + res.store4a(vert+index*4); gJointMatAligned[joint].rotate(normals[index], res); - o_normals[bidx].setVec(res[0], res[1], res[2]); + res.store4a(norm+index*4); } } -- cgit v1.3 From e90d2f88e5ce584b52b24315c85845a9e5113b50 Mon Sep 17 00:00:00 2001 From: Dave Parks Date: Fri, 21 May 2010 14:31:17 -0500 Subject: Aligned index buffers. --- indra/llrender/llvertexbuffer.cpp | 56 +++++++++++--------------------------- indra/llrender/llvertexbuffer.h | 3 +- indra/newview/lldrawpoolavatar.cpp | 19 +++++++------ 3 files changed, 29 insertions(+), 49 deletions(-) (limited to 'indra/newview') diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp index a50eb7211c..0f3c900d2b 100644 --- a/indra/llrender/llvertexbuffer.cpp +++ b/indra/llrender/llvertexbuffer.cpp @@ -453,6 +453,7 @@ LLVertexBuffer::LLVertexBuffer(U32 typemask, S32 usage) : mTypeMask = typemask; mStride = stride; mAlignedOffset = 0; + mAlignedIndexOffset = 0; sCount++; } @@ -642,16 +643,20 @@ void LLVertexBuffer::createGLIndices() mEmpty = TRUE; + //pad by 16 bytes for aligned copies + size += 16; + if (useVBOs()) { + //pad by another 16 bytes for VBO pointer adjustment + size += 16; mMappedIndexData = NULL; genIndices(); mResized = TRUE; } else { - mMappedIndexData = new U8[size]; - memset(mMappedIndexData, 0, size); + mMappedIndexData = (U8*) _mm_malloc(size, 16); static int gl_buffer_idx = 0; mGLIndices = ++gl_buffer_idx; } @@ -699,7 +704,7 @@ void LLVertexBuffer::destroyGLIndices() } else { - delete [] mMappedIndexData; + _mm_free(mMappedIndexData); mMappedIndexData = NULL; mEmpty = TRUE; } @@ -836,26 +841,10 @@ void LLVertexBuffer::resizeBuffer(S32 newnverts, S32 newnindices) } else { - //delete old buffer, keep GL buffer for now if (!useVBOs()) { - U8* old = mMappedData; - mMappedData = new U8[newsize]; - if (old) - { - memcpy(mMappedData, old, llmin(newsize, oldsize)); - if (newsize > oldsize) - { - memset(mMappedData+oldsize, 0, newsize-oldsize); - } - - delete [] old; - } - else - { - memset(mMappedData, 0, newsize); - mEmpty = TRUE; - } + _mm_free(mMappedData); + mMappedData = (U8*) _mm_malloc(newsize, 16); } mResized = TRUE; } @@ -875,24 +864,8 @@ void LLVertexBuffer::resizeBuffer(S32 newnverts, S32 newnindices) { if (!useVBOs()) { - //delete old buffer, keep GL buffer for now - U8* old = mMappedIndexData; - mMappedIndexData = new U8[new_index_size]; - - if (old) - { - memcpy(mMappedIndexData, old, llmin(new_index_size, old_index_size)); - if (new_index_size > old_index_size) - { - memset(mMappedIndexData+old_index_size, 0, new_index_size - old_index_size); - } - delete [] old; - } - else - { - memset(mMappedIndexData, 0, new_index_size); - mEmpty = TRUE; - } + _mm_free(mMappedIndexData); + mMappedIndexData = (U8*) _mm_malloc(new_index_size, 16); } mResized = TRUE; } @@ -958,7 +931,10 @@ U8* LLVertexBuffer::mapBuffer(S32 access) } { LLMemType mt_v(LLMemType::MTYPE_VERTEX_MAP_BUFFER_INDICES); - mMappedIndexData = (U8*) glMapBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB); + U8* src = (U8*) glMapBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB); + mMappedIndexData = LL_NEXT_ALIGNED_ADDRESS(src); + mAlignedIndexOffset = mMappedIndexData - src; + stop_glerror(); } diff --git a/indra/llrender/llvertexbuffer.h b/indra/llrender/llvertexbuffer.h index 03799af978..c6fd0a9e3c 100644 --- a/indra/llrender/llvertexbuffer.h +++ b/indra/llrender/llvertexbuffer.h @@ -187,7 +187,7 @@ public: S32 getRequestedVerts() const { return mRequestedNumVerts; } S32 getRequestedIndices() const { return mRequestedNumIndices; } - U8* getIndicesPointer() const { return useVBOs() ? NULL : mMappedIndexData; } + U8* getIndicesPointer() const { return useVBOs() ? (U8*) mAlignedIndexOffset : mMappedIndexData; } U8* getVerticesPointer() const { return useVBOs() ? NULL : mMappedData; } S32 getStride() const { return mStride; } U32 getTypeMask() const { return mTypeMask; } @@ -214,6 +214,7 @@ protected: S32 mRequestedNumIndices; // Number of indices requested ptrdiff_t mAlignedOffset; + ptrdiff_t mAlignedIndexOffset; S32 mStride; U32 mTypeMask; S32 mUsage; // GL usage diff --git a/indra/newview/lldrawpoolavatar.cpp b/indra/newview/lldrawpoolavatar.cpp index 1e9053239d..4fb8f5266e 100644 --- a/indra/newview/lldrawpoolavatar.cpp +++ b/indra/newview/lldrawpoolavatar.cpp @@ -1542,27 +1542,30 @@ LLVertexBufferAvatar::LLVertexBufferAvatar() void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const { -/* if (sRenderingSkinned) + if (sRenderingSkinned) { U8* base = useVBOs() ? NULL : mMappedData; - glVertexPointer(3,GL_FLOAT, mStride, (void*)(base + 0)); - glNormalPointer(GL_FLOAT, mStride, (void*)(base + mOffsets[TYPE_NORMAL])); - glTexCoordPointer(2,GL_FLOAT, mStride, (void*)(base + mOffsets[TYPE_TEXCOORD0])); + glVertexPointer(3,GL_FLOAT, LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_VERTEX], (void*)(base + 0)); + glNormalPointer(GL_FLOAT, LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_NORMAL], (void*)(base + mOffsets[TYPE_NORMAL])); + glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_TEXCOORD0], (void*)(base + mOffsets[TYPE_TEXCOORD0])); - set_vertex_weights(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::AVATAR_WEIGHT], mStride, (F32*)(base + mOffsets[TYPE_WEIGHT])); + set_vertex_weights(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::AVATAR_WEIGHT], + LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_WEIGHT], (F32*)(base + mOffsets[TYPE_WEIGHT])); if (sShaderLevel >= LLDrawPoolAvatar::SHADER_LEVEL_BUMP) { - set_binormals(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::BINORMAL], mStride, (LLVector3*)(base + mOffsets[TYPE_BINORMAL])); + set_binormals(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::BINORMAL], + LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_BINORMAL], (LLVector3*)(base + mOffsets[TYPE_BINORMAL])); } if (sShaderLevel >= LLDrawPoolAvatar::SHADER_LEVEL_CLOTH) { - set_vertex_clothing_weights(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::AVATAR_CLOTHING], mStride, (LLVector4*)(base + mOffsets[TYPE_CLOTHWEIGHT])); + set_vertex_clothing_weights(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::AVATAR_CLOTHING], + LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_CLOTHWEIGHT], (LLVector4*)(base + mOffsets[TYPE_CLOTHWEIGHT])); } } - else*/ + else { LLVertexBuffer::setupVertexBuffer(data_mask); } -- cgit v1.3