summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Dave Parks <davep@lindenlab.com> 2010-05-21 04:49:12 -0500
committer Dave Parks <davep@lindenlab.com> 2010-05-21 04:49:12 -0500
commit 05a23f8dbaa45c64bcf6c55dd09a468ba2b1f144 (patch)
tree 2e4ad0ae03f1c48e1a325501ebdb96bd843fde67
parent bf5f215fbc29102cfd8b5418f29ea0ed6edd14ee (diff)
Vectorized memcpy.
16-byte aligned vertex buffers. (almost) fully vectorized avatar vertex buffer updating --- index buffers still need to be vectorized
-rw-r--r-- indra/llrender/llvertexbuffer.cpp 195
-rw-r--r-- indra/llrender/llvertexbuffer.h 7
-rw-r--r-- indra/newview/lldrawpoolavatar.cpp 4
-rw-r--r-- indra/newview/llpolymesh.cpp 25
-rw-r--r-- indra/newview/llviewerjointmesh.cpp 169
5 files changed, 273 insertions, 127 deletions
diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp
index 7fa47cd171..a50eb7211c 100644
--- a/indra/llrender/llvertexbuffer.cpp
+++ b/indra/llrender/llvertexbuffer.cpp
@@ -39,6 +39,7 @@
#include "llglheaders.h"
#include "llmemtype.h"
#include "llrender.h"
+#include "llvector4a.h"
//============================================================================
@@ -66,6 +67,27 @@ S32 LLVertexBuffer::sWeight4Loc = -1;
std::vector<U32> LLVertexBuffer::sDeleteList;
+#define LL_ALIGNED_VB 1
+
+#if LL_ALIGNED_VB
+
+S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] =
+{
+ sizeof(LLVector4), // TYPE_VERTEX,
+ sizeof(LLVector4), // TYPE_NORMAL,
+ sizeof(LLVector2), // TYPE_TEXCOORD0,
+ sizeof(LLVector2), // TYPE_TEXCOORD1,
+ sizeof(LLVector2), // TYPE_TEXCOORD2,
+ sizeof(LLVector2), // TYPE_TEXCOORD3,
+ sizeof(LLColor4U), // TYPE_COLOR,
+ sizeof(LLVector4), // TYPE_BINORMAL,
+ sizeof(F32), // TYPE_WEIGHT,
+ sizeof(LLVector4), // TYPE_WEIGHT4,
+ sizeof(LLVector4), // TYPE_CLOTHWEIGHT,
+};
+
+#else
+
S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] =
{
sizeof(LLVector3), // TYPE_VERTEX,
@@ -81,6 +103,8 @@ S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] =
sizeof(LLVector4), // TYPE_CLOTHWEIGHT,
};
+#endif
+
U32 LLVertexBuffer::sGLMode[LLRender::NUM_MODES] =
{
GL_TRIANGLES,
@@ -428,11 +452,41 @@ LLVertexBuffer::LLVertexBuffer(U32 typemask, S32 usage) :
mTypeMask = typemask;
mStride = stride;
+ mAlignedOffset = 0;
+
sCount++;
}
+#if LL_ALIGNED_VB
+//static -- aligned layout: every component enabled in typemask gets its own 16-byte-aligned array of num_vertices elements; returns the total byte size plus 16
+S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets, S32 num_vertices)
+{
+ S32 offset = 0; // running byte offset of the next component array
+ for (S32 i=0; i<TYPE_MAX; i++)
+ {
+ U32 mask = 1<<i;
+ if (typemask & mask)
+ {
+ if (offsets) // offsets is optional (may be NULL); only the recording is conditional
+ {
+ offsets[i] = offset;
+ }
+ offset += LLVertexBuffer::sTypeOffsets[i]*num_vertices; // always advance, so a NULL offsets still yields the real size
+ offset = (offset + 0xF) & ~0xF; // round up so the next component array starts on a 16-byte boundary
+ }
+ }
+ // NOTE(review): the extra 16 bytes are presumably slack for the pointer bump mapBuffer() applies to reach an aligned address -- confirm
+ return offset+16;
+}
+
+S32 LLVertexBuffer::getSize() const // aligned build: updateNumVerts() refreshes mStride from calcStride(mTypeMask, mOffsets, mNumVerts)
+{
+ return mStride; // NOTE(review): presumably the whole-buffer byte size here rather than a per-vertex stride -- confirm against the constructor's initial mStride
+}
+
+#else
//static
-S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets)
+S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets, S32 num_vertices)
{
S32 stride = 0;
for (S32 i=0; i<TYPE_MAX; i++)
@@ -451,6 +505,12 @@ S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets)
return stride;
}
+S32 LLVertexBuffer::getSize() const
+{
+ return mNumVerts*mStride;
+}
+
+#endif
// protected, use unref()
//virtual
LLVertexBuffer::~LLVertexBuffer()
@@ -560,7 +620,7 @@ void LLVertexBuffer::createGLBuffer()
{
static int gl_buffer_idx = 0;
mGLBuffer = ++gl_buffer_idx;
- mMappedData = new U8[size];
+ mMappedData = (U8*) _mm_malloc(size, 16);
memset(mMappedData, 0, size);
}
}
@@ -612,7 +672,7 @@ void LLVertexBuffer::destroyGLBuffer()
}
else
{
- delete [] mMappedData;
+ _mm_free(mMappedData);
mMappedData = NULL;
mEmpty = TRUE;
}
@@ -664,7 +724,7 @@ void LLVertexBuffer::updateNumVerts(S32 nverts)
}
mRequestedNumVerts = nverts;
-
+
if (!mDynamicSize)
{
mNumVerts = nverts;
@@ -679,6 +739,9 @@ void LLVertexBuffer::updateNumVerts(S32 nverts)
}
mNumVerts = nverts;
}
+#if LL_ALIGNED_VB
+ mStride = calcStride(mTypeMask, mOffsets, mNumVerts);
+#endif
}
@@ -886,7 +949,11 @@ U8* LLVertexBuffer::mapBuffer(S32 access)
setBuffer(0);
mLocked = TRUE;
stop_glerror();
- mMappedData = (U8*) glMapBufferARB(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
+
+ U8* src = (U8*) glMapBufferARB(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
+ mMappedData = LL_NEXT_ALIGNED_ADDRESS<U8>(src);
+ mAlignedOffset = mMappedData - src;
+
stop_glerror();
}
{
@@ -975,6 +1042,45 @@ void LLVertexBuffer::unmapBuffer()
//----------------------------------------------------------------------------
+#if LL_ALIGNED_VB
+
+template <class T,S32 type> struct VertexBufferStrider // aligned-layout accessor: points an LLStrider<T> at element `index` of one component of a mapped buffer
+{
+ typedef LLStrider<T> strider_t;
+ static bool get(LLVertexBuffer& vbo, // buffer to map and read the component from
+ strider_t& strider, // out: receives the element pointer and stride
+ S32 index) // first element the strider should point at
+ {
+ if (vbo.mapBuffer() == NULL) // nothing can be addressed if mapBuffer() returned NULL
+ {
+ llwarns << "mapBuffer failed!" << llendl;
+ return FALSE;
+ }
+
+ if (type == LLVertexBuffer::TYPE_INDEX) // index data is addressed from getMappedIndices(), not the vertex store
+ {
+ S32 stride = sizeof(T);
+ strider = (T*)(vbo.getMappedIndices() + index*stride);
+ strider.setStride(0); // NOTE(review): presumably 0 makes LLStrider step by sizeof(T) -- confirm in LLStrider
+ return TRUE;
+ }
+ else if (vbo.hasDataType(type)) // component is present in the buffer's type mask
+ {
+ S32 stride = LLVertexBuffer::sTypeOffsets[type]; // per-element size of this component
+ strider = (T*)(vbo.getMappedData() + vbo.getOffset(type)+index*stride); // component's start offset plus index elements
+ strider.setStride(stride);
+ return TRUE;
+ }
+ else
+ {
+ llerrs << "VertexBufferStrider could not find valid vertex data." << llendl; // requested component is not in this buffer
+ }
+ return FALSE; // reached only after the llerrs branch above
+ }
+};
+
+#else
+
template <class T,S32 type> struct VertexBufferStrider
{
typedef LLStrider<T> strider_t;
@@ -1010,6 +1116,7 @@ template <class T,S32 type> struct VertexBufferStrider
}
};
+#endif
bool LLVertexBuffer::getVertexStrider(LLStrider<LLVector3>& strider, S32 index)
{
@@ -1272,6 +1379,82 @@ void LLVertexBuffer::setBuffer(U32 data_mask)
}
}
+#if LL_ALIGNED_VB
+
+// virtual (default) -- aligned build: hands GL one pointer per requested component, each with that component's per-element size as its stride
+void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const
+{
+ LLMemType mt2(LLMemType::MTYPE_VERTEX_SETUP_VERTEX_BUFFER);
+ stop_glerror();
+ U8* base = useVBOs() ? (U8*) mAlignedOffset : mMappedData; // VBO path: the alignment bump recorded by mapBuffer(), used as an offset; otherwise the client-side mapped data
+
+ if ((data_mask & mTypeMask) != data_mask) // every requested component must exist in this buffer
+ {
+ llerrs << "LLVertexBuffer::setupVertexBuffer missing required components for supplied data mask." << llendl;
+ }
+
+
+ if (data_mask & MAP_NORMAL)
+ {
+ glNormalPointer(GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_NORMAL], (void*)(base + mOffsets[TYPE_NORMAL]));
+ }
+ if (data_mask & MAP_TEXCOORD3)
+ {
+ glClientActiveTextureARB(GL_TEXTURE3_ARB);
+ glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD3], (void*)(base + mOffsets[TYPE_TEXCOORD3]));
+ glClientActiveTextureARB(GL_TEXTURE0_ARB); // restore unit 0 as the active client texture unit
+ }
+ if (data_mask & MAP_TEXCOORD2)
+ {
+ glClientActiveTextureARB(GL_TEXTURE2_ARB);
+ glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD2], (void*)(base + mOffsets[TYPE_TEXCOORD2]));
+ glClientActiveTextureARB(GL_TEXTURE0_ARB);
+ }
+ if (data_mask & MAP_TEXCOORD1)
+ {
+ glClientActiveTextureARB(GL_TEXTURE1_ARB);
+ glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD1], (void*)(base + mOffsets[TYPE_TEXCOORD1]));
+ glClientActiveTextureARB(GL_TEXTURE0_ARB);
+ }
+ if (data_mask & MAP_BINORMAL) // NOTE(review): binds the same client texture unit (GL_TEXTURE2_ARB) as MAP_TEXCOORD2 above; if both bits are set this later call wins -- confirm intended
+ {
+ glClientActiveTextureARB(GL_TEXTURE2_ARB);
+ glTexCoordPointer(3,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_BINORMAL], (void*)(base + mOffsets[TYPE_BINORMAL]));
+ glClientActiveTextureARB(GL_TEXTURE0_ARB);
+ }
+ if (data_mask & MAP_TEXCOORD0)
+ {
+ glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD0], (void*)(base + mOffsets[TYPE_TEXCOORD0]));
+ }
+ if (data_mask & MAP_COLOR)
+ {
+ glColorPointer(4, GL_UNSIGNED_BYTE, LLVertexBuffer::sTypeOffsets[TYPE_COLOR], (void*)(base + mOffsets[TYPE_COLOR]));
+ }
+
+ if (data_mask & MAP_WEIGHT)
+ {
+ glVertexAttribPointerARB(1, 1, GL_FLOAT, FALSE, LLVertexBuffer::sTypeOffsets[TYPE_WEIGHT], (void*)(base + mOffsets[TYPE_WEIGHT])); // attribute index 1 is hard-coded here
+ }
+
+ if (data_mask & MAP_WEIGHT4 && sWeight4Loc != -1) // skipped while sWeight4Loc still holds its -1 initial value
+ {
+ glVertexAttribPointerARB(sWeight4Loc, 4, GL_FLOAT, FALSE, LLVertexBuffer::sTypeOffsets[TYPE_WEIGHT4], (void*)(base+mOffsets[TYPE_WEIGHT4]));
+ }
+
+ if (data_mask & MAP_CLOTHWEIGHT)
+ {
+ glVertexAttribPointerARB(4, 4, GL_FLOAT, TRUE, LLVertexBuffer::sTypeOffsets[TYPE_CLOTHWEIGHT], (void*)(base + mOffsets[TYPE_CLOTHWEIGHT])); // attribute index 4 is hard-coded; normalized flag is TRUE
+ }
+ if (data_mask & MAP_VERTEX)
+ {
+ glVertexPointer(3,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_VERTEX], (void*)(base + 0)); // NOTE(review): uses offset 0 instead of mOffsets[TYPE_VERTEX]; relies on the vertex array being first in the layout
+ }
+
+ llglassertok();
+}
+
+#else
+
// virtual (default)
void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const
{
@@ -1344,6 +1527,8 @@ void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const
llglassertok();
}
+#endif
+
void LLVertexBuffer::markDirty(U32 vert_index, U32 vert_count, U32 indices_index, U32 indices_count)
{
// TODO: use GL_APPLE_flush_buffer_range here
diff --git a/indra/llrender/llvertexbuffer.h b/indra/llrender/llvertexbuffer.h
index d1700aa54a..03799af978 100644
--- a/indra/llrender/llvertexbuffer.h
+++ b/indra/llrender/llvertexbuffer.h
@@ -98,7 +98,7 @@ public:
//if offsets is not NULL, its contents will be filled
//with the offset of each vertex component in the buffer,
// indexed by the following enum
- static S32 calcStride(const U32& typemask, S32* offsets = NULL);
+ static S32 calcStride(const U32& typemask, S32* offsets = NULL, S32 num_vertices = 0);
enum {
TYPE_VERTEX,
@@ -192,7 +192,7 @@ public:
S32 getStride() const { return mStride; }
U32 getTypeMask() const { return mTypeMask; }
BOOL hasDataType(S32 type) const { return ((1 << type) & getTypeMask()) ? TRUE : FALSE; }
- S32 getSize() const { return mNumVerts*mStride; }
+ S32 getSize() const;
S32 getIndicesSize() const { return mNumIndices * sizeof(U16); }
U8* getMappedData() const { return mMappedData; }
U8* getMappedIndices() const { return mMappedIndexData; }
@@ -213,6 +213,7 @@ protected:
S32 mRequestedNumVerts; // Number of vertices requested
S32 mRequestedNumIndices; // Number of indices requested
+ ptrdiff_t mAlignedOffset;
S32 mStride;
U32 mTypeMask;
S32 mUsage; // GL usage
@@ -227,7 +228,7 @@ protected:
S32 mOffsets[TYPE_MAX];
BOOL mResized; // if TRUE, client buffer has been resized and GL buffer has not
BOOL mDynamicSize; // if TRUE, buffer has been resized at least once (and should be padded)
-
+
class DirtyRegion
{
public:
diff --git a/indra/newview/lldrawpoolavatar.cpp b/indra/newview/lldrawpoolavatar.cpp
index d1f4be71f5..1e9053239d 100644
--- a/indra/newview/lldrawpoolavatar.cpp
+++ b/indra/newview/lldrawpoolavatar.cpp
@@ -1542,7 +1542,7 @@ LLVertexBufferAvatar::LLVertexBufferAvatar()
void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const
{
- if (sRenderingSkinned)
+/* if (sRenderingSkinned)
{
U8* base = useVBOs() ? NULL : mMappedData;
@@ -1562,7 +1562,7 @@ void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const
set_vertex_clothing_weights(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::AVATAR_CLOTHING], mStride, (LLVector4*)(base + mOffsets[TYPE_CLOTHWEIGHT]));
}
}
- else
+ else*/
{
LLVertexBuffer::setupVertexBuffer(data_mask);
}
diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp
index b8bdbfb2f8..98c0191397 100644
--- a/indra/newview/llpolymesh.cpp
+++ b/indra/newview/llpolymesh.cpp
@@ -140,7 +140,7 @@ void LLPolyMeshSharedData::freeMeshData()
delete [] mDetailTexCoords;
mDetailTexCoords = NULL;
- delete [] mWeights;
+ _mm_free(mWeights);
mWeights = NULL;
}
@@ -230,7 +230,7 @@ BOOL LLPolyMeshSharedData::allocateVertexData( U32 numVertices )
mBaseBinormals = new LLVector3[ numVertices ];
mTexCoords = new LLVector2[ numVertices ];
mDetailTexCoords = new LLVector2[ numVertices ];
- mWeights = new F32[ numVertices ];
+ mWeights = (F32*) _mm_malloc((numVertices*sizeof(F32)+0xF) & ~0xF, 16);
for (i = 0; i < numVertices; i++)
{
mWeights[i] = 0.f;
@@ -717,13 +717,20 @@ LLPolyMesh::LLPolyMesh(LLPolyMeshSharedData *shared_data, LLPolyMesh *reference_
//use aligned vertex data to make LLPolyMesh SSE friendly
mVertexData = (F32*) _mm_malloc(nfloats*4, 16);
int offset = 0;
- mCoords = (LLVector4*)(mVertexData + offset); offset += 4*nverts;
- mNormals = (LLVector4*)(mVertexData + offset); offset += 4*nverts;
- mScaledNormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts;
- mBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts;
- mScaledBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts;
- mTexCoords = (LLVector2*)(mVertexData + offset); offset += 2*nverts;
- mClothingWeights = (LLVector4*)(mVertexData + offset); offset += 4*nverts;
+
+ //all members must be 16-byte aligned except the last 3
+ mCoords = (LLVector4*)(mVertexData + offset); offset += 4*nverts;
+ mNormals = (LLVector4*)(mVertexData + offset); offset += 4*nverts;
+ mClothingWeights = (LLVector4*)(mVertexData + offset); offset += 4*nverts;
+ mTexCoords = (LLVector2*)(mVertexData + offset); offset += 2*nverts;
+
+ // these members don't need to be 16-byte aligned, but the first one might be
+ // read during an aligned memcpy of mTexCoords
+ mScaledNormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts;
+ mBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts;
+ mScaledBinormals = (LLVector3*)(mVertexData + offset); offset += 3*nverts;
+
+
#else
mCoords = new LLVector3[mSharedData->mNumVertices];
mNormals = new LLVector3[mSharedData->mNumVertices];
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
index 236ad98d68..a7e7bfadd6 100644
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -655,6 +655,9 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
//-----------------------------------------------------------------------------
void LLViewerJointMesh::updateFaceSizes(U32 &num_vertices, U32& num_indices, F32 pixel_area)
{
+ //bump num_vertices to next multiple of 4
+ num_vertices = (num_vertices + 0x3) & ~0x3;
+
// Do a pre-alloc pass to determine sizes of data.
if (mMesh && mValid)
{
@@ -677,6 +680,8 @@ static LLFastTimer::DeclareTimer FTM_AVATAR_FACE("Avatar Face");
void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_wind, bool terse_update)
{
+ //IF THIS FUNCTION BREAKS, SEE LLPOLYMESH CONSTRUCTOR AND CHECK ALIGNMENT OF INPUT ARRAYS
+
mFace = face;
if (mFace->mVertexBuffer.isNull())
@@ -684,6 +689,16 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
return;
}
+ LLDrawPool *poolp = mFace->getPool();
+ BOOL hardware_skinning = (poolp && poolp->getVertexShaderLevel() > 0) ? TRUE : FALSE;
+
+ if (!hardware_skinning && terse_update)
+ { //no need to do terse updates if we're doing software vertex skinning
+ // since mMesh is being copied into mVertexBuffer every frame
+ return;
+ }
+
+
LLFastTimer t(FTM_AVATAR_FACE);
LLStrider<LLVector3> verticesp;
@@ -696,108 +711,52 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
// Copy data into the faces from the polymesh data.
if (mMesh && mValid)
{
- if (mMesh->getNumVertices())
+ const U32 num_verts = mMesh->getNumVertices();
+
+ if (num_verts)
{
- stop_glerror();
face->getGeometryAvatar(verticesp, normalsp, tex_coordsp, vertex_weightsp, clothing_weightsp);
- stop_glerror();
face->mVertexBuffer->getIndexStrider(indicesp);
- stop_glerror();
verticesp += mMesh->mFaceVertexOffset;
- tex_coordsp += mMesh->mFaceVertexOffset;
normalsp += mMesh->mFaceVertexOffset;
- vertex_weightsp += mMesh->mFaceVertexOffset;
- clothing_weightsp += mMesh->mFaceVertexOffset;
-
- const U32* __restrict coords = (U32*) mMesh->getCoords();
- const U32* __restrict tex_coords = (U32*) mMesh->getTexCoords();
- const U32* __restrict normals = (U32*) mMesh->getNormals();
- const U32* __restrict weights = (U32*) mMesh->getWeights();
- const U32* __restrict cloth_weights = (U32*) mMesh->getClothingWeights();
-
- const U32 num_verts = mMesh->getNumVertices();
-
- U32 i = 0;
-
- const U32 skip = verticesp.getSkip()/sizeof(U32);
+
+ F32* v = (F32*) verticesp.get();
+ F32* n = (F32*) normalsp.get();
+
+ U32 words = num_verts*4;
- U32* __restrict v = (U32*) verticesp.get();
- U32* __restrict n = (U32*) normalsp.get();
+ LLVector4a::memcpyNonAliased16(v, (F32*) mMesh->getCoords(), words);
+ LLVector4a::memcpyNonAliased16(n, (F32*) mMesh->getNormals(), words);
+
- if (terse_update)
+ if (!terse_update)
{
- for (S32 i = num_verts; i > 0; --i)
- {
- //morph target application only, only update positions and normals
- v[0] = coords[0];
- v[1] = coords[1];
- v[2] = coords[2];
- coords += 4;
- v += skip;
- }
+ vertex_weightsp += mMesh->mFaceVertexOffset;
+ clothing_weightsp += mMesh->mFaceVertexOffset;
+ tex_coordsp += mMesh->mFaceVertexOffset;
+
+ F32* tc = (F32*) tex_coordsp.get();
+ F32* vw = (F32*) vertex_weightsp.get();
+ F32* cw = (F32*) clothing_weightsp.get();
- for (S32 i = num_verts; i > 0; --i)
- {
- n[0] = normals[0];
- n[1] = normals[1];
- n[2] = normals[2];
- normals += 4;
- n += skip;
- }
+ LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), num_verts*2);
+ LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), num_verts);
+ LLVector4a::memcpyNonAliased16(cw, (F32*) mMesh->getClothingWeights(), num_verts*4);
}
- else
- {
-
- U32* __restrict tc = (U32*) tex_coordsp.get();
- U32* __restrict vw = (U32*) vertex_weightsp.get();
- U32* __restrict cw = (U32*) clothing_weightsp.get();
-
- do
- {
- v[0] = coords[0];
- v[1] = coords[1];
- v[2] = coords[2];
- coords += 4;
- v += skip;
-
- tc[0] = *(tex_coords++);
- tc[1] = *(tex_coords++);
- tc += skip;
-
- n[0] = normals[0];
- n[1] = normals[1];
- n[2] = normals[2];
- normals += 4;
- n += skip;
-
- vw[0] = *(weights++);
- vw += skip;
-
- cw[0] = *(cloth_weights++);
- cw[1] = *(cloth_weights++);
- cw[2] = *(cloth_weights++);
- cw[3] = *(cloth_weights++);
- cw += skip;
- }
- while (++i < num_verts);
-
- const U32 idx_count = mMesh->getNumFaces()*3;
- indicesp += mMesh->mFaceIndexOffset;
+ const U32 idx_count = mMesh->getNumFaces()*3;
- U16* __restrict idx = indicesp.get();
- S32* __restrict src_idx = (S32*) mMesh->getFaces();
+ indicesp += mMesh->mFaceIndexOffset;
- i = 0;
+ U16* __restrict idx = indicesp.get();
+ S32* __restrict src_idx = (S32*) mMesh->getFaces();
- const S32 offset = (S32) mMesh->mFaceVertexOffset;
+ const S32 offset = (S32) mMesh->mFaceVertexOffset;
- do
- {
- *(idx++) = *(src_idx++)+offset;
- }
- while (++i < idx_count);
+ for (S32 i = 0; i < idx_count; ++i)
+ {
+ *(idx++) = *(src_idx++)+offset;
}
}
}
@@ -824,50 +783,44 @@ void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
buffer->getVertexStrider(o_vertices, 0);
buffer->getNormalStrider(o_normals, 0);
- //F32 last_weight = F32_MAX;
- LLMatrix4a gBlendMat;
+ F32* __restrict vert = o_vertices[0].mV;
+ F32* __restrict norm = o_normals[0].mV;
const F32* __restrict weights = mMesh->getWeights();
const LLVector4a* __restrict coords = (LLVector4a*) mMesh->getCoords();
const LLVector4a* __restrict normals = (LLVector4a*) mMesh->getNormals();
+ U32 offset = mMesh->mFaceVertexOffset*4;
+ vert += offset;
+ norm += offset;
+
for (U32 index = 0; index < mMesh->getNumVertices(); index++)
{
- U32 bidx = index + mMesh->mFaceVertexOffset;
-
- // blend by first matrix
- F32 w = weights[index];
-
- //LLVector4a coord;
- //coord.load4a(coords[index].mV);
+ // truncates toward zero; equivalent to joint = floorf(weights[index]) only for non-negative weights
+ S32 joint = _mm_cvtt_ss2si(_mm_load_ss(weights+index));
+ F32 w = weights[index] - joint;
- //LLVector4a norm;
- //norm.load4a(normals[index].mV);
+ LLMatrix4a gBlendMat;
- S32 joint = llfloor(w);
- w -= joint;
-
- if (w > 0.f)
+ if (w != 0.f)
{
- // Try to keep all the accesses to the matrix data as close
- // together as possible. This function is a hot spot on the
- // Mac. JC
+ // blend between matrices and apply
gBlendMat.setLerp(gJointMatAligned[joint+0],
gJointMatAligned[joint+1], w);
LLVector4a res;
gBlendMat.affineTransform(coords[index], res);
- o_vertices[bidx].setVec(res[0], res[1], res[2]);
+ res.store4a(vert+index*4);
gBlendMat.rotate(normals[index], res);
- o_normals[bidx].setVec(res[0], res[1], res[2]);
+ res.store4a(norm+index*4);
}
else
{ // No lerp required in this case.
LLVector4a res;
gJointMatAligned[joint].affineTransform(coords[index], res);
- o_vertices[bidx].setVec(res[0], res[1], res[2]);
+ res.store4a(vert+index*4);
gJointMatAligned[joint].rotate(normals[index], res);
- o_normals[bidx].setVec(res[0], res[1], res[2]);
+ res.store4a(norm+index*4);
}
}