问题描述
您好!这是我在这里发表的第一篇文章,如果我遗漏或收集太多信息,请原谅我。
我正在尝试使用SSE卸载3D模型上的顶点变换(浮点)加快绘制模型所花费的时间。主要功能如下:
__ m128 m4x4v_colSSE(const __m128 cols [4],const __m128 v)
{
__m128 u1 = _mm_shuffle_ps( v,v,_MM_SHUFFLE(0,0,0,0));
__m128 u2 = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,1,1,1));
__m128 u3 = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,2,2,2));
__m128 u4 = _mm_shuffle_ps(v,v,_MM_SHUFFLE(3,3,3,3));
__m128 prod1 = _mm_mul_ps(u1,cols [0]);
__ m128 prod2 = _mm_mul_ps(u2,cols [1]); //< ---此行产生错误!异常抛出0x0012F167
__m128 prod3 = _mm_mul_ps(u3,cols [2]);
__m128 prod4 = _mm_mul_ps(u4,cols [3]);
返回_mm_add_ps(_mm_add_ps(prod1,prod2),_ mm_add_ps(prod3,prod4));
}
现在,当执行mat4转换时,它会从下一个函数调用:
void md5_transform_vertices_sse(MD5mesh * msh,epi :: mat4_c * posemats,basevert * dst)
{
epi :: mat4_c * mats = posemats + 1;
MD5vertex * vs = msh-> verts;
MD5weight * ws = msh->权重;
int i,j;
for(i = 0; i< msh-> vertcnt; i ++)
{
MD5vertex * v = vs + i;
basevert * cv = dst + i;
__m128 pos = _mm_set1_ps(0);
__m128 norm = _mm_set1_ps(0);
MD5weight * w = ws + v-> firstweight;
for(j = 0; j< v-> weightcnt; j ++)
{
__m128 wpos = _mm_setr_ps(w [j] .pos [0],w [j] .pos [1],w [j] .pos [2],1);
__m128 wnorm = _mm_setr_ps(w [j] .normal [0],w [j] .normal [1],w [j] .normal [2],0);
wpos = m4x4v_colSSE((__ m128 *)& mats [w [j] .jointidx],wpos);
wnorm = m4x4v_colSSE((__ m128 *)& mats [w [j] .jointidx],wnorm);
__m128 weight = _mm_set1_ps(w [j] .weight);
pos = _mm_mul_ps(_mm_add_ps(wpos,pos),weight);
norm = _mm_mul_ps(_mm_add_ps(wnorm,norm),weight);
}
_mm_store_ps((float *)& cv-> pos,pos);
_mm_store_ps((float *)& cv-> norm,norm);
}
}
当游戏引擎加载并解码MD5模型后,我立即崩溃:
EDGE.exe中0x0045F167处抛出异常:
0xC0000005:访问冲突读取位置0xFFFFFFFF。
(_m128 m4x4v_colSSE()失败)
所以,我想要收集的是这里存在某种对齐问题(也许我错了),但我似乎无法解决导致错误的原因。为了记录,引擎被编译(在VS2017中)与/ O2和/ Ob2优化,
并且它被设置为生成SSE2。
我写了md5_transform_vertices_sse ()函数代替普通的非SSE2版本(供参考):
void md5_transform_vertices(MD5mesh * msh,epi :: mat4_c * posemats,basevert * dst)
{
epi :: mat4_c * mats = posemats + 1;
MD5vertex * vs = msh-> verts;
MD5weight * ws = msh->权重;
int i,j;
for(i = 0; i< msh-> vertcnt; i ++)
{
MD5vertex * v = vs + i;
basevert * cv = dst + i;
cv-> pos = epi :: vec3_c(0,0,0);
cv-> norm = epi :: vec3_c(0,0,0);
cv-> tan = epi :: vec3_c(0,0,0);
cv-> uv = epi :: vec2_c(v-> uv [0],v-> uv [1]);
MD5weight * w = ws + v-> firstweight;
for(j = 0; j< v-> weightcnt; j ++)
{
#if PREMULTIPLY
cv-> pos + =(mats [w [j ] .jointidx] * epi :: vec4_c(w [j] .pos,w [j] .weight))。Get3D();
cv-> norm + =(mats [w [j] .jointidx] * epi :: vec4_c(w [j] .normal,0))。Get3D();
cv-> tan + =(mats [w [j] .jointidx] * epi :: vec4_c(w [j] .tan,0))。Get3D();
#else
cv-> pos + = mats [w [j] .jointidx] * epi :: vec3_c(w [j] .pos)* w [j] .weight;
cv-> norm + =(mats [w [j] .jointidx] * epi :: vec4_c(w [j] .normal,0))。Get3D()* w [j] .weight;
cv-> tan + =(mats [w [j] .jointidx] * epi :: vec4_c(w [j] .tan,0))。Get3D()* w [j] .weight;
#endif
}
}
}
尽可能看,这些计算对于具有多个/ tris / etc的统一模型来说有点贵,因此尝试SSE的主要原因是加速这些计算。引擎是用OpenGL编写的,基于原始的DOOM引擎。
任何帮助,解释或建议我都会非常感激。
再次感谢您,
-Coraline
Hello! This is my first post here so please forgive me if I leave out or put in too much information.
I'm trying to use SSE to offload vertex transformation on 3D models (floating point) to speed up the amount of time it has to spend drawing the models. The primary function is below:
__m128 m4x4v_colSSE(const __m128 cols[4], const __m128 v) { __m128 u1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); __m128 u2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)); __m128 u3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)); __m128 u4 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)); __m128 prod1 = _mm_mul_ps(u1, cols[0]); __m128 prod2 = _mm_mul_ps(u2, cols[1]); //<--- this line produces the error! Exception thrown at 0x0012F167 __m128 prod3 = _mm_mul_ps(u3, cols[2]); __m128 prod4 = _mm_mul_ps(u4, cols[3]); return _mm_add_ps(_mm_add_ps(prod1, prod2), _mm_add_ps(prod3, prod4)); }
Now, it gets called from the very next function, when performing mat4 transformation:
void md5_transform_vertices_sse(MD5mesh *msh, epi::mat4_c *posemats, basevert *dst) { epi::mat4_c *mats = posemats + 1; MD5vertex *vs = msh->verts; MD5weight *ws = msh->weights; int i,j; for(i = 0; i < msh->vertcnt; i++) { MD5vertex *v = vs + i; basevert *cv = dst + i; __m128 pos = _mm_set1_ps(0); __m128 norm = _mm_set1_ps(0); MD5weight *w = ws + v->firstweight;
for(j = 0; j < v->weightcnt; j++) { __m128 wpos = _mm_setr_ps(w[j].pos[0], w[j].pos[1], w[j].pos[2], 1); __m128 wnorm = _mm_setr_ps(w[j].normal[0],w[j].normal[1], w[j].normal[2], 0); wpos = m4x4v_colSSE((__m128*)&mats[w[j].jointidx], wpos); wnorm = m4x4v_colSSE((__m128*)&mats[w[j].jointidx], wnorm); __m128 weight = _mm_set1_ps(w[j].weight); pos = _mm_mul_ps(_mm_add_ps(wpos,pos),weight); norm = _mm_mul_ps(_mm_add_ps(wnorm,norm),weight); } _mm_store_ps((float*)&cv->pos, pos); _mm_store_ps((float*)&cv->norm, norm); } }
When the game engine loads and after decoding the MD5 model, I get an immediate crash:
Exception thrown at 0x0045F167 in EDGE.exe:
0xC0000005: Access violation reading location 0xFFFFFFFF.
(fails in _m128 m4x4v_colSSE())
So, what I'm trying to gather is that there is some sort of alignment issues here (and maybe I'm wrong), but I can't seem to wrap my head around what is causing the error. For the record, the engine is compiled (in VS2017) with /O2 and /Ob2 optimizations, and it is set to generate SSE2.
I have written the md5_transform_vertices_sse() function in place of the normal, non-SSE2 version (for reference):
void md5_transform_vertices(MD5mesh *msh, epi::mat4_c *posemats, basevert *dst) { epi::mat4_c *mats = posemats + 1; MD5vertex *vs = msh->verts; MD5weight *ws = msh->weights; int i,j; for(i = 0; i < msh->vertcnt; i++) { MD5vertex *v = vs + i; basevert *cv = dst + i; cv->pos = epi::vec3_c(0,0,0); cv->norm = epi::vec3_c(0,0,0); cv->tan = epi::vec3_c(0,0,0); cv->uv = epi::vec2_c(v->uv[0],v->uv[1]); MD5weight *w = ws + v->firstweight; for(j = 0; j < v->weightcnt; j++) { #if PREMULTIPLY cv->pos += (mats[w[j].jointidx] * epi::vec4_c(w[j].pos,w[j].weight)).Get3D(); cv->norm += (mats[w[j].jointidx] * epi::vec4_c(w[j].normal,0)).Get3D(); cv->tan += (mats[w[j].jointidx] * epi::vec4_c(w[j].tan,0)).Get3D(); #else cv->pos += mats[w[j].jointidx] * epi::vec3_c(w[j].pos) * w[j].weight; cv->norm += (mats[w[j].jointidx] * epi::vec4_c(w[j].normal,0)).Get3D() * w[j].weight; cv->tan += (mats[w[j].jointidx] * epi::vec4_c(w[j].tan,0)).Get3D() * w[j].weight; #endif } } }
As you can see, these calculations are a tad expensive for unified models with several pieces/tris/etc, so the primary reason to attempt SSE was to speed up these calculations. The engine is written in OpenGL and is based on the original DOOM engine.
Any help, explanations, or suggestions I would be most thankful.
Thank you again,
-Coraline
这篇关于_m128 SSE崩溃(OpenGL / C ++)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!