// Computes the Euclidean length of the 4-component float vector stored at
// `this`: f = sqrt(x*x + y*y + z*z + w*w), using SSE via MSVC inline assembly.
// NOTE(review): assumes the object begins with four contiguous floats
// (x, y, z, w) -- confirm against the class layout.
// Lane notation in the comments below is high-to-low: f3,f2,f1,f0.
float f = 0.0f;                     // overwritten by the movss at the end of the block
__asm
{
    mov     esi, this               ; esi = address of the vector
    movups  xmm0, [esi]             ; unaligned load of 4 floats: f3,f2,f1,f0
    mulps   xmm0, xmm0              ; square every lane (vector times itself)
    movaps  xmm1, xmm0              ; keep a copy of the squares
    shufps  xmm1, xmm1, 4Eh         ; swap 64-bit halves: f1,f0,f3,f2
    addps   xmm0, xmm1              ; pairwise sums: f3+f1,f2+f0,f1+f3,f0+f2
    movaps  xmm1, xmm0              ; copy the pairwise sums
    shufps  xmm1, xmm1, 11h         ; swap neighbours: f0+f2,f1+f3,f0+f2,f1+f3
    addps   xmm0, xmm1              ; every lane now holds f0+f1+f2+f3
    sqrtss  xmm0, xmm0              ; square root of the lowest lane only
    movss   f, xmm0                 ; store the lowest lane (the length) into f
}

先将 x、y、z、w 各自平方,再通过两次 shufps 与 addps 把四个分量累加,使 xmm 寄存器的每个分量都等于 x²+y²+z²+w²;最后用 sqrtss 对最低 32 位的分量开方,并用 movss 取出 xmm 寄存器的低 32 位作为结果。

05-11 14:05