本文介绍了 SSE 矢量包装类型与裸 __m128 相比的性能问题及其处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我发现了一篇有趣的文章,其中指出包装类型不可能达到"纯" __m128 类型的性能。嗯,我对此持怀疑态度,所以我下载了项目文件,并编写了一个可比较的测试用例。

结果(令我惊讶的是)包装版本明显更慢。因为我不想空口无凭,测试用例如下:

在第一种情况下,Vec4 只是 __m128 类型的一个简单别名,外加一些运算符:

#include <xmmintrin.h>
#include <emmintrin.h>

using Vec4 = __m128;

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline Vec4& operator+=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_add_ps(va, vb));
};

inline Vec4& operator*=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_mul_ps(va, vb));
};

inline Vec4 operator+(Vec4 va, Vec4 vb)
{
    return _mm_add_ps(va, vb);
};

inline Vec4 operator-(Vec4 va, Vec4 vb)
{
    return _mm_sub_ps(va, vb);
};

inline Vec4 operator*(Vec4 va, Vec4 vb)
{
    return _mm_mul_ps(va, vb);
};

在第二种情况下,Vec4 是对 __m128 的一个轻量级包装。
这不是一个完整的包装,只是一个足以说明问题的简短示例。各个运算函数包装的是完全相同的内部函数(intrinsics),唯一的区别是(因为 16 字节对齐不能应用于函数参数)它们以常量引用的方式接收 Vec4:

#include <xmmintrin.h>
#include <emmintrin.h>

struct Vec4
{
    __m128 simd;

    inline Vec4() = default;
    inline Vec4(const Vec4&) = default;
    inline Vec4& operator=(const Vec4&) = default;

    inline Vec4(__m128 s)
        : simd(s)
    {}

    inline operator __m128() const
    {
        return simd;
    }

    inline operator __m128&()
    {
        return simd;
    }
};

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline Vec4 VAdd(const Vec4 &va, const Vec4 &vb)
{
    return _mm_add_ps(va, vb);
    //return _mm_add_ps(va.simd, vb.simd); // makes no difference
};

inline Vec4 VSub(const Vec4 &va, const Vec4 &vb)
{
    return _mm_sub_ps(va, vb);
    //return _mm_sub_ps(va.simd, vb.simd); // makes no difference
};

inline Vec4 VMul(const Vec4 &va, const Vec4 &vb)
{
    return _mm_mul_ps(va, vb);
    //return _mm_mul_ps(va.simd, vb.simd); // makes no difference
};

下面是测试内核,它在使用不同版本的 Vec4 时表现出不同的性能:

#include <xmmintrin.h>
#include <emmintrin.h>

struct EQSTATE
{
    // Filter #1 (Low band)

    Vec4  lf;       // Frequency
    Vec4  f1p0;     // Poles ...
    Vec4  f1p1;
    Vec4  f1p2;
    Vec4  f1p3;

    // Filter #2 (High band)

    Vec4  hf;       // Frequency
    Vec4  f2p0;     // Poles ...
    Vec4  f2p1;
    Vec4  f2p2;
    Vec4  f2p3;

    // Sample history buffer

    Vec4  sdm1;     // Sample data minus 1
    Vec4  sdm2;     //                   2
    Vec4  sdm3;     //                   3

    // Gain Controls

    Vec4  lg;       // low  gain
    Vec4  mg;       // mid  gain
    Vec4  hg;       // high gain
};

static float vsaf = (1.0f / 4294967295.0f);   // Very small amount (Denormal Fix)
static Vec4 vsa = VLoad(vsaf);

Vec4 TestEQ(EQSTATE* es, Vec4& sample)
{
    // Locals

    Vec4  l, m, h;      // Low / Mid / High - Sample Values

    // Filter #1 (lowpass)

    es->f1p0 += (es->lf * (sample - es->f1p0)) + vsa;
    //es->f1p0 = VAdd(es->f1p0, VAdd(VMul(es->lf, VSub(sample, es->f1p0)), vsa));

    es->f1p1 += (es->lf * (es->f1p0 - es->f1p1));
    //es->f1p1 = VAdd(es->f1p1, VMul(es->lf, VSub(es->f1p0, es->f1p1)));

    es->f1p2 += (es->lf * (es->f1p1 - es->f1p2));
    //es->f1p2 = VAdd(es->f1p2, VMul(es->lf, VSub(es->f1p1, es->f1p2)));

    es->f1p3 += (es->lf * (es->f1p2 - es->f1p3));
    //es->f1p3 = VAdd(es->f1p3, VMul(es->lf, VSub(es->f1p2, es->f1p3)));

    l = es->f1p3;

    // Filter #2 (highpass)

    es->f2p0 += (es->hf * (sample - es->f2p0)) + vsa;
    //es->f2p0 = VAdd(es->f2p0, VAdd(VMul(es->hf, VSub(sample, es->f2p0)), vsa));

    es->f2p1 += (es->hf * (es->f2p0 - es->f2p1));
    //es->f2p1 = VAdd(es->f2p1, VMul(es->hf, VSub(es->f2p0, es->f2p1)));

    es->f2p2 += (es->hf * (es->f2p1 - es->f2p2));
    //es->f2p2 = VAdd(es->f2p2, VMul(es->hf, VSub(es->f2p1, es->f2p2)));

    es->f2p3 += (es->hf * (es->f2p2 - es->f2p3));
    //es->f2p3 = VAdd(es->f2p3, VMul(es->hf, VSub(es->f2p2, es->f2p3)));

    h = es->sdm3 - es->f2p3;
    //h = VSub(es->sdm3, es->f2p3);

    // Calculate midrange (signal - (low + high))

    m = es->sdm3 - (h + l);
    //m = VSub(es->sdm3, VAdd(h, l));

    // Scale, combine and store

    l *= es->lg;
    m *= es->mg;
    h *= es->hg;

    //l = VMul(l, es->lg);
    //m = VMul(m, es->mg);
    //h = VMul(h, es->hg);

    // Shuffle history buffer

    es->sdm3 = es->sdm2;
    es->sdm2 = es->sdm1;
    es->sdm1 = sample;

    // Return result

    return (l + m + h);
    //return (VAdd(l, VAdd(m, h)));
}

// make these globals to enforce the function call
static Vec4 sample[1024], result[1024];
static EQSTATE es;

#include <chrono>
#include <iostream>

int main()
{
    auto t0 = std::chrono::high_resolution_clock::now();

    for (int ii = 0; ii < 1024; ii++)
    {
        result[ii] = TestEQ(&es, sample[ii]);
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    std::cout << "time: " << t << '\n';

    std::cin.get();

    return 0;
}


可运行代码的链接




MSVC 2015 为第 1 版生成的汇编

 ; COMDAT?TestEQ @@ YA?AT__m128 @@ PAUEQSTATE @@ AAT1 @@ž
_TEXT段
?TestEQ @@ YA AT__m128 @@ PAUEQSTATE @@ AAT1 @@žPROC; TestEQ,COMDAT
; _es $死$ = ECX
; _Sample $ = EDX
    vmovaps XMM0,XMMWORD PTR [EDX]
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 16
    vmovaps XMM2,XMMWORD PTR?ES @@ 3UEQSTATE @@一
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?VSA @@ 3T__m128 @@一
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 16
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 16,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 32
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 32
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 32,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 48
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 48
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 48,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 64
    vmulps XMM0,XMM0,XMM2
    vaddps XMM4,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 64
    vmovaps XMM2,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 80
    vmovaps将xmm1,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 192
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 64,XMM4
    vmovaps XMM0,XMMWORD PTR [EDX]
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 96
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?VSA @@ 3T__m128 @@一
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 96
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 96,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 112
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 112
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 112,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 128
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 128
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 128,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 144
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 144
    vsubps XMM2,将xmm1,XMM0
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 144,XMM0
    vmovaps XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 176
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 192,XMM0
    vmovaps XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 160
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 176,XMM0
    vmovaps XMM0,XMMWORD PTR [EDX]
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 160,XMM0
    vaddps XMM0,XMM4,XMM2
    vsubps XMM0,xmm1中,XMM0
    vmulps将xmm1,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 224
    vmulps XMM0,XMM2,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 240
    vaddps将xmm1,xmm1中,XMM0
    vmulps XMM0,XMM4,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 208
    vaddps XMM0,xmm1中,XMM0
    RET 0
?TestEQ @@ YA AT__m128 @@ PAUEQSTATE @@ AAT1 @@žENDP?; TestEQ

MSVC 2015 为第 2 版生成的汇编

TestEQ @@ YA AUVec4 @ VMATH @@ PAUEQSTATE @@ AAU12 @@žPROC; TestEQ,COMDAT
; ___ $ ReturnUdt $ = ECX
; _es $死$ = EDX
    push    ebx
    mov     ebx, esp
    sub     esp, 8
    and     esp, -8                 ; fffffff8H
    add     esp, 4
    push    ebp
    mov     ebp, DWORD PTR [ebx+4]
    mov     eax, DWORD PTR _sample$[ebx]
    vmovaps XMM2,XMMWORD PTR?ES @@ 3UEQSTATE @@一
    vmovaps将xmm1,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 192
    MOV DWORD PTR [ESP + 4],EBP
    vmovaps XMM0,XMMWORD PTR [EAX]
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 16
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?VSA @@ 3UVec4 @ VMATH @@一
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 16
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 16,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 32
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 32
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 32,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 48
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 48
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 48,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 64
    vmulps XMM0,XMM0,XMM2
    vaddps XMM4,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 64
    vmovaps XMM2,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 80
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 64,XMM4
    vmovaps XMM0,XMMWORD PTR [EAX]
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 96
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?VSA @@ 3UVec4 @ VMATH @@一
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 96
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 96,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 112
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 112
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 112,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 128
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 128
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 128,XMM0
    vsubps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 144
    vmulps XMM0,XMM0,XMM2
    vaddps XMM0,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 144
    vsubps XMM2,将xmm1,XMM0
    vmovaps XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 144,XMM0
    vaddps XMM0,XMM2,XMM4
    vsubps XMM0,xmm1中,XMM0
    vmulps将xmm1,XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 224
    vmovdqu XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 176
    vmovdqu XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 192,XMM0
    vmovdqu XMM0,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 160
    vmovdqu XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 176,XMM0
    vmovdqu XMM0,XMMWORD PTR [EAX]
    vmovdqu XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 160,XMM0
    vmulps XMM0,XMM4,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 208
    vaddps将xmm1,XMM0,xmm1中
    vmulps XMM0,XMM2,XMMWORD PTR?ES @@ 3UEQSTATE @@ A + 240
    vaddps XMM0,xmm1中,XMM0
    vmovaps XMMWORD PTR [ECX],XMM0
    mov     eax, ecx
    pop     ebp
    mov     esp, ebx
    pop     ebx
    ret     0
?TestEQ @@ YA AUVec4 @ VMATH @@ PAUEQSTATE @@ AAU12 @@žENDP?; TestEQ

第 2 版生成的汇编明显更长,也更慢。这个问题并不仅限于 Visual Studio,因为 Clang 3.8 也产生了类似的性能结果。


Clang 3.8 为第 1 版生成的汇编

 ?TestEQ @@ YAT__m128 @@ PAUEQSTATE @@ AAT1 @@ Z:#@?\\ 01 TestEQ @@ YAT__m128 @@ PAUEQSTATE @@ AAT1 @@ Z
Lfunc_begin0:
Ltmp0:
#BB#0:#%进入
    MOVL 8(%ESP),EAX%
    MOVL 4(%尤),%ecx中
    vmovaps _vsa,%XMM0
    vmovaps(ECX%),%将xmm1
    vmovaps 16(%ECX),%XMM2
    vmovaps(%EAX),%XMM3
    vsubps%XMM2,XMM3%,%XMM3
    vmulps%XMM3,%将xmm1,%XMM3
    vaddps%XMM3,%XMM0,%XMM3
    vaddps%XMM3,XMM2%,%XMM2
    vmovaps%XMM2,16(%ECX)
    vmovaps 32(%ECX),%XMM3
    vsubps%XMM3,XMM2%,%XMM2
    vmulps%XMM2,%将xmm1,%XMM2
    vaddps%XMM2,XMM3%,%XMM2
    vmovaps%XMM2,32(%ECX)
    vmovaps 48(%ECX),%XMM3
    vsubps%XMM3,XMM2%,%XMM2
    vmulps%XMM2,%将xmm1,%XMM2
    vaddps%XMM2,XMM3%,%XMM2
    vmovaps%XMM2,48(%ECX)
    vmovaps 64(%ECX),%XMM3
    vsubps%XMM3,XMM2%,%XMM2
    vmulps%XMM2,%将xmm1,xmm1中的%
    vaddps%将xmm1,%XMM3,xmm1中的%
    vmovaps%将xmm1,64(%ECX)
    vmovaps 80(%ECX),%XMM2
    vmovaps 96(%ECX),%XMM3
    vmovaps(%EAX),%XMM4
    vsubps%XMM3,%XMM4,%XMM4
    vmulps%XMM4,%XMM2,XMM4%
    vaddps%XMM4,%XMM0,%XMM0
    vaddps%XMM0,%XMM3,%XMM0
    vmovaps%XMM0,96(%ECX)
    vmovaps 112(%ECX),%XMM3
    vsubps%XMM3,%XMM0,%XMM0
    vmulps%XMM0,%XMM2,%XMM0
    vaddps%XMM0,%XMM3,%XMM0
    vmovaps%XMM0,112(ECX%)
    vmovaps 128(%ECX),%XMM3
    vsubps%XMM3,%XMM0,%XMM0
    vmulps%XMM0,%XMM2,%XMM0
    vaddps%XMM0,%XMM3,%XMM0
    vmovaps%XMM0,128(%ECX)
    vmovaps 144(%ECX),%XMM3
    vsubps%XMM3,%XMM0,%XMM0
    vmulps%XMM0,%XMM2,%XMM0
    vaddps%XMM0,%XMM3,%XMM0
    vmovaps%XMM0,144(ECX%)
    vmovaps 192(ECX%),%XMM2
    vsubps%XMM0,%XMM2,%XMM0
    vaddps%XMM0,xmm1中的%,%XMM3
    vsubps%XMM3,XMM2%,%XMM2
    vmulps 208(ECX%),%将xmm1,xmm1中的%
    vmulps 224(ECX%),%XMM2,%XMM2
    vmulps 240(%ECX),%XMM0,%XMM0
    vmovaps 176(ECX%),%XMM3
    vmovaps%XMM3,192(ECX%)
    vmovaps 160(%ECX),%XMM3
    vmovaps%XMM3,176(ECX%)
    vmovaps(%EAX),%XMM3
    vmovaps%XMM3,160(ECX%)
    vaddps%XMM2,%XMM0,%XMM0
    vaddps%XMM0,xmm1中的%,%XMM0
    RETL
Lfunc_end0:

Clang 3.8 为第 2 版生成的汇编

 ?TestEQ @@ YA AUVec4 @@ PAUEQSTATE @@ AAU1 @@ Z:?#@\\ 01 TestEQ @@ YA AUVec4 @@ PAUEQSTATE @@ AAU1 @ @Z
Lfunc_begin0:
Ltmp0:
#BB#0:#%进入
    MOVL 12(%尤),%ecx中
    MOVL 8(%ESP),EDX%
    vmovaps(%EDX),%XMM0
    vmovaps 16(%EDX),xmm1中的%
    vmovaps(ECX%),%XMM2
    vsubps%将xmm1,%XMM2,%XMM2
    vmulps%XMM0,%XMM2,%XMM2
    vaddps _vsa,%XMM2,%XMM2
    vaddps%XMM2,%将xmm1,xmm1中的%
    vmovaps%将xmm1,16​​(%EDX)
    vmovaps 32(%EDX),%XMM2
    vsubps%XMM2,%将xmm1,xmm1中的%
    vmulps%XMM0,xmm1中的%,%将xmm1
    vaddps%将xmm1,%XMM2,xmm1中的%
    vmovaps%将xmm1,32(%EDX)
    vmovaps 48(%EDX),%XMM2
    vsubps%XMM2,%将xmm1,xmm1中的%
    vmulps%XMM0,xmm1中的%,%将xmm1
    vaddps%将xmm1,%XMM2,xmm1中的%
    vmovaps%将xmm1,48(%EDX)
    vmovaps 64(%EDX),%XMM2
    vsubps%XMM2,%将xmm1,xmm1中的%
    vmulps%XMM0,xmm1中的%,%XMM0
    vaddps%XMM0,%XMM2,%XMM0
    vmovaps%XMM0,64(%EDX)
    vmovaps 80(%EDX),xmm1中的%
    vmovaps 96(%EDX),%XMM2
    vmovaps(%ECX),%XMM3
    vsubps%XMM2,XMM3%,%XMM3
    vmulps%将xmm1,%XMM3,%XMM3
    vaddps _vsa,%XMM3,%XMM3
    vaddps%XMM3,XMM2%,%XMM2
    vmovaps%XMM2,96(%EDX)
    vmovaps 112(%EDX),%XMM3
    vsubps%XMM3,XMM2%,%XMM2
    vmulps%将xmm1,%XMM2,%XMM2
    vaddps%XMM2,XMM3%,%XMM2
    vmovaps%XMM2,112(%EDX)
    vmovaps 128(%EDX),%XMM3
    vsubps%XMM3,XMM2%,%XMM2
    vmulps%将xmm1,%XMM2,%XMM2
    vaddps%XMM2,XMM3%,%XMM2
    vmovaps%XMM2,128(%EDX)
    vmovaps 144(%EDX),%XMM3
    vsubps%XMM3,XMM2%,%XMM2
    vmulps%将xmm1,%XMM2,xmm1中的%
    vaddps%将xmm1,%XMM3,xmm1中的%
    vmovaps%将xmm1,144(%EDX)
    vmovaps 192(%EDX),%XMM2
    vsubps%将xmm1,%XMM2,xmm1中的%
    vaddps%将xmm1,%XMM0,%XMM3
    vsubps%XMM3,XMM2%,%XMM2
    vmulps 208(%EDX),%XMM0,%XMM0
    vmulps 224(%EDX),%XMM2,%XMM2
    MOVL 4(%ESP),EAX%
    vmulps 240(%EDX),%将xmm1,xmm1中的%
    vmovaps 176(%EDX),%XMM3
    vmovaps%XMM3,192(%EDX)
    vmovaps 160(%EDX),%XMM3
    vmovaps%XMM3,176(%EDX)
    vmovaps(%ECX),%XMM3
    vmovaps%XMM3,160(%EDX)
    vaddps%XMM2,%XMM0,%XMM0
    vaddps%XMM0,xmm1中的%,%XMM0
    vmovaps%XMM0,(%EAX)
    RETL
Lfunc_end0:

虽然指令数量相同,但第 1 版仍然快约 50%。


我试图找出问题的原因,但没有成功。第二个 MSVC 汇编中有一些可疑之处,比如那些难看的 vmovdqu 指令。构造函数、拷贝赋值运算符以及按引用传递也可能会不必要地把数据从 SSE 寄存器搬回内存,但我为解决或准确定位这个问题所做的所有努力都没有成功。

我真的不认为这样一个简单的包装类型无法达到与裸 __m128 相同的性能;无论开销是由什么引起的,它都应该是可以消除的。

那么,这里到底发生了什么?


解决方案

事实证明,问题并不出在用户定义的结构体 Vec4 上,
而是与 x86 的调用约定密切相关。

Visual C++ 中默认的 x86 调用约定是 __cdecl,它会将参数按相反的顺序(从右到左)压入堆栈。

Now this is a problem, since Vec4 should be kept and passed in an XMM register. But let's see what is actually happening.


1st case

In the first case Vec4 is a simple type alias of __m128.

using Vec4 = __m128;
/* ... */
Vec4 TestEQ(EQSTATE* es, Vec4 &sample) { ... }

The generated function header of TestEQ in assembly is

?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC      ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...

Nice.


2nd case

In the second case Vec4 is not an alias of __m128; it is now a user-defined type.

Here I investigate compilation for both x86 and x64 platform.

x86 (32-bit compilation)

Since __cdecl (which is the default calling convention in x86) doesn't allow passing aligned values to functions (doing so would emit Error C2719: 'sample': formal parameter with requested alignment of 16 won't be aligned), we pass it by const reference.

struct Vec4{ __m128 simd; /* ... */ };
/* ... */
Vec4 TestEQ(EQSTATE* es, const Vec4 &sample) { ... }

which generates the function header for TestEQ as

?TestEQ@@YA?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8H
    add esp, 4
    push    ebp
    mov ebp, DWORD PTR [ebx+4]
    mov eax, DWORD PTR _sample$[ebx]
    ...

This is not as simple as the one in the 1st case. The arguments are moved to the stack. There are also some additional mov instructions between the first few SSE instructions, which are not listed here. Overall, these instructions are enough to noticeably hurt performance.

x64 (64-bit compilation)

Windows on x64 uses a different calling convention as part of the x64 Application Binary Interface (ABI).

This convention tries to keep data in registers where possible, so that floating-point data is kept in XMM registers.

From MSDN Overview of x64 Calling Conventions:

From Wikipedia page for x86-64 calling conventions

So the second case in x64 mode generates the function header for TestEQ as

?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...

This is exactly the same as the 1st case!


Solution

For x86 mode the presented behavior should be clearly fixed.

The simplest solution is to inline the function. Although this is just a hint that the compiler is free to ignore, you can tell the compiler to always inline the function. However, sometimes this is not desired because of the function size or for some other reason.

Fortunately Microsoft introduced the __vectorcall convention in Visual Studio 2013 and above (available in both x86 and x64 mode). This is very similar to the default Windows x64 calling convention, but with more utilizable registers.

Let's rewrite the 2nd case with __vectorcall:

Vec4 __vectorcall TestEQ(EQSTATE* es, const Vec4 &sample) { ... }

Now the generated assembly function header for TestEQ is

?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC        ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...

which is finally the same as the 1st case and the 2nd case in x64.

As Peter Cordes pointed out, to take the full advantage of __vectorcall, the Vec4 argument should be passed by value, instead of constant reference. To do this the passed type should meet some requirements, like it must be trivially copy constructible (no user defined copy constructors) and shouldn't contain any union. More info in the comments below and here.

Final words

It looks like MSVC under the hood automatically applies the __vectorcall convention as an optimization when it detects an __m128 argument. Otherwise it uses the default calling convention __cdecl (you can change this behavior by compiler options).

People told me in the comments that they didn't see much difference between the GCC- and Clang-generated assembly of the two cases. This is because these compilers, with the optimization flag -O2, simply inline the TestEQ function into the test loop body. It is also possible that they are more clever than MSVC and perform better optimization of the function call.

这篇关于 SSE 矢量包装类型与裸 __m128 性能对比的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!

08-29 09:00