I have a function that takes a character, checks it, and returns another character depending on the character it received.
I use a switch to check the supplied character and return the one we want, but I need more speed, so I also wrote an SSE2 version.
My SSE2 function is 1.5x slower than the switch function. Why? What is slow about my SSE2 function, and how does gcc -O3 implement the switch so that it is this fast?
char switch_func(char c) {
    switch (c) {
    case '0': return 0x40;
    case '1': return 0x41;
    case '2': return 0x42;
    case '3': return 0x43;
    case '4': return 0x44;
    case '5': return 0x45;
    case '6': return 0x46;
    case '7': return 0x47;
    case '8': return 0x48;
    case '9': return 0x49;
    case 'a': return 0x4a;
    case 'b': return 0x4b;
    case 'c': return 0x4c;
    case 'd': return 0x4d;
    case 'e': return 0x4e;
    case 'f': return 0x4f;
    default:  return 0x00;
    }
}
And the SSE2 function:
#include <emmintrin.h>   /* SSE2 intrinsics */

char SSE2_func(char c) {
    __m128i vec0 = _mm_set_epi8('f','e','d','c','b','a','9','8',
                                '7','6','5','4','3','2','1','0');
    __m128i vec1 = _mm_set1_epi8(c);
    static const char list[] = {
        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
        0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f
    };
    vec1 = _mm_cmpeq_epi8(vec0, vec1);          /* compare to find (c) in the (vec0) list */
    int x;
    if ((x = _mm_movemask_epi8(vec1)) != 0) {
        if ((x = __builtin_ctz(x)) < 16) {      /* x is the position of (c) in list[] */
            return list[x];
        }
    }
    return 0x00;
}
GCC compiler flags: (-O3 -msse2)
Best answer
Most compilers will convert your switch into a lookup table or jump table, roughly as if you had written code like this:
char lut_func(char c) {
    static const char lut[256] = {
        ['0']=0x40, ['1']=0x41, ['2']=0x42, ['3']=0x43,
        ['4']=0x44, ['5']=0x45, ['6']=0x46, ['7']=0x47,
        ['8']=0x48, ['9']=0x49, ['a']=0x4a, ['b']=0x4b,
        ['c']=0x4c, ['d']=0x4d, ['e']=0x4e, ['f']=0x4f,
        /* everything else is set to 0 automatically */
    };
    return lut[(unsigned char)c];
}
The only problems with that are:
it can't be vectorized
the relevant data ('0'-'9', 'a'-'f') spans two 64-byte data cache lines
You can fix the cache-line misses by aligning and offsetting the data correctly (the compiler may do this for you if you profile your code), for example:
char lut_func(char c) {
    static const char __attribute__((aligned(64))) lut_data[256+16] = {
        ['0'+16]=0x40, ['1'+16]=0x41, ['2'+16]=0x42, ['3'+16]=0x43,
        ['4'+16]=0x44, ['5'+16]=0x45, ['6'+16]=0x46, ['7'+16]=0x47,
        ['8'+16]=0x48, ['9'+16]=0x49, ['a'+16]=0x4a, ['b'+16]=0x4b,
        ['c'+16]=0x4c, ['d'+16]=0x4d, ['e'+16]=0x4e, ['f'+16]=0x4f,
        /* everything else is set to 0 automatically */
    };
    const char *lut = lut_data + 16;   /* offset so the hot entries share a single cache line */
    return lut[(unsigned char)c];
}
It's hard to say how much this will help, since neither the makeup of the data nor a benchmark was included.
The hand-written SSE2 code (clever as it is) unfortunately mixes in non-SSE2 code, which slows it down and makes auto-vectorization difficult, especially when you are limited to SSE2: the __builtin_ctz, the if, and the char array access. When the data is already "hot" in cache, this is less efficient than a single table access. If the SSE2 version isn't called very often it may still be worth using, but in that case it hardly needs optimizing anyway. If you can access the data sequentially, you can use vector extensions to get SIMD code like this:
//this vector extension syntax requires gcc or clang versions 5+
typedef __INT8_TYPE__ i8x16 __attribute__ ((__vector_size__ (16), aligned(16), __may_alias__));
i8x16 vec_func(i8x16 c) {
    i8x16 is09 = (c >= '0') & (c <= '9');   /* lanes holding '0'..'9' become 0xff */
    i8x16 isaf = (c >= 'a') & (c <= 'f');   /* lanes holding 'a'..'f' become 0xff */
    return (c & (is09 | isaf)) + (16 & is09) - (23 & isaf);   /* '0'->0x40 ... 'f'->0x4f, else 0 */
}
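As a rough, untested sketch of how the 16-lane function above could be driven over a sequentially accessed buffer (convert_buf is a hypothetical name, not from the answer; it assumes len is a multiple of 16, otherwise a scalar tail loop is needed):

#include <stddef.h>
#include <string.h>

void convert_buf(char *dst, const char *src, size_t len) {
    for (size_t i = 0; i < len; i += 16) {
        i8x16 v;
        memcpy(&v, src + i, 16);   /* unaligned load into the vector type */
        v = vec_func(v);           /* convert 16 characters at once */
        memcpy(dst + i, &v, 16);   /* store the 16 converted bytes */
    }
}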
Compiled on architectures with SIMD instructions (x86_64, arm+neon, ppc+altivec, etc.), this comes to roughly 20 instructions and touches about 80 bytes of data to convert 16 sequential characters (with AVX2 it can do 32 with only a slight modification).
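For illustration, a sketch of that slight modification (my own naming, assuming -mavx2): the vector type is widened to 32 lanes and the body stays the same.

/* hypothetical 32-lane variant, compiled with -mavx2 */
typedef __INT8_TYPE__ i8x32 __attribute__ ((__vector_size__ (32), aligned(32), __may_alias__));
i8x32 vec_func32(i8x32 c) {
    i8x32 is09 = (c >= '0') & (c <= '9');
    i8x32 isaf = (c >= 'a') & (c <= 'f');
    return (c & (is09 | isaf)) + (16 & is09) - (23 & isaf);
}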
For example, compiling the 16-byte version for generic x86_64 produces:
vec_func: # @lu16
movdqa xmm1, xmm0
pcmpgtb xmm1, xmmword ptr [rip + .LCPI0_0]
movdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58]
pcmpgtb xmm2, xmm0
movdqa xmm3, xmm0
pcmpgtb xmm3, xmmword ptr [rip + .LCPI0_2]
pand xmm2, xmm1
movdqa xmm1, xmmword ptr [rip + .LCPI0_3] # xmm1 = [103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103]
pcmpgtb xmm1, xmm0
pand xmm1, xmm3
movdqa xmm3, xmm2
por xmm3, xmm1
pand xmm3, xmm0
pand xmm2, xmmword ptr [rip + .LCPI0_4]
pand xmm1, xmmword ptr [rip + .LCPI0_5]
por xmm1, xmm2
paddb xmm1, xmm3
movdqa xmm0, xmm1
ret
or with AVX2 enabled:
vec_func:
vpcmpgtb xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
vmovdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58]
vpcmpgtb xmm2, xmm2, xmm0
vpcmpgtb xmm3, xmm0, xmmword ptr [rip + .LCPI0_2]
vpand xmm1, xmm1, xmm2
vmovdqa xmm2, xmmword ptr [rip + .LCPI0_3] # xmm2 = [103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103]
vpcmpgtb xmm2, xmm2, xmm0
vpand xmm2, xmm3, xmm2
vpor xmm3, xmm1, xmm2
vpand xmm0, xmm3, xmm0
vpand xmm1, xmm1, xmmword ptr [rip + .LCPI0_4]
vpand xmm2, xmm2, xmmword ptr [rip + .LCPI0_5]
vpor xmm1, xmm2, xmm1
vpaddb xmm0, xmm1, xmm0
ret
and aarch64:
vec_func:
movi v2.16b, 0x61
movi v4.16b, 0x66
movi v1.16b, 0x30
movi v5.16b, 0x39
cmge v3.16b, v0.16b, v2.16b
cmge v2.16b, v4.16b, v0.16b
cmge v1.16b, v0.16b, v1.16b
cmge v5.16b, v5.16b, v0.16b
movi v4.16b, 0x10
and v2.16b, v3.16b, v2.16b
and v1.16b, v1.16b, v5.16b
movi v5.16b, 0x17
and v3.16b, v1.16b, v4.16b
orr v1.16b, v1.16b, v2.16b
and v2.16b, v2.16b, v5.16b
and v1.16b, v1.16b, v0.16b
add v1.16b, v1.16b, v3.16b
sub v0.16b, v1.16b, v2.16b
ret
or POWER9:
vec_func:
xxspltib 35, 47
xxspltib 36, 58
vcmpgtsb 3, 2, 3
vcmpgtsb 4, 4, 2
xxland 0, 35, 36
xxspltib 35, 96
xxspltib 36, 103
vcmpgtsb 3, 2, 3
vcmpgtsb 4, 4, 2
xxland 1, 35, 36
xxlor 2, 0, 1
xxlxor 3, 3, 3
xxsel 34, 3, 34, 2
xxspltib 2, 16
xxsel 35, 3, 2, 0
xxspltib 0, 233
xxsel 36, 3, 0, 1
xxlor 35, 36, 35
vaddubm 2, 3, 2
blr