I have a function that takes a character, checks it, and returns another character depending on the one it received.
I use a switch to check the given character and return the one we want, but I need more speed, so I also wrote an SSE2 version.
My SSE2 function is 1.5x slower than the switch function. Why? What is slow in my SSE2 function, and how does gcc -O3 make the switch this fast?

char
switch_func(char c) {
    switch (c) {
        case '0':
            return 0x40;
        case '1':
            return 0x41;
        case '2':
            return 0x42;
        case '3':
            return 0x43;
        case '4':
            return 0x44;
        case '5':
            return 0x45;
        case '6':
            return 0x46;
        case '7':
            return 0x47;
        case '8':
            return 0x48;
        case '9':
            return 0x49;
        case 'a':
            return 0x4a;
        case 'b':
            return 0x4b;
        case 'c':
            return 0x4c;
        case 'd':
            return 0x4d;
        case 'e':
            return 0x4e;
        case 'f':
            return 0x4f;
        default:
            return 0x00;
    }
}

And the SSE2 function ->
char
SSE2_func(char c) {

    __m128i vec0 = _mm_set_epi8('f','e','d','c','b','a','9',
            '8','7','6','5','4','3','2','1','0');
    __m128i vec1 = _mm_set1_epi8(c);

    static char list[] = {
            0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f
    };

    vec1 = _mm_cmpeq_epi8(vec0, vec1); // Compare to find (c) in (vec0) list

    int x;
    if((x = _mm_movemask_epi8(vec1)) != 0) {
        if((x = __builtin_ctz(x)) < 16) { // x is the position of (c) character in (list[])
            return list[x];
        }
    }
    return 0x00;
}

GCC compiler: (-O3 -msse2)

Best answer

Most compilers will convert your switch into a lookup table or jump table, as if it were similar to the following code:

char lut_func(char c){
    static const char lut[256] = {
        ['0']=0x40, ['1']=0x41, ['2']=0x42, ['3']=0x43,
        ['4']=0x44, ['5']=0x45, ['6']=0x46, ['7']=0x47,
        ['8']=0x48, ['9']=0x49, ['a']=0x4a, ['b']=0x4b,
        ['c']=0x4c, ['d']=0x4d, ['e']=0x4e, ['f']=0x4f,
        /* everything else is set to 0 automatically */
    };
    return  lut[(unsigned char)c];
}

The only problems are:
it can't be vectorized
the commonly used data ('0'-'9', 'a'-'f') spans 2 of the 64-byte data cache lines
You can fix the cache-line miss by correctly aligning and offsetting the data (which the compiler may do if you profile the code), for example:
char lut_func(char c){
    static const char __attribute__((aligned(64)))lut_data[256+16] = {
        ['0'+16]=0x40, ['1'+16]=0x41, ['2'+16]=0x42, ['3'+16]=0x43,
        ['4'+16]=0x44, ['5'+16]=0x45, ['6'+16]=0x46, ['7'+16]=0x47,
        ['8'+16]=0x48, ['9'+16]=0x49, ['a'+16]=0x4a, ['b'+16]=0x4b,
        ['c'+16]=0x4c, ['d'+16]=0x4d, ['e'+16]=0x4e, ['f'+16]=0x4f,
        /* everything else is set to 0 automatically */
    };
    const char *lut = lut_data+16;
    return  lut[(unsigned char)c];
}

It's hard to say whether this would help much, since neither the composition of the data nor a benchmark was included.
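For reference, a minimal benchmark sketch (my own addition, not from the question; the random hex-digit input and the buffer size are assumptions) could look like this:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

char switch_func(char c);   /* the two functions from the question, defined above */
char SSE2_func(char c);

int main(void) {
    enum { N = 1 << 20 };
    static char in[N], out[N];
    for (int i = 0; i < N; i++)                 /* assumed input: random hex digits */
        in[i] = "0123456789abcdef"[rand() & 15];

    struct timespec t0, t1;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < N; i++)
        out[i] = switch_func(in[i]);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    printf("switch: %ld ns\n",
           (long)((t1.tv_sec - t0.tv_sec) * 1000000000L + (t1.tv_nsec - t0.tv_nsec)));

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < N; i++)
        out[i] = SSE2_func(in[i]);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    printf("SSE2:   %ld ns\n",
           (long)((t1.tv_sec - t0.tv_sec) * 1000000000L + (t1.tv_nsec - t0.tv_nsec)));

    return out[0] == 0; /* keep the results "live" so the loops aren't optimized away */
}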
The hand-written SSE2 code, while clever, unfortunately ends up mixed with non-SSE2 code that slows it down and makes it hard to vectorize automatically (especially since you are limited to SSE2: the __builtin_ctz, the ifs, and the char-array access). In the case where the data is already "hot" in cache, this is less efficient than a single table access. If the SSE2 version isn't called often, it may still be worth using, but in that case it wouldn't need optimizing in the first place.
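It is possible to stay entirely inside SSE2, though. As a sketch of my own (not code from the answer), the result can be selected with vector ANDs and a horizontal OR instead of __builtin_ctz plus a scalar table lookup:

#include <emmintrin.h>

/* Branch-free variant of SSE2_func (my own sketch): the values are kept in a
   second vector that lines up with the keys, the compare mask selects the
   matching byte, and a horizontal OR collapses it into byte 0. */
char SSE2_func_branchless(char c) {
    const __m128i keys = _mm_set_epi8('f','e','d','c','b','a','9','8',
                                      '7','6','5','4','3','2','1','0');
    const __m128i vals = _mm_set_epi8(0x4f,0x4e,0x4d,0x4c,0x4b,0x4a,0x49,0x48,
                                      0x47,0x46,0x45,0x44,0x43,0x42,0x41,0x40);
    __m128i mask = _mm_cmpeq_epi8(keys, _mm_set1_epi8(c)); // 0xff at the matching byte
    __m128i sel  = _mm_and_si128(mask, vals);              // keep only the matched value
    /* horizontal OR of the 16 bytes: at most one byte is non-zero */
    sel = _mm_or_si128(sel, _mm_srli_si128(sel, 8));
    sel = _mm_or_si128(sel, _mm_srli_si128(sel, 4));
    sel = _mm_or_si128(sel, _mm_srli_si128(sel, 2));
    sel = _mm_or_si128(sel, _mm_srli_si128(sel, 1));
    return (char)(_mm_cvtsi128_si32(sel) & 0xff);
}

Whether this beats the lookup table for one character at a time still depends on the call pattern; the real win from SIMD comes from processing many characters per call, as below.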
If you can access your data sequentially, you can use vector extensions to get real SIMD code, something like this:
//this vector extension syntax requires gcc or clang versions 5+
typedef __INT8_TYPE__ i8x16 __attribute__ ((__vector_size__ (16), aligned(16), __may_alias__));
i8x16 vec_func(i8x16 c){
    i8x16 is09 = (c>='0') & (c<='9');
    i8x16 isaf = (c>='a') & (c<='f');
    return (c & (is09 | isaf)) + (16 & is09) - (23 & isaf);
}
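
A hypothetical driver (my own sketch, assuming the length handled here is a multiple of 16) that runs vec_func over a byte stream could look like:

#include <string.h>

/* Processes n bytes, 16 at a time; memcpy gives unaligned-safe loads/stores
   into the vector type. A scalar tail loop would be needed for n % 16 != 0. */
void translate_buffer(const char *in, char *out, unsigned long n) {
    for (unsigned long i = 0; i + 16 <= n; i += 16) {
        i8x16 v;
        memcpy(&v, in + i, 16);
        v = vec_func(v);
        memcpy(out + i, &v, 16);
    }
}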

Compiled for an architecture with SIMD instructions (x86_64, arm+neon, ppc+altivec, etc.), this comes out to roughly 20 instructions and touches roughly 80 bytes of data to convert 16 characters at a time (with AVX2 it can do 32 at a time with only minor modifications).
For example, compiling for generic x86_64 yields:
vec_func:                                   # @lu16
    movdqa  xmm1, xmm0
    pcmpgtb xmm1, xmmword ptr [rip + .LCPI0_0]
    movdqa  xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58]
    pcmpgtb xmm2, xmm0
    movdqa  xmm3, xmm0
    pcmpgtb xmm3, xmmword ptr [rip + .LCPI0_2]
    pand    xmm2, xmm1
    movdqa  xmm1, xmmword ptr [rip + .LCPI0_3] # xmm1 = [103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103]
    pcmpgtb xmm1, xmm0
    pand    xmm1, xmm3
    movdqa  xmm3, xmm2
    por     xmm3, xmm1
    pand    xmm3, xmm0
    pand    xmm2, xmmword ptr [rip + .LCPI0_4]
    pand    xmm1, xmmword ptr [rip + .LCPI0_5]
    por     xmm1, xmm2
    paddb   xmm1, xmm3
    movdqa  xmm0, xmm1
    ret

or with AVX2 enabled
vec_func:
    vpcmpgtb        xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
    vmovdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58]
    vpcmpgtb        xmm2, xmm2, xmm0
    vpcmpgtb        xmm3, xmm0, xmmword ptr [rip + .LCPI0_2]
    vpand   xmm1, xmm1, xmm2
    vmovdqa xmm2, xmmword ptr [rip + .LCPI0_3] # xmm2 = [103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103]
    vpcmpgtb        xmm2, xmm2, xmm0
    vpand   xmm2, xmm3, xmm2
    vpor    xmm3, xmm1, xmm2
    vpand   xmm0, xmm3, xmm0
    vpand   xmm1, xmm1, xmmword ptr [rip + .LCPI0_4]
    vpand   xmm2, xmm2, xmmword ptr [rip + .LCPI0_5]
    vpor    xmm1, xmm2, xmm1
    vpaddb  xmm0, xmm1, xmm0
    ret

and aarch64
vec_func:
    movi    v2.16b, 0x61
    movi    v4.16b, 0x66
    movi    v1.16b, 0x30
    movi    v5.16b, 0x39
    cmge    v3.16b, v0.16b, v2.16b
    cmge    v2.16b, v4.16b, v0.16b
    cmge    v1.16b, v0.16b, v1.16b
    cmge    v5.16b, v5.16b, v0.16b
    movi    v4.16b, 0x10
    and     v2.16b, v3.16b, v2.16b
    and     v1.16b, v1.16b, v5.16b
    movi    v5.16b, 0x17
    and     v3.16b, v1.16b, v4.16b
    orr     v1.16b, v1.16b, v2.16b
    and     v2.16b, v2.16b, v5.16b
    and     v1.16b, v1.16b, v0.16b
    add     v1.16b, v1.16b, v3.16b
    sub     v0.16b, v1.16b, v2.16b
    ret

or POWER9
vec_func:
    xxspltib 35, 47
    xxspltib 36, 58
    vcmpgtsb 3, 2, 3
    vcmpgtsb 4, 4, 2
    xxland 0, 35, 36
    xxspltib 35, 96
    xxspltib 36, 103
    vcmpgtsb 3, 2, 3
    vcmpgtsb 4, 4, 2
    xxland 1, 35, 36
    xxlor 2, 0, 1
    xxlxor 3, 3, 3
    xxsel 34, 3, 34, 2
    xxspltib 2, 16
    xxsel 35, 3, 2, 0
    xxspltib 0, 233
    xxsel 36, 3, 0, 1
    xxlor 35, 36, 35
    vaddubm 2, 3, 2
    blr
