下面这个函数对 src 中的系数进行加权计算,并将结果存入 dst。

/*
 * Scalar version: weights the 16 coefficients of src and writes them to dst.
 *
 * With t = threshold[index][i] (and t small enough that the unsigned sums
 * below do not wrap):
 *   - level within [-t, t]       : dst[i] is left untouched
 *   - level outside [-2t, 2t]    : dst[i] = level * factor[i]
 *   - otherwise                  : dst[i] = 2 * (level -/+ t) * factor[i]
 *
 * Always returns 0.
 */
static int _medium_c( DCTELEM * src, int index, int *dst )
{
    int pos;

    for( pos = 0; pos < 16; pos++ )
    {
        const unsigned int dead = threshold[index][pos]; // threshold holds constant values
        const unsigned int dead_x2 = ( dead<<1 );
        const int coef = src[pos];

        // A single unsigned compare covers both signs: a negative sum wraps
        // to a large unsigned value, so this skips coef inside [-dead, dead].
        if( ( ( unsigned )( coef+dead ) ) <= dead_x2 )
        {
            continue;
        }

        if( ( ( unsigned )( coef+2*dead ) ) > 2*dead_x2 )
        {
            // Outside twice the dead zone: plain weighting.
            dst[pos] = coef * factor[pos];
        }
        else if( coef>0 )
        {
            // Transition band, positive side.
            dst[pos] =  2*( coef - ( int )dead ) * factor[pos];
        }
        else
        {
            // Transition band, negative side.
            dst[pos] =  2*( coef + ( int )dead ) * factor[pos];
        }
    }
    return 0;
}


使用 SSE 内联函数(intrinsics)的版本为:

/*
 * SSE version of the weighting done by _medium_c: handles the 16
 * coefficients of src in two passes of eight 16-bit lanes and stores 16
 * 32-bit results to dst. Always returns 0.
 *
 * Differences from the scalar version that are visible in this code:
 *   - Lanes that fail the outer threshold test are stored as 0, whereas
 *     _medium_c leaves the corresponding dst[i] untouched.
 *   - The unsigned '>' tests are realised as '>=' (max + cmpeq).
 *
 * NOTE(review): the loads and the shuffle mask assume that DCTELEM is a
 * 16-bit type and that factor[] / threshold[][] hold 32-bit elements whose
 * values (and 2*factor, 4*threshold) fit in 16 bits -- confirm against the
 * definitions, which are not visible here.
 */
int medium_intrinsic16( DCTELEM * src, int index, int* dst )
{
    int j;

    /* Loop-invariant constants. */
    const __m128i zero128 = _mm_setzero_si128();
    /* Shuffle control: gathers bytes 0,1,4,5,8,9,12,13 (the low half of each
     * 4-byte element) into the low 64 bits; 0x80 zeroes the upper 8 bytes. */
    const __m128i mask = _mm_set_epi8( 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                       0x0d, 0x0c, 0x09, 0x08, 0x05, 0x04, 0x01, 0x00 );

    for( j = 0; j < 2; j++ )
    {
        /* factor[8j .. 8j+7] narrowed to eight 16-bit lanes. */
        __m128i factor_a = _mm_loadu_si128( (__m128i*)&factor[8*j] );
        factor_a = _mm_shuffle_epi8( factor_a, mask );
        __m128i factor_b = _mm_loadu_si128( (__m128i*)&factor[8*j+4] );
        factor_b = _mm_shuffle_epi8( factor_b, mask );
        factor_a = _mm_unpacklo_epi64( factor_a, factor_b );

        /* Eight coefficients. */
        __m128i level_a = _mm_loadu_si128( (__m128i*)&src[8*j] );

        /* threshold[index][8j .. 8j+7] narrowed the same way. */
        __m128i threshold1_a = _mm_loadu_si128( (__m128i*)&threshold[index][8*j] );
        threshold1_a = _mm_shuffle_epi8( threshold1_a, mask );
        __m128i threshold1_b = _mm_loadu_si128( (__m128i*)&threshold[index][8*j+4] );
        threshold1_b = _mm_shuffle_epi8( threshold1_b, mask );
        threshold1_a = _mm_unpacklo_epi64( threshold1_a, threshold1_b );

        /* The data is packed as 16-bit lanes here, so the doubling must be a
         * 16-bit shift; a 32-bit shift would carry bit 15 of each even lane
         * into its neighbour. */
        __m128i threshold2_a = _mm_slli_epi16( threshold1_a, 1 );

        /* Transition band: 2*(level -/+ threshold1)*factor, as low and high
         * 16-bit halves of the 32-bit product. */
        __m128i is_pos      = _mm_cmpgt_epi16( level_a, zero128 );      /* level > 0 */
        __m128i lvl_minus_t = _mm_sub_epi16( level_a, threshold1_a );   /* level - threshold1 */
        __m128i lvl_plus_t  = _mm_add_epi16( level_a, threshold1_a );   /* level + threshold1 */
        __m128i factor_x2   = _mm_slli_epi16( factor_a, 1 );            /* 2*factor */

        __m128i pos_lo = _mm_mullo_epi16( lvl_minus_t, factor_x2 );
        __m128i pos_hi = _mm_mulhi_epi16( lvl_minus_t, factor_x2 );
        __m128i neg_lo = _mm_mullo_epi16( lvl_plus_t, factor_x2 );
        __m128i neg_hi = _mm_mulhi_epi16( lvl_plus_t, factor_x2 );

        /* Pick the (level - t) product where level > 0, else the (level + t) one. */
        __m128i band_lo = _mm_blendv_epi8( neg_lo, pos_lo, is_pos );
        __m128i band_hi = _mm_blendv_epi8( neg_hi, pos_hi, is_pos );

        /* Outer region test: (level + 2*threshold1) >= 2*threshold2, unsigned. */
        __m128i lvl_plus_2t = _mm_add_epi16( level_a, threshold2_a );
        __m128i four_t      = _mm_slli_epi16( threshold2_a, 1 );
        __m128i is_far      = _mm_cmpeq_epi16( _mm_max_epu16( lvl_plus_2t, four_t ), lvl_plus_2t );

        /* Outer region value: level * factor. */
        __m128i far_lo = _mm_mullo_epi16( level_a, factor_a );
        __m128i far_hi = _mm_mulhi_epi16( level_a, factor_a );

        __m128i res_lo = _mm_blendv_epi8( band_lo, far_lo, is_far );
        __m128i res_hi = _mm_blendv_epi8( band_hi, far_hi, is_far );

        /* Dead-zone test: (level + threshold1) >= threshold2, unsigned.
         * Lanes that fail it are cleared to 0. */
        __m128i is_live = _mm_cmpeq_epi16( _mm_max_epu16( lvl_plus_t, threshold2_a ), lvl_plus_t );
        res_lo = _mm_and_si128( is_live, res_lo );
        res_hi = _mm_and_si128( is_live, res_hi );

        /* Interleave the low/high halves into 32-bit results and store. */
        _mm_storeu_si128( (__m128i*)&dst[8*j],   _mm_unpacklo_epi16( res_lo, res_hi ) );
        _mm_storeu_si128( (__m128i*)&dst[8*j+4], _mm_unpackhi_epi16( res_lo, res_hi ) );
    }
    return 0;
}


内联函数(intrinsic)版本并不比 C 版本快。问题是:如果我删除 for 循环的最后两行,即 `_mm_storeu_si128((__m128i*)&dst[8*j], m2);` 和 `_mm_storeu_si128((__m128i*)&dst[8*j+4], m3);`,内联函数版本就会比 C 版本快得多(约 4 倍)。谁能解释这是为什么?_mm_storeu_si128() 真的要花这么多时间吗?谢谢。

最佳答案

如果它与 C 版本的速度相同,那么瓶颈很可能在内存带宽上。在这种情况下,是的,写入内存就是整个算法中开销最大的操作。

或者,当结果没有存储到任何地方时,编译器会把大量代码优化掉!您必须查看汇编(asm),确认编译器只是去掉了存储(store)指令,而不是把函数的大部分都优化掉了。

请参见http://agner.org/optimize/,以及https://stackoverflow.com/tags/x86/info上的其他链接(特别是Ulrich Drepper关于缓存的论文。)

可以研究一下缓存分块(cache blocking),又称循环分块(loop tiling)。

关于c - _mm_storeu_si128花费太多时间?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/35081696/

10-11 21:00