众所周知,CPU是流水线,如果命令序列彼此独立,则它的工作效率最高。这称为ILP(指令级并行):http://en.wikipedia.org/wiki/Instruction-level_parallelism

但是,是否有一个真正可行的示例,至少在语法示例中显示了ILP的CPU x86_64的优势(但在两种情况下cmp/jne的数量相同)?

我将编写以下示例-将数组的所有元素加起来,但它没有显示ILP的任何优势:http://ideone.com/fork/poWfsm

  • 顺序:

  •         for(i = 0; i < arr_size; i += 8) {
                result += arr[i+0] + arr[i+1] +
                        arr[i+2] + arr[i+3] +
                        arr[i+4] + arr[i+5] +
                        arr[i+6] + arr[i+7];
            }
    
  • ILP:

  •         register unsigned int v0, v1, v2, v3;
            v0 = v1 = v2 = v3 = 0;
            for(i = 0; i < arr_size; i += 8) {
                v0 += arr[i+0] + arr[i+1];
                v1 += arr[i+2] + arr[i+3];
                v2 += arr[i+4] + arr[i+5];
                v3 += arr[i+6] + arr[i+7];
            }
            result = v0+v1+v2+v3;
    

    结果:



    ILP比Sequential慢一点。

    C代码:http://ideone.com/fork/poWfsm

    #include <time.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    int main() {
        // create and init array
        const size_t arr_size = 100000000;
        unsigned int *arr = (unsigned int*) malloc(arr_size * sizeof(unsigned int));
        size_t i, k;
        for(i = 0; i < arr_size; ++i)
            arr[i] = 10;
    
        unsigned int result = 0;
        clock_t start, end;
        const int c_iterations = 10;    // iterations of experiment
        float faster_avg = 0;
        // -----------------------------------------------------------------
    
    
        for(k = 0; k < c_iterations; ++k) {
            result = 0;
    
            // Sequential
            start = clock();
    
            for(i = 0; i < arr_size; i += 8) {
                result += arr[i+0] + arr[i+1] +
                        arr[i+2] + arr[i+3] +
                        arr[i+4] + arr[i+5] +
                        arr[i+6] + arr[i+7];
            }
    
            end = clock();
            const float c_time_seq = (float)(end - start)/CLOCKS_PER_SEC;
            printf("seq: %f sec, res: %u, ", c_time_seq, result);
            // -----------------------------------------------------------------
    
            result = 0;
    
            // IPL-optimization
            start = clock();
    
            register unsigned int v0, v1, v2, v3;
            v0 = v1 = v2 = v3 = 0;
    
            for(i = 0; i < arr_size; i += 8) {
    
                v0 += arr[i+0] + arr[i+1];
                v1 += arr[i+2] + arr[i+3];
                v2 += arr[i+4] + arr[i+5];
                v3 += arr[i+6] + arr[i+7];
    
    
            }
            result = v0+v1+v2+v3;
    
    
            end = clock();
            const float c_time_ipl = (float)(end - start)/CLOCKS_PER_SEC;
            const float c_faster = c_time_seq/c_time_ipl;
    
            printf("ipl: %f sec, faster %f X, res: %u \n", c_time_ipl, c_faster, result);
            faster_avg += c_faster;
        }
    
        faster_avg = faster_avg/c_iterations;
        printf("faster AVG: %f \n", faster_avg);
    
        return 0;
    }
    

    更新:
  • 顺序(反汇编程序MS Visual Studio 2013):

  •     for (i = 0; i < arr_size; i += 8) {
            result += arr[i + 0] + arr[i + 1] +
                arr[i + 2] + arr[i + 3] +
                arr[i + 4] + arr[i + 5] +
                arr[i + 6] + arr[i + 7];
        }
    
    000000013F131080  mov         ecx,dword ptr [rdx-18h]
    000000013F131083  lea         rdx,[rdx+20h]
    000000013F131087  add         ecx,dword ptr [rdx-34h]
    000000013F13108A  add         ecx,dword ptr [rdx-30h]
    000000013F13108D  add         ecx,dword ptr [rdx-2Ch]
    000000013F131090  add         ecx,dword ptr [rdx-28h]
    000000013F131093  add         ecx,dword ptr [rdx-24h]
    000000013F131096  add         ecx,dword ptr [rdx-1Ch]
    000000013F131099  add         ecx,dword ptr [rdx-20h]
    000000013F13109C  add         edi,ecx
    000000013F13109E  dec         r8
    000000013F1310A1  jne         main+80h (013F131080h)
    
  • ILP(反汇编程序MS Visual Studio 2013):

  •     for (i = 0; i < arr_size; i += 8) {
            v0 += arr[i + 0] + arr[i + 1];
    000000013F1310F0  mov         ecx,dword ptr [rdx-0Ch]
            v1 += arr[i + 2] + arr[i + 3];
            v2 += arr[i + 4] + arr[i + 5];
    000000013F1310F3  mov         eax,dword ptr [rdx+8]
    000000013F1310F6  lea         rdx,[rdx+20h]
    000000013F1310FA  add         ecx,dword ptr [rdx-28h]
    000000013F1310FD  add         eax,dword ptr [rdx-1Ch]
    000000013F131100  add         ebp,ecx
    000000013F131102  mov         ecx,dword ptr [rdx-24h]
    000000013F131105  add         ebx,eax
    000000013F131107  add         ecx,dword ptr [rdx-20h]
            v3 += arr[i + 6] + arr[i + 7];
    000000013F13110A  mov         eax,dword ptr [rdx-10h]
            v3 += arr[i + 6] + arr[i + 7];
    000000013F13110D  add         eax,dword ptr [rdx-14h]
    000000013F131110  add         esi,ecx
    000000013F131112  add         edi,eax
    000000013F131114  dec         r8
    000000013F131117  jne         main+0F0h (013F1310F0h)
        }
        result = v0 + v1 + v2 + v3;
    

    编译器命令行:
    /GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /Ob2 /sdl /Fd"x64\Release\vc120.pdb" /fp:precise /D "_MBCS" /errorReport:prompt /WX- /Zc:forScope /Gd /Oi /MT /Fa"x64\Release\" /EHsc /nologo /Fo"x64\Release\" /Ot /Fp"x64\Release\IPL_reduce_test.pch"
    

    答案的其他说明:

    一个简单的示例显示了对于50000000个双元素数组,Unroll-loop和Unroll-loop + ILP之间的ILP的好处:http://ideone.com/LgTP6b


  • 可以由CPU管道优化的伪序列(反汇编MS Visual Studio 2013)-在每次迭代中添加8个元素时,使用临时寄存器xmm0,然后将其添加到结果xmm6中,即可以使用Register renaming:
  • li

    result += arr[i + 0] + arr[i + 1] + arr[i + 2] + arr[i + 3] +
        arr[i + 4] + arr[i + 5] + arr[i + 6] + arr[i + 7];
    000000013FBA1090  movsd       xmm0,mmword ptr [rcx-10h]
    000000013FBA1095  add         rcx,40h
    000000013FBA1099  addsd       xmm0,mmword ptr [rcx-48h]
    000000013FBA109E  addsd       xmm0,mmword ptr [rcx-40h]
    000000013FBA10A3  addsd       xmm0,mmword ptr [rcx-38h]
    000000013FBA10A8  addsd       xmm0,mmword ptr [rcx-30h]
    000000013FBA10AD  addsd       xmm0,mmword ptr [rcx-28h]
    000000013FBA10B2  addsd       xmm0,mmword ptr [rcx-20h]
    000000013FBA10B7  addsd       xmm0,mmword ptr [rcx-18h]
    000000013FBA10BC  addsd       xmm6,xmm0
    000000013FBA10C0  dec         rdx
    000000013FBA10C3  jne         main+90h (013FBA1090h)
    
  • 不能由CPU管道优化的True-sequential (Disassembler MS Visual Studio 2013)-在每次迭代中添加8个元素都使用结果寄存器xmm6,即不能使用Register renaming:

  •             result += arr[i + 0];
    000000013FFC1090  addsd       xmm6,mmword ptr [rcx-10h]
    000000013FFC1095  add         rcx,40h
                result += arr[i + 1];
    000000013FFC1099  addsd       xmm6,mmword ptr [rcx-48h]
                result += arr[i + 2];
    000000013FFC109E  addsd       xmm6,mmword ptr [rcx-40h]
                result += arr[i + 3];
    000000013FFC10A3  addsd       xmm6,mmword ptr [rcx-38h]
                result += arr[i + 4];
    000000013FFC10A8  addsd       xmm6,mmword ptr [rcx-30h]
                result += arr[i + 5];
    000000013FFC10AD  addsd       xmm6,mmword ptr [rcx-28h]
                result += arr[i + 6];
    000000013FFC10B2  addsd       xmm6,mmword ptr [rcx-20h]
                result += arr[i + 7];
    000000013FFC10B7  addsd       xmm6,mmword ptr [rcx-18h]
    000000013FFC10BC  dec         rdx
    000000013FFC10BF  jne         main+90h (013FFC1090h)
    

    最佳答案

    在大多数英特尔处理器上,需要3个周期来执行浮点加法。但是,如果它们是独立的,则最多可以维持1个周期。

    通过在关键路径上添加浮点加法,我们可以轻松地演示ILP。

    环境:

  • GCC 4.8.2:-O2
  • 桑迪桥至强
    确保编译器不执行不安全的浮点优化。

    #include <iostream>
    using namespace std;
    
    #include <time.h>
    
    const int iterations = 1000000000;
    
    double sequential(){
        double a = 2.3;
        double result = 0;
    
        for (int c = 0; c < iterations; c += 4){
            //  Every add depends on the previous add. No ILP is possible.
            result += a;
            result += a;
            result += a;
            result += a;
        }
    
        return result;
    }
    double optimized(){
        double a = 2.3;
        double result0 = 0;
        double result1 = 0;
        double result2 = 0;
        double result3 = 0;
    
        for (int c = 0; c < iterations; c += 4){
            //  4 independent adds. Up to 4 adds can be run in parallel.
            result0 += a;
            result1 += a;
            result2 += a;
            result3 += a;
        }
    
        return result0 + result1 + result2 + result3;
    }
    
    int main(){
    
        clock_t start0 = clock();
        double sum0 = sequential();
        clock_t end0 = clock();
        cout << "sum = " << sum0 << endl;
        cout << "sequential time: " << (double)(end0 - start0) / CLOCKS_PER_SEC << endl;
    
        clock_t start1 = clock();
        double sum1 = optimized();
        clock_t end1 = clock();
        cout << "sum = " << sum1 << endl;
        cout << "optimized time:  " << (double)(end1 - start1) / CLOCKS_PER_SEC << endl;
    
    }
    

    输出:
    sum = 2.3e+09
    sequential time: 0.948138
    sum = 2.3e+09
    optimized time:  0.317293
    

    注意差异几乎是3倍。这是因为浮点添加的3周期延迟和1周期吞吐量。

    顺序版本的ILP很少,因为所有浮点添加都位于关键路径上。 (每个添加都需要等待,直到完成上一个添加为止)。展开的版本具有4个独立的依赖项链,其中包含多达4个独立的添加项-所有这些都可以并行运行。只需3个即可饱和处理器内核。

    07-27 17:02