为了测量CPU的峰值FLOPS性能,我编写了一个C++程序。但测量得到的结果却高于我的CPU的理论峰值FLOPS。问题出在哪里?

这是我写的代码:

#include <math.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <chrono>
#include <cmath>
#include <iostream>

// One packed Mandelbrot step on four lanes at once: A = B * B + c (complex).
// Cost: 4 multiplies + 2 adds + 1 subtract, each on 4 floats = 28 FLOP.
//   A_Re, A_Im - outputs, real and imaginary parts of the result
//   B_Re, B_Im - inputs, real and imaginary parts of the value being squared
//   c_Re, c_Im - inputs, real and imaginary parts of the added constant
inline void _Mandelbrot(__m128 & A_Re, __m128 & A_Im, const __m128 & B_Re, const __m128 & B_Im, const __m128 & c_Re, const __m128 & c_Im)
{
    // Real part: Re(B)^2 - Im(B)^2 + Re(c)                         (16 FLOP)
    const __m128 reSquared = _mm_mul_ps(B_Re, B_Re);
    const __m128 imSquared = _mm_mul_ps(B_Im, B_Im);
    A_Re = _mm_add_ps(_mm_sub_ps(reSquared, imSquared), c_Re);

    // Imaginary part: 2 * Re(B) * Im(B) + Im(c)                    (12 FLOP)
    // B_Re / B_Im are deliberately read only after A_Re has been stored.
    const __m128 two = _mm_set_ps1(2.0f);
    const __m128 crossTerm = _mm_mul_ps(B_Re, B_Im);
    A_Im = _mm_add_ps(_mm_mul_ps(two, crossTerm), c_Im);
}

/// Runs a fixed SSE floating-point workload and reports the achieved throughput.
///
/// Six _Mandelbrot update chains are iterated 1000 x 1,000,000 times. Each
/// inner iteration performs 6 x 28 = 168 FLOP, so the timed region is counted
/// as 168 GFLOP; the per-outer-iteration setup (sin/cos, accumulation) is not
/// included in that figure.
///
/// @return The nominal 168 GFLOP divided by the elapsed wall-clock time in
///         seconds (measured with whole-millisecond resolution), i.e. GFLOP/s.
float Mandelbrot()
{
    float phi = 0.0f;
    const float dphi = 0.001f;
    __m128 c_Re, c_Im,
        x1_Re, x1_Im,
        x2_Re, x2_Im,
        x3_Re, x3_Im,
        x4_Re, x4_Im,
        x5_Re, x5_Im,
        x6_Re, x6_Im;
    __m128 res = _mm_setzero_ps();

    const std::chrono::high_resolution_clock::time_point startTime = std::chrono::high_resolution_clock::now();

    //168GFLOP
    for (int i = 0; i < 1000; ++i)
    {
        // std::sin / std::cos pick their float overloads here; unlike
        // std::sinf / std::cosf they are available in every <cmath>.
        c_Re = _mm_setr_ps( -1.0f + 0.1f * std::sin(phi + 0 * dphi),   //20FLOP
                            -1.0f + 0.1f * std::sin(phi + 1 * dphi),
                            -1.0f + 0.1f * std::sin(phi + 2 * dphi),
                            -1.0f + 0.1f * std::sin(phi + 3 * dphi));
        c_Im = _mm_setr_ps(  0.0f + 0.1f * std::cos(phi + 0 * dphi),   //20FLOP
                             0.0f + 0.1f * std::cos(phi + 1 * dphi),
                             0.0f + 0.1f * std::cos(phi + 2 * dphi),
                             0.0f + 0.1f * std::cos(phi + 3 * dphi));
        x1_Re = _mm_set_ps1(-0.00f * dphi); x1_Im = _mm_setzero_ps();   //1FLOP
        x2_Re = _mm_set_ps1(-0.01f * dphi); x2_Im = _mm_setzero_ps();   //1FLOP
        x3_Re = _mm_set_ps1(-0.02f * dphi); x3_Im = _mm_setzero_ps();   //1FLOP
        x4_Re = _mm_set_ps1(-0.03f * dphi); x4_Im = _mm_setzero_ps();   //1FLOP
        x5_Re = _mm_set_ps1(-0.04f * dphi); x5_Im = _mm_setzero_ps();   //1FLOP
        x6_Re = _mm_set_ps1(-0.05f * dphi); x6_Im = _mm_setzero_ps();   //1FLOP

        //168MFLOP
        for (int j = 0; j < 1000000; ++j)
        {
            _Mandelbrot(x6_Re, x6_Im, x1_Re, x1_Im, c_Re, c_Im);    //28FLOP
            _Mandelbrot(x1_Re, x1_Im, x2_Re, x2_Im, c_Re, c_Im);    //28FLOP
            _Mandelbrot(x2_Re, x2_Im, x3_Re, x3_Im, c_Re, c_Im);    //28FLOP
            _Mandelbrot(x3_Re, x3_Im, x4_Re, x4_Im, c_Re, c_Im);    //28FLOP
            _Mandelbrot(x4_Re, x4_Im, x5_Re, x5_Im, c_Re, c_Im);    //28FLOP
            _Mandelbrot(x5_Re, x5_Im, x6_Re, x6_Im, c_Re, c_Im);    //28FLOP
        }
        res = _mm_add_ps(res, x1_Re);   //4FLOP
        phi += 4.0f * dphi;             //2FLOP
    }
    const std::chrono::high_resolution_clock::time_point endTime = std::chrono::high_resolution_clock::now();

    // Copy the accumulator into a plain array. This avoids the compiler-specific
    // m128_f32 member, and keeps every access inside the four valid lanes
    // (the previous code read lane index 4, one past the end).
    float lanes[4];
    _mm_storeu_ps(lanes, res);

    const float elapsedSeconds =
        static_cast<float>(std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count()) / 1000.0f;

    // Prevent dead code removal: the returned value depends on the computed
    // lanes, so the optimizer cannot discard the timed loops.
    if (lanes[0] + lanes[1] > lanes[2] + lanes[3])
        return 168.0f / elapsedSeconds;
    else
        return 168.1f / elapsedSeconds;
}

// Program entry point: runs the benchmark once and prints the measured rate.
int main()
{
    const float gflops = Mandelbrot();
    std::cout << gflops << "GFLOP/s" << std::endl;
    return 0;
}

核心函数_Mandelbrot执行4次_mm_mul_ps、2次_mm_add_ps和1次_mm_sub_ps,每条指令一次处理4个浮点数,因此共7 * 4FLOP = 28FLOP。

我使用的CPU是2.66GHz的Intel Core2Quad Q9450。代码是在Windows 7下用Visual Studio 2012编译的。理论峰值FLOPS应该是4 * 2.66GHz = 10.64GFLOPS。但是程序返回的结果是18.4GFLOPS,我找不出问题所在。有人可以告诉我吗?

最佳答案

根据Intel® Intrinsics Guide,_mm_mul_ps、_mm_add_ps、_mm_sub_ps在您的CPUID 06_17上的Throughput=1(如您所述)。

在不同的资料中,我看到过对吞吐量(Throughput)的不同定义:有些地方指的是每条指令所需的时钟周期数(clock/instruction),另一些地方则恰好相反(当然,既然这里的值是1,两种定义并没有区别)。

根据"Intel® 64 and IA-32 Architectures Optimization Reference Manual"的定义Throughput是:



根据“C.3.2表脚注”:



即加/减和乘法在不同的执行单元上执行。
FP_ADD和FP_MUL执行单元连接到不同的调度端口(在图片底部):

调度程序可以在每个周期将指令调度到多个端口。

乘法和加法执行单元可以并行执行操作。因此,处理器核心上的理论GFLOPS为:

sse_packet_size = 4
instructions_per_cycle = 2
clock_rate_ghz = 2.66
sse_packet_size * instructions_per_cycle * clock_rate_ghz = 21.28GFLOPS

因此,您的18.4GFLOPS接近理论峰值。
_Mandelbrot函数中有3条使用FP_ADD的指令(2次加法、1次减法)和4条使用FP_MUL的指令。如您所见,函数内部存在许多数据依赖,因此这些指令无法高效地交错执行。也就是说,要让FP_ADD有操作可做,FP_MUL必须先执行至少两个操作,以产生FP_ADD所需的操作数。

不过幸运的是,您的内层for循环中有许多互不依赖的操作:
// Inner benchmark loop: six _Mandelbrot steps per iteration.
// Within a single iteration only step 6 consumes a value produced earlier in
// that same iteration (x6, written by step 1); steps 1-5 read values left
// over from the previous iteration (or the initial values on the first pass).
for (int j = 0; j < 1000000; ++j)
{
    _Mandelbrot(x6_Re, x6_Im, x1_Re, x1_Im, c_Re, c_Im); // 1: x6 <- step(x1)
    _Mandelbrot(x1_Re, x1_Im, x2_Re, x2_Im, c_Re, c_Im); // 2: x1 <- step(x2)
    _Mandelbrot(x2_Re, x2_Im, x3_Re, x3_Im, c_Re, c_Im); // 3: x2 <- step(x3)
    _Mandelbrot(x3_Re, x3_Im, x4_Re, x4_Im, c_Re, c_Im); // 4: x3 <- step(x4)
    _Mandelbrot(x4_Re, x4_Im, x5_Re, x5_Im, c_Re, c_Im); // 5: x4 <- step(x5)
    _Mandelbrot(x5_Re, x5_Im, x6_Re, x6_Im, c_Re, c_Im); // 6: x5 <- step(x6), reads step 1's output
}

只有第6个操作依赖于第1个操作的输出。其余所有操作的指令都可以(由编译器和处理器)自由地相互交错,从而使FP_ADD和FP_MUL单元都保持繁忙。

附言:仅出于测试目的,您可以尝试把_Mandelbrot函数中的所有add/sub操作替换为mul,或者反过来——这样您将只能得到当前FLOPS的大约一半。

09-09 18:44