为了测量CPU的峰值FLOPS性能,我编写了一个C++程序。但是测得的结果却高于我的CPU的理论峰值FLOPS。问题出在哪里?
这是我写的代码:
#include <math.h>
#include <mmintrin.h>
#include <xmmintrin.h>

#include <chrono>
#include <cmath>
#include <iostream>
// One step A = B*B + c of the complex quadratic map, evaluated on four packed
// single-precision lanes with real and imaginary parts kept in separate
// registers. Cost: 7 packed operations x 4 lanes = 28 FLOP.
inline void _Mandelbrot(__m128 & A_Re, __m128 & A_Im, const __m128 & B_Re, const __m128 & B_Im, const __m128 & c_Re, const __m128 & c_Im)
{
    // Real part: Re(B)^2 - Im(B)^2 + Re(c) -> 4 packed ops, 16 FLOP
    const __m128 reSquared = _mm_mul_ps(B_Re, B_Re);
    const __m128 imSquared = _mm_mul_ps(B_Im, B_Im);
    A_Re = _mm_add_ps(_mm_sub_ps(reSquared, imSquared), c_Re);

    // Imaginary part: 2 * Re(B) * Im(B) + Im(c) -> 3 packed ops, 12 FLOP
    // (B is read only after A_Re has been written, as in the one-line form.)
    const __m128 two = _mm_set_ps1(2.0f);
    const __m128 crossTerm = _mm_mul_ps(B_Re, B_Im);
    A_Im = _mm_add_ps(_mm_mul_ps(two, crossTerm), c_Im);
}
/// Estimates packed single-precision SSE throughput by running six
/// interleaved Mandelbrot-style recurrences (z = z*z + c) on 4-lane vectors.
///
/// The FLOP figures in the comments follow the author's accounting: 6 calls x
/// 28 FLOP per inner iteration, 10^6 inner iterations (168 MFLOP) and 10^3
/// outer iterations (168 GFLOP). The per-outer-iteration setup work is not
/// included in that total.
///
/// @return The nominal 168 GFLOP divided by the elapsed wall-clock time in
///         seconds, i.e. an estimate in GFLOP/s.
float Mandelbrot()
{
    const float dphi = 0.001f;
    float phi = 0.0f;
    __m128 res = _mm_setzero_ps();

    const auto startTime = std::chrono::high_resolution_clock::now();
    //168GFLOP
    for (int i = 0; i < 1000; ++i)
    {
        // std::sin/std::cos pick the float overloads here; std::sinf/std::cosf
        // are not declared in namespace std by every standard library.
        const __m128 c_Re = _mm_setr_ps(-1.0f + 0.1f * std::sin(phi + 0 * dphi), //20FLOP
                                        -1.0f + 0.1f * std::sin(phi + 1 * dphi),
                                        -1.0f + 0.1f * std::sin(phi + 2 * dphi),
                                        -1.0f + 0.1f * std::sin(phi + 3 * dphi));
        const __m128 c_Im = _mm_setr_ps(0.0f + 0.1f * std::cos(phi + 0 * dphi), //20FLOP
                                        0.0f + 0.1f * std::cos(phi + 1 * dphi),
                                        0.0f + 0.1f * std::cos(phi + 2 * dphi),
                                        0.0f + 0.1f * std::cos(phi + 3 * dphi));

        // Six independent starting points so consecutive calls do not form
        // one long dependency chain.
        __m128 x1_Re = _mm_set_ps1(-0.00f * dphi); __m128 x1_Im = _mm_setzero_ps(); //1FLOP
        __m128 x2_Re = _mm_set_ps1(-0.01f * dphi); __m128 x2_Im = _mm_setzero_ps(); //1FLOP
        __m128 x3_Re = _mm_set_ps1(-0.02f * dphi); __m128 x3_Im = _mm_setzero_ps(); //1FLOP
        __m128 x4_Re = _mm_set_ps1(-0.03f * dphi); __m128 x4_Im = _mm_setzero_ps(); //1FLOP
        __m128 x5_Re = _mm_set_ps1(-0.04f * dphi); __m128 x5_Im = _mm_setzero_ps(); //1FLOP
        __m128 x6_Re = _mm_set_ps1(-0.05f * dphi); __m128 x6_Im = _mm_setzero_ps(); //1FLOP

        //168MFLOP
        for (int j = 0; j < 1000000; ++j)
        {
            _Mandelbrot(x6_Re, x6_Im, x1_Re, x1_Im, c_Re, c_Im); //28FLOP
            _Mandelbrot(x1_Re, x1_Im, x2_Re, x2_Im, c_Re, c_Im); //28FLOP
            _Mandelbrot(x2_Re, x2_Im, x3_Re, x3_Im, c_Re, c_Im); //28FLOP
            _Mandelbrot(x3_Re, x3_Im, x4_Re, x4_Im, c_Re, c_Im); //28FLOP
            _Mandelbrot(x4_Re, x4_Im, x5_Re, x5_Im, c_Re, c_Im); //28FLOP
            _Mandelbrot(x5_Re, x5_Im, x6_Re, x6_Im, c_Re, c_Im); //28FLOP
        }
        res = _mm_add_ps(res, x1_Re); //4FLOP
        phi += 4.0f * dphi; //2FLOP
    }
    const auto endTime = std::chrono::high_resolution_clock::now();

    // Prevent dead code removal: fold all four lanes (valid indices are 0..3)
    // into a volatile store. The lanes are extracted with a store instead of
    // the MSVC-only m128_f32 union member, and the reported figure no longer
    // depends on the computed values.
    float lanes[4];
    _mm_storeu_ps(lanes, res);
    volatile float sink = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    (void)sink;

    // Floating-point seconds: no truncation of the elapsed time to whole
    // milliseconds before dividing.
    const float seconds = std::chrono::duration<float>(endTime - startTime).count();
    return 168.0f / seconds;
}
// Entry point: runs the benchmark once and prints the value it returns,
// followed by the "GFLOP/s" unit label.
int main()
{
    const float gflops = Mandelbrot();
    std::cout << gflops;
    std::cout << "GFLOP/s" << std::endl;
    return 0;
}
核心函数_Mandelbrot执行4 * _mm_mul_ps + 2 * _mm_add_ps + 1 * _mm_sub_ps,每条指令一次处理4个浮点数,因此7 * 4FLOP = 28FLOP。
我运行的CPU是2.66GHz的Intel Core2Quad Q9450。我在Windows 7下使用Visual Studio 2012编译了代码。理论峰值FLOPS应该是4 * 2.66GHz = 10.64GFLOPS。但是程序返回18.4GFLOPS,我无法找出问题所在。有人可以告诉我吗?
最佳答案
根据Intel® Intrinsics Guide,对于您的CPUID 06_17,_mm_mul_ps、_mm_add_ps、_mm_sub_ps的Throughput=1(如您所述)。
在不同的资料中,我看到吞吐量(Throughput)有不同的含义:有些地方指clock/instruction,有些地方则相反,指instruction/clock(当然,既然这里的值是1,两种含义没有区别)。
根据"Intel® 64 and IA-32 Architectures Optimization Reference Manual",Throughput的定义是:
根据“C.3.2表脚注”:
即加/减法和乘法在不同的执行单元上执行。FP_ADD和FP_MUL执行单元连接到不同的调度端口(见图片底部):
调度程序可以在每个周期将指令调度到多个端口。
乘法和加法执行单元可以并行执行操作。因此,处理器核心上的理论GFLOPS为:
sse_packet_size = 4
instructions_per_cycle = 2
clock_rate_ghz = 2.66
sse_packet_size * instructions_per_cycle * clock_rate_ghz = 21.28GFLOPS
因此,您的18.4GFLOPS接近理论峰值。
_Mandelbrot函数中有3条FP_ADD指令(2次_mm_add_ps和1次_mm_sub_ps)和4条FP_MUL指令(_mm_mul_ps)。如您所见,函数内部有许多数据依赖,因此单次调用内的指令无法高效地交错执行。也就是说,为了给FP_ADD提供可执行的操作,FP_MUL必须先完成至少两次操作,以产生FP_ADD所需的操作数。但幸运的是,您的内层for循环中有许多互不依赖的操作:
for (int j = 0; j < 1000000; ++j)
{
_Mandelbrot(x6_Re, x6_Im, x1_Re, x1_Im, c_Re, c_Im); // 1
_Mandelbrot(x1_Re, x1_Im, x2_Re, x2_Im, c_Re, c_Im); // 2
_Mandelbrot(x2_Re, x2_Im, x3_Re, x3_Im, c_Re, c_Im); // 3
_Mandelbrot(x3_Re, x3_Im, x4_Re, x4_Im, c_Re, c_Im); // 4
_Mandelbrot(x4_Re, x4_Im, x5_Re, x5_Im, c_Re, c_Im); // 5
_Mandelbrot(x5_Re, x5_Im, x6_Re, x6_Im, c_Re, c_Im); // 6
}
只有第6个操作依赖于第1个操作的输出。其他所有操作的指令都可以(由编译器和处理器)自由地相互交错,这会使FP_ADD和FP_MUL单元保持繁忙。
附言:仅出于测试目的,您可以尝试把_Mandelbrot函数中所有的add/sub操作替换为mul(或者反过来),这样您将只能得到当前FLOPS的大约一半。