我编写了一个简单的程序,使用SSE内在函数(intrinsics)来计算两个大向量(100000个或更多元素)的内积。该程序会比较常规方法与使用内在函数的方法计算内积的执行时间。一切都工作正常,直到我(只是为了好玩)在计算内积的语句之前插入了一个内层循环。在继续之前,先给出代码:
//this is a sample Intrinsics program to compute inner product of two vectors and compare Intrinsics with traditional method of doing things.
#include <iostream>
#include <iomanip>
#include <xmmintrin.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
// Makes std names (cout, endl, ...) usable without the std:: prefix throughout this file.
using namespace std;
// GCC vector-extension type: 16 bytes wide, i.e. four packed floats.
typedef float v4sf __attribute__ ((vector_size(16)));
// Scalar reference version of the benchmark kernel.
//
// For every index i the product arr1[i] * arr2[i] is added to the accumulator
// len1 times, so the value returned is len1 * dot(arr1, arr2), accumulated in
// a variable declared as float and widened to double only on return.
//
// arr1, arr2: input arrays, each holding at least len1 elements.
// len1:       number of elements processed (and the repeat count per element).
// len2:       never read; callers are assumed to pass len1 == len2.
double innerProduct(float* arr1, int len1, float* arr2, int len2) {
    (void)len2;  // intentionally unused

    float total = 0.0;
    int idx = 0;
    while (idx < len1) {
        // Deliberate repeat loop: the same product is accumulated len1 times.
        int repeat = 0;
        while (repeat < len1) {
            total += arr1[idx] * arr2[idx];
            ++repeat;
        }
        ++idx;
    }
    return total;
}
// SSE version of the benchmark kernel.
//
// Mirrors innerProduct(): for every index i the product arr1[i] * arr2[i] is
// accumulated len1 times, so the result is len1 * dot(arr1, arr2). Elements
// are processed four at a time with packed single-precision SSE operations;
// any leftover elements (len1 not a multiple of 4) are handled by a scalar
// tail loop so that no input is silently ignored.
//
// arr1, arr2: input arrays of len1 floats each (no alignment requirement,
//             unaligned loads are used).
// len1, len2: element counts; the program prints a message and exits with
//             status 1 if they differ.
//
// Side effect: prints the four per-lane partial sums to std::cout.
//
// NOTE: all accumulation is done in single precision. Once a lane's running
// sum is large enough that one more product is below half the spacing between
// adjacent floats at that magnitude, further additions no longer change it.
double sse_v4sf_innerProduct(float* arr1, int len1, float* arr2, int len2) {
    if (len1 != len2) {
        std::cout << "Lengths not equal." << std::endl;
        exit(1);
    }

    // Four independent running sums, one per SSE lane, starting at zero.
    __m128 laneSums = _mm_setzero_ps();

    int i = 0;
    for (; i < (len1 - 3); i += 4) {
        for (int j = 0; j < len1; j++) {
            const __m128 lhs = _mm_loadu_ps(&arr1[i]);
            const __m128 rhs = _mm_loadu_ps(&arr2[i]);
            laneSums = _mm_add_ps(_mm_mul_ps(lhs, rhs), laneSums);
        }
    }

    // Spill the lanes to memory so they can be printed and summed.
    float temp[4];
    _mm_storeu_ps(temp, laneSums);
    std::cout << "Values of temp:" << std::endl;
    for (int k = 0; k < 4; k++)
        std::cout << temp[k] << std::endl;

    float result = 0.0;
    result += temp[0] + temp[1] + temp[2] + temp[3];

    // Scalar tail: the up-to-three elements the 4-wide loop could not cover.
    for (; i < len1; i++) {
        for (int j = 0; j < len1; j++) {
            result += arr1[i] * arr2[i];
        }
    }
    return result;
}
// Benchmark driver.
//
// Allocates two float arrays of `length` elements, fills both with 2.5, then
// times innerProduct() and sse_v4sf_innerProduct() on them with clock() and
// prints the elapsed CPU time and the value returned by each.
int main() {
    clock_t begin, end;
    int length = 100000;
    double result_Conventional, result_Intrinsic;

    float* arr1 = new float[length];
    float* arr2 = new float[length];

    // Seed for the random-fill variant shown in the loop below (currently disabled).
    srand(time(NULL));

    begin = clock();
    for (int i = 0; i < length; i++) {
        // Random-fill alternative:
        //   arr1[i] = rand() % 10 + 1;
        //   arr2[i] = rand() % 10 - 1;
        arr1[i] = 2.5;
        arr2[i] = 2.5;
    }
    end = clock();
    std::cout << "Time to initialize array1 and array2 = " << ((double) (end - begin)) / CLOCKS_PER_SEC << std::endl;

    begin = clock();
    result_Conventional = innerProduct(arr1, length, arr2, length);
    end = clock();
    std::cout << "Time to compute inner product conventionally = " << ((double) (end - begin)) / CLOCKS_PER_SEC << std::endl;

    begin = clock();
    result_Intrinsic = sse_v4sf_innerProduct(arr1, length, arr2, length);
    end = clock();
    std::cout << "Time to compute inner product with intrinsics = " << ((double) (end - begin)) / CLOCKS_PER_SEC << std::endl;

    std::cout << "Results: " << std::endl;
    std::cout << " result_Conventional = " << result_Conventional << std::endl;
    std::cout << " result_Intrinsics = " << result_Intrinsic << std::endl;

    // Release the buffers allocated with new[] above.
    delete[] arr1;
    delete[] arr2;
    return 0;
}
我使用以下g++调用来构建它:
g++ -W -Wall -O2 -pedantic -march=i386 -msse intrinsics_SSE_innerProduct.C -o innerProduct
在这两个函数中,上面的循环各自总共运行N^2次。不过,假定arr1和arr2(两个浮点向量)的每个元素都是2.5,且数组长度为100,000,那么两种情况下的结果都应为6.25e+10。我得到的结果是:
结果:
result_Conventional = 6.25e+10
result_Intrinsics = 5.36871e+08
这还不是全部。使用内在函数的那个函数返回的值似乎在上述数值处“饱和”了。我也尝试过给数组元素换用其他值、给数组换用其他长度。但似乎只要数组元素大于1.0且数组长度大于1000,得到的都是上面看到的同一个值。
起初,我以为这可能是因为SSE中的所有运算都是以单精度浮点数(float)进行的,但float应该能够存储e+08量级的数字。
我正在尝试找出可能出问题的地方,但似乎无法弄清楚。我正在使用g++版本:g++(GCC)4.4.1 20090725(Red Hat 4.4.1-2)。
欢迎对此提供任何帮助。
谢谢,
斯里拉姆
最佳答案
您遇到的问题是,虽然float可以存储6.25e+10,但是它的精度只有几位有效数字(约7位十进制有效数字)。
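这意味着,当您通过逐个累加许多小数来得到一个大数时,总会到达这样一个点:小数比大数的最低有效位所代表的量还要小,此时再把它加上去就不起作用了。以本例来说,每个SSE通道的累加和到达2^27=134217728之后,相邻两个float之间的间隔是16,而每次加上的6.25不足该间隔的一半,舍入后累加和保持不变;四个通道相加即2^29≈5.36871e+08,正是观察到的“饱和”值。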
至于为什么在非内在函数版本中没有出现这种行为,很可能是因为result变量被保存在寄存器中,而该寄存器的精度高于float的实际存储精度,因此它不会在循环的每次迭代中被截断为float的精度。要确定这一点,您需要查看生成的汇编代码。
关于“g++ SSE内在函数困境——内在函数返回的值‘饱和’”,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/2947550/