My 3D Laplacian solver works, and I get 350 Gflop/s out of it. I am trying to upgrade it by splitting the work across more blocks, so that each block performs fewer operations, hoping for better performance.
However, performance is still 350 Gflop/s:
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include"res3dcb.cuh"
#include <math.h>
using namespace std;
// Constants.
const int blocksize = 32;
const int N = 128;
const int size = (N+2)*(N+2)*(N+2)*sizeof(float); // buffer size in bytes
// Let's start the main program.
int main(void) {
    // Variable declarations.
    float time1, time2, time3;
    float *x_d, *y_d;
    float *x, *y;
    float gflops;
    float NumOps;
    int power = 4; // You can change power as you prefer (but keep it a power of 2)

    // Init x and y. Note: new[] takes an element count, not a byte count.
    x = new float[(N+2)*(N+2)*(N+2)];
    y = new float[(N+2)*(N+2)*(N+2)];
    for (int k=1; k<N+1; k++)
        for (int i=1; i<N+1; i++)
            for (int j=1; j<N+1; j++)
                x[k*(N+2)*(N+2)+i*(N+2)+j] = cos(i+j+k);
    // Shadow (ghost) cells: copy interior values into the boundary layers.
    for (int k=1; k<N+1; k++) {
        for (int i=1; i<N+1; i++) {
            x[k*(N+2)*(N+2)+i*(N+2)]     = x[k*(N+2)*(N+2)+i*(N+2)+1];
            x[k*(N+2)*(N+2)+i*(N+2)+N+1] = x[k*(N+2)*(N+2)+i*(N+2)+N];
        }
        for (int j=0; j<N+2; j++) {
            x[k*(N+2)*(N+2)+j]             = x[k*(N+2)*(N+2)+(N+2)+j];
            x[k*(N+2)*(N+2)+(N+1)*(N+2)+j] = x[k*(N+2)*(N+2)+N*(N+2)+j];
        }
    }
    for (int i=0; i<N+2; i++)
        for (int j=0; j<N+2; j++) {
            x[(N+2)*i+j]                   = x[(N+2)*(N+2)+(N+2)*i+j];
            x[(N+1)*(N+2)*(N+2)+(N+2)*i+j] = x[(N+2)*(N+2)*N+(N+2)*i+j];
        }
    // Display of the initial matrix.
    int id_stage = -2;
    while (id_stage != -1) {
        cout << "Which stage do you want to display? (-1 if you don't want to display another one)" << endl;
        cin >> id_stage;
        cout << endl;
        if (id_stage != -1) {
            cout << "Stage " << id_stage << " of the cube:" << endl;
            for (int i=0; i<N+2; i++) {
                cout << "| ";
                for (int j=0; j<N+2; j++) { cout << x[id_stage*(N+2)*(N+2)+i*(N+2)+j] << " "; }
                cout << "|" << endl;
            }
            cout << endl;
        }
    }
    // CPU to GPU.
    cudaMalloc((void**)&x_d, size);
    cudaMalloc((void**)&y_d, size);
    cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice);
    cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice);

    // Solver parameters.
    dim3 dimGrid(power*N/blocksize, power*N/blocksize);
    dim3 dimBlock(blocksize, blocksize);

    // Solver loop.
    time1=clock();
    res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
    time2=clock();
    time3=(time2-time1)/CLOCKS_PER_SEC;

    // Performance calculation: 7 floating-point operations per interior grid point.
    NumOps = (1.0e-9)*N*N*N*7;
    gflops = NumOps / time3;

    // GPU to CPU.
    cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
    cudaFree(x_d);
    cudaFree(y_d);
    // Display of the final matrix.
    id_stage = -2;
    while (id_stage != -1) {
        cout << "Which stage do you want to display? (-1 if you don't want to display another one)" << endl;
        cin >> id_stage;
        cout << endl;
        if (id_stage != -1) {
            cout << "Stage " << id_stage << " of the cube:" << endl;
            for (int i=0; i<N+2; i++) {
                cout << "| ";
                for (int j=0; j<N+2; j++) { cout << y[id_stage*(N+2)*(N+2)+i*(N+2)+j] << " "; }
                cout << "|" << endl;
            }
            cout << endl;
        }
    }
    cout << "Time : " << time3 << endl;
    cout << "Gflops/s : " << gflops << endl;
    delete[] x;
    delete[] y;
}
where:
__global__ void res2d(volatile float* x, float* y, int N, int power)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    int j = threadIdx.y + blockIdx.y*blockDim.y;
    int id, jd;
    #pragma unroll // Now let's reduce the number of operations per block
    for (int incr=1; incr<power+1; incr++) {
        if (i>(incr-1)*N && i<incr*N && j>(incr-1)*N && j<incr*N) {
            #pragma unroll
            for (int k=(incr-1)*(N/power); k<incr*N/power; k++) {
                id = i-(incr-1)*N;
                jd = j-(incr-1)*N;
                y[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1] = x[(N+2)*(N+2)*(k+1)+(N+2)*(id+2)+jd+1]
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*id+jd+1]
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+2]
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd]
                                                       + x[(N+2)*(N+2)*(k+2)+(N+2)*(id+1)+jd+1]
                                                       + x[(N+2)*(N+2)*k+(N+2)*(id+1)+jd+1]
                                                       - 6*x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1];
            }
        }
    }
}
with launch parameters:
dimGrid(power*N/blocksize, power*N/blocksize) and dimBlock(blocksize, blocksize)
(with N = 128, blocksize = 32 and power = 4, this is a 16×16 grid of 32×32 blocks, i.e. 512×512 threads).
Question: if power = 2, 4, or 8, the number of operations per block is divided by 2, 4, or 8, yet it is no faster. Why? Thanks in advance for your help.
Best answer
CUDA kernel launches are asynchronous. When you do this:
// Solver loop.
time1=clock();
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;
the timer only captures the API launch latency, not the actual execution time of the kernel. That is why changing the amount of work done inside the kernel apparently has no effect on performance: your timing method is broken.
Do something like this instead:
// Solver loop.
time1=clock();
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
cudaDeviceSynchronize();
time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;
This inserts a blocking call that guarantees the kernel has finished executing before the elapsed time is measured.
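Alternatively, the same measurement can be made with CUDA events, which time the kernel on the device itself and sidestep the host-side clock() resolution entirely. A minimal sketch, reusing the variables from the code above:

// Time the kernel with CUDA events for a device-side measurement.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);                           // enqueued before the kernel
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
cudaEventRecord(stop);                            // enqueued after the kernel
cudaEventSynchronize(stop);                       // block until the stop event has completed
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);   // elapsed time in milliseconds
time3 = elapsed_ms / 1000.0f;                     // convert to seconds for the Gflop/s formula
cudaEventDestroy(start);
cudaEventDestroy(stop);

Note that cudaEventElapsedTime reports milliseconds, so the result is divided by 1000 to keep the existing Gflop/s calculation unchanged.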
[This answer has been added as a community wiki entry to get the question off the unanswered list.]