我的3D拉普拉斯求解器可以工作。我获得了350 Gflop / s的功率,我正在尝试对其进行升级,使其具有两倍于两倍的数据块,从而具有更好的性能。
但是,性能仍为350 Gflop / s:

 #include <iostream>
 #include <sys/time.h>
 #include <cuda.h>
 #include <ctime>
 #include"res3dcb.cuh"
 #include <math.h>
 using namespace std;

 // Constant statement.
 const int blocksize=32;
 const int N=128;
 const int size=(N+2)*(N+2)*(N+2)*sizeof(float);

 // Let's start the main program.
 int main(void) {

 // Variable statement.
 float time1,time2,time3;
 float *x_d, *y_d;
 float *x,*y;
 float gflops;
 float NumOps;
 int power=4; // You can change power as you prefer (but keep 2^x)

 // Init x and y.
 x = new float[size];
 y = new float[size];

 for (int k=1;k<N+1;k++)
    for (int i=1;i<N+1;i++)
        for (int j=1;j<N+1;j++) {
            x[k*(N+2)*(N+2)+i*(N+2)+j]=cos(i+j+k);
        }

 // Shadow cases.
 for (int k=1;k<N+1;k++) {
    for (int i=1;i<N+1;i++) {
      x[k*(N+2)*(N+2)+i*(N+2)]=x[k*(N+2)*(N+2)+i*(N+2)+1];
      x[k*(N+2)*(N+2)+i*(N+2)+N+1]=x[k*(N+2)*(N+2)+i*(N+2)+N];}

    for (int j=0;j<N+2;j++) {
      x[k*(N+2)*(N+2)+j]=x[k*(N+2)*(N+2)+(N+2)+j];
      x[k*(N+2)*(N+2)+(N+1)*(N+2)+j]=x[k*(N+2)*(N+2)+N*(N+2)+j];}

 for (int i=0;i<N+2;i++)
    for (int j=0;j<N+2;j++) {
        x[(N+2)*i+j]=x[(N+2)*(N+2)+(N+2)*i+j];
        x[(N+1)*(N+2)*(N+2)+(N+2)*i+j]=x[(N+2)*(N+2)*N+(N+2)*i+j];
    }

 // Display of initial matrix.
 int id_stage=-2;
 while (id_stage!=-1) {
     cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
     cin>>id_stage;
     cout<<endl;

     if (id_stage != -1) {
    cout<<"Etage "<<id_stage<<" du cube :"<<endl;
    for (int i=0;i<N+2;i++) {
        cout<<"| ";
        for (int j=0;j<N+2;j++) {cout<<x[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
        cout<<"|"<<endl;
        }
         cout<<endl;
     }
 }

 // CPU to GPU.
 cudaMalloc( (void**) & x_d, size);
 cudaMalloc( (void**) & y_d, size);

 cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
 cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;

 // Solver parameters.
 dim3 dimGrid(power*N/blocksize, power*N/blocksize);
 dim3 dimBlock(blocksize, blocksize);

 // Solver loop.
 time1=clock();

 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);

 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

 // Power calculation.
 NumOps=(1.0e-9)*N*N*N*7;
 gflops = ( NumOps / (time3));

 // GPU to CPU.
 cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
 cudaFree(x_d);
 cudaFree(y_d);

 // Display of final matrix.
 id_stage=-2;
 while (id_stage!=-1) {
    cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
    cin>>id_stage;
    cout<<endl;

     if (id_stage != -1) {
        cout<<"Etage "<<id_stage<<" du cube :"<<endl;
        for (int i=0;i<N+2;i++) {
            cout<<"| ";
            for (int j=0;j<N+2;j++) {cout<<y[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
            cout<<"|"<<endl;
         }
        cout<<endl;
     }
 }

 cout<<"Time : "<<time3<<endl;
 cout<<"Gflops/s : "<<gflops<<endl;

 }

哪里:
__ global__ void res2d(volatile float* x, float* y, int N, int power)
{
    int i = threadIdx.x + blockIdx.x*(blockDim.x);
    int j = threadIdx.y + blockIdx.y*(blockDim.y);
    int id,jd;

    #pragma unroll //Now let's recude the number of operation per block
    for (int incr=1; incr<power+1; incr++) {
        if (i>(incr-1)*N && i<incr*N && j>(incr-1)*N && j<incr*N) {
            #pragma unroll
            for (int k=(incr-1)*(N/power) ; k<incr*N/power ; k++) {
                id=i-(incr-1)*N;
                jd=j-(incr-1)*N;
                y[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1] = x[(N+2)*(N+2)*(k+1)+(N+2)*(id+2)+jd+1]
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*id+jd+1]
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+2]
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd]
                                                       + x[(N+2)*(N+2)*(k+2)+(N+2)*(id+1)+jd+1]
                                                       + x[(N+2)*(N+2)*k+(N+2)*(id+1)+jd+1]
                                                       - 6*x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1];
            }
        }
    }
}

带有参数:
dimGrid(power * N/blocksize, power * N/blocksize) & dimBlock(blocksize, blocksize)

问题:
  • 如果power = 248,则每个块的操作数除以248。但这并不快。为什么?
  • 减少每个块的操作数没用吗?

  • 在此先感谢您的帮助。

    最佳答案

    CUDA内核启动是异步的。执行此操作时:

     // Solver loop.
     time1=clock();
    
     res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
    
     time2=clock();
     time3=(time2-time1)/CLOCKS_PER_SEC;
    

    计时器仅捕获API启动延迟,而不捕获代码的实际执行时间。这就是为什么更改内核中完成的工作量显然对性能没有影响的原因-您的计时方法不正确。

    做这样的事情:
     // Solver loop.
     time1=clock();
    
     res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
     cudaDeviceSynchronize();
    
     time2=clock();
     time3=(time2-time1)/CLOCKS_PER_SEC;
    

    这将插入一个阻塞调用,该调用将确保内核在测量时间之前完成执行。

    [此答案已添加为社区Wiki条目,以解决未解决的问题]。

    10-01 18:45