本文介绍了面向NVIDIA Kepler架构的ILP的效率和性能的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

引用NVIDIA提供的"Kepler Tuning Guide":

Quoting the "Kepler Tuning Guide" provided by NVIDIA:

我认为以下代码段

a = .....;
a2 = f(a); 
a3 = g(a2);  

可以进行如下改进

a = ...;
b = ....;
a2 = f(a);
b2 = f(b);
a3 = g(a2);
b3 = g(b2);

所以在我的项目中,我有一段代码如下(示例1)

So in my projects, I have a section of code as follows (example 1)

if(x < src.cols && y < src.rows)
{
    if(!mask(y,x))
    {
        src.ptr(y)[x] = make_short4(0,0,0,0);
    }
}

我将其重写如下(示例2)

and I rewrite it as follows (example2)

if(x < src.cols && y < src.rows)
{
    if(!mask(y,x))
    {
        short4 t;
        t.x = 0;
        t.y = 0;
        t.z = 0;
        t.w = 0;
        src.ptr(y)[x].x = t.x;
        src.ptr(y)[x].y = t.y;
        src.ptr(y)[x].z = t.z;
        src.ptr(y)[x].w = t.w;  
     }
}

在开普勒架构中,example2比example1效率更高,表现出更好的性能,对吗?

In the Kepler architecture, the example2 will be more efficient and exhibit better performance than example1, is that right?

推荐答案

关于指令级并行性(ILP)的很好的解释,可以在《CUDA性能:最大化指令级并行度》一文中找到。

A good explanation on Instruction Level Parallelism (ILP) can be found at CUDA Performance: Maximizing Instruction-Level Parallelism.

Robert Crovella和talonmies指出了这一点,并且您自己也认识到,您的上述示例未达到ILP.

It has been pointed out by Robert Crovella and talonmies, and it has been recognized by yourself, that your example above does not reach ILP.

关于如何实现ILP,我在下面给出一个经典示例,它由 numbapro-examples 中的 PyCUDA 代码翻译而来,我已分别在 Fermi 和 Kepler GPU 上进行了测试。请注意,在后者(Kepler)上我没有观察到明显的加速。

Concerning how implementing ILP, I'm showing below the classical example, translated from the PyCUDA code at numbapro-examples, which I have tested for a Fermi and for a Kepler GPU. Please, notice that for the latter case I have not observed relevant speedups.

代码

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BLOCKSIZE 64

/*******************/
/* iDivUp FUNCTION */
/*******************/
// Ceiling division: smallest number of size-b chunks needed to cover a.
// Used to compute grid sizes from a problem size and a block size.
int iDivUp(int a, int b){
    int quotient = a / b;
    if ((a % b) != 0) ++quotient;   // round up when b does not divide a evenly
    return quotient;
}

/********************/
/* CUDA ERROR CHECK */
/********************/
// Wrap every CUDA runtime call: reports the human-readable error string with
// the source location and (by default) terminates the program.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// 'file' is const char*: __FILE__ expands to a string literal, and binding a
// string literal to a non-const char* is ill-formed in standard C++.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
// Baseline elementwise add: one element per thread, a single dependent
// load/add/store chain, so no instruction-level parallelism is exposed.
// NOTE(review): no bounds guard — the launch must cover exactly the array
// length (the host code sizes the grid with iDivUp and assumes divisibility).
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    d_c[idx] = d_a[idx] + d_b[idx];

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
// Each thread handles 2 elements spaced one grid-width apart, creating two
// independent load/add/store chains the scheduler can overlap (ILP x2).
// NOTE(review): no bounds guard — the host launches N/2 threads and assumes
// 2 * gridDim.x * blockDim.x == N.
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {

    const int stride = gridDim.x * blockDim.x;
    const int base   = threadIdx.x + blockIdx.x * blockDim.x;

    // --- Issue all loads first: the two pairs are independent, so their
    //     latencies can overlap.
    float a0 = d_a[base];
    float b0 = d_b[base];
    float a1 = d_a[base + stride];
    float b1 = d_b[base + stride];

    // --- Independent additions
    float sum0 = a0 + a1 * 0.0f + b0;   // (no-op folded out by compiler)
    float sum1 = a1 + b1;

    // --- Stores
    d_c[base]          = sum0;
    d_c[base + stride] = sum1;

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
// Each thread handles 4 elements spaced one grid-width apart, exposing four
// independent load/add/store chains (ILP x4). Register arrays plus
// #pragma unroll reproduce the fully unrolled form of the original.
// NOTE(review): no bounds guard — host assumes 4 * gridDim.x * blockDim.x == N.
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {

    const int stride = gridDim.x * blockDim.x;
    const int base   = threadIdx.x + blockIdx.x * blockDim.x;

    // --- Load phase: 8 independent global reads
    float a[4], b[4];
#pragma unroll
    for (int u = 0; u < 4; ++u) {
        a[u] = d_a[base + u * stride];
        b[u] = d_b[base + u * stride];
    }

    // --- Compute phase: 4 independent additions
    float c[4];
#pragma unroll
    for (int u = 0; u < 4; ++u)
        c[u] = a[u] + b[u];

    // --- Store phase
#pragma unroll
    for (int u = 0; u < 4; ++u)
        d_c[base + u * stride] = c[u];

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
// Each thread handles 8 elements spaced one grid-width apart, exposing eight
// independent load/add/store chains (ILP x8). Register arrays plus
// #pragma unroll reproduce the fully unrolled form of the original.
// NOTE(review): no bounds guard — host assumes 8 * gridDim.x * blockDim.x == N.
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {

    const int stride = gridDim.x * blockDim.x;
    const int base   = threadIdx.x + blockIdx.x * blockDim.x;

    // --- Load phase: 16 independent global reads
    float a[8], b[8];
#pragma unroll
    for (int u = 0; u < 8; ++u) {
        a[u] = d_a[base + u * stride];
        b[u] = d_b[base + u * stride];
    }

    // --- Compute phase: 8 independent additions
    float c[8];
#pragma unroll
    for (int u = 0; u < 8; ++u)
        c[u] = a[u] + b[u];

    // --- Store phase
#pragma unroll
    for (int u = 0; u < 8; ++u)
        d_c[base + u * stride] = c[u];

}

/********/
/* MAIN */
/********/

// --- Compares device output against the CPU reference element by element.
//     Exact float equality is intentional: host and device perform the same
//     single IEEE single-precision addition on identical inputs.
static bool checkResults(const float* c, const float* c_ref, int n) {
    for (int i = 0; i < n; i++) {
        if (c[i] != c_ref[i]) {
            printf("%f %f\n", c[i], c_ref[i]);
            printf("Error!\n");
            return false;
        }
    }
    printf("Test passed!\n");
    return true;
}

// Benchmarks four versions of an elementwise vector add (ILP factors 1/2/4/8)
// and verifies each result against a CPU reference.
// Returns 0 on success, 1 on any mismatch.
int main() {   // --- FIX: 'void main' is ill-formed C++; main must return int

    float timing;
    cudaEvent_t start, stop;

    const int N = 65536*4; // --- ASSUMPTION: N is divisible by BLOCKSIZE and by 8 (for ILP8)

    float* a     = (float*)malloc(N*sizeof(float));
    float* b     = (float*)malloc(N*sizeof(float));
    float* c     = (float*)malloc(N*sizeof(float));
    float* c_ref = (float*)malloc(N*sizeof(float));

    srand(time(NULL));
    for (int i=0; i<N; i++) {

        // --- FIX: rand() / RAND_MAX is INTEGER division (almost always 0);
        //     cast to float to obtain test values in [0, 1].
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
        c_ref[i] = a[i] + b[i];

    }

    float* d_a;  gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(float)));
    float* d_b;  gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(float)));
    float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
    float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
    float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
    float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));

    gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /******************/
    /* ILP0 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
    gpuErrchk(cudaPeekAtLastError());       // catches launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());     // catches asynchronous execution errors
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP0:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));
    if (!checkResults(c, c_ref, N)) return 1;

    /******************/
    /* ILP2 TEST CASE */
    /******************/
    // --- Each thread now covers 2 elements, so launch N/2 threads.
    cudaEventRecord(start, 0);
    ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP2:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));
    if (!checkResults(c, c_ref, N)) return 1;

    /******************/
    /* ILP4 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP4:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));
    if (!checkResults(c, c_ref, N)) return 1;

    /******************/
    /* ILP8 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP8:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));
    if (!checkResults(c, c_ref, N)) return 1;

    // --- Cleanup (the original leaked events and all host/device buffers)
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    gpuErrchk(cudaFree(d_a));  gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_c0)); gpuErrchk(cudaFree(d_c2));
    gpuErrchk(cudaFree(d_c4)); gpuErrchk(cudaFree(d_c8));
    free(a); free(b); free(c); free(c_ref);

    return 0;

}

性能

Card                    Kernel          Time [ms]            Speedup
GeForce GT540M          ILP0            4.609                1
      "                 ILP2            2.666                1.72
      "                 ILP4            1.675                2.76
      "                 ILP8            1.477                3.12

Kepler K20c             ILP0            0.045                
      "                 ILP2            0.043                
      "                 ILP4            0.043                
      "                 ILP8            0.042                

这篇关于面向NVIDIA Kepler架构的ILP的效率和性能的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!

09-22 16:45