CUDA中使用多维数组

今天想起一个问题：看到的绝大多数CUDA代码使用的都是一维数组，那么是否可以在CUDA中使用多维数组呢？为此尝试了各种办法，一直被错误状态码77（在较早版本的CUDA中即cudaErrorIllegalAddress，非法内存访问）和段错误折磨，最后发现有一个cudaMallocManaged函数，这个函数可以很好地组织多维数组的多重指针形式。

后来发现，这个问题之前在Stack Overflow上就有很好的解答。先贴一下我自己的代码实现：

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include <stdio.h>

 // Number of elements in each input vector and in each row of the two-row
 // output table.
 // NOTE(review): the literal was missing from the published listing; 5 is
 // restored from the five-element vectors printed by main() -- confirm.
 const int arraySize = 5;

 // Adds two `size`-element host vectors on the GPU and stores the sums in `c`.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);

 // Writes the element-wise sum a + b into both rows of the two-row table c.
 //
 // Indexing uses threadIdx.x only, so a single block is expected: threads
 // [0, arraySize) fill c[0] and threads [arraySize, 2 * arraySize) fill c[1].
 // c must hold two row pointers that are readable on the device, each
 // addressing at least arraySize ints; a and b must each hold at least
 // arraySize ints.
 // NOTE(review): the row indices 0 and 1 were missing from the published
 // listing and are restored from the host-side row setup -- confirm.
 __global__ void addKernel(int **c, const int *a, const int *b)
 {
     int i = threadIdx.x;

     if (i < arraySize)
         c[0][i] = a[i] + b[i];
     else if (i < 2 * arraySize)  // surplus threads must not write past row 1
         c[1][i - arraySize] = a[i - arraySize] + b[i - arraySize];
 }

 // Adds two fixed five-element vectors on the GPU and prints the result.
 // Returns 0 on success and 1 if any CUDA step fails.
 // NOTE(review): the numeric literals were missing from the published
 // listing; the values below are restored from the printf text -- confirm.
 int main()
 {
     const int a[arraySize] = { 1, 2, 3, 4, 5 };
     const int b[arraySize] = { 10, 20, 30, 40, 50 };
     int c[arraySize] = { 0 };

     // Add vectors in parallel.
     cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }

     printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
         c[0], c[1], c[2], c[3], c[4]);

     // cudaDeviceReset must be called before exiting in order for profiling and
     // tracing tools such as Nsight and Visual Profiler to show complete traces.
     // It replaces the deprecated cudaThreadExit.
     cudaStatus = cudaDeviceReset();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceReset failed!");
         return 1;
     }

     return 0;
 }

 // Helper function for using CUDA to add vectors in parallel.
 //
 // Copies the `size`-element host vectors a and b to the device, launches
 // addKernel with one thread per element of a two-row result table, and
 // copies row 0 of that table into the host buffer c (both rows receive the
 // same sums). Returns cudaSuccess, or the status of the first CUDA call
 // that failed. Every device allocation is released before returning.
 //
 // addKernel splits its threads by the file-level arraySize, so `size` is
 // expected to equal arraySize.
 // NOTE(review): the numeric literals (device 0, two rows, one block, row
 // indices) were missing from the published listing and are restored from
 // context -- confirm.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
 {
     const size_t rowCount = 2;  // rows in the two-dimensional result table
     int *dev_a = NULL;
     int *dev_b = NULL;
     int *dev_c0 = NULL;         // flat storage backing both result rows
     int **dev_c = NULL;         // managed table of row pointers into dev_c0
     cudaError_t cudaStatus;

     // Choose which GPU to run on, change this on a multi-GPU system.
     cudaStatus = cudaSetDevice(0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
         goto Error;
     }

     // The row-pointer table lives in managed memory so that the host can
     // fill it in below and the kernel can dereference it.
     cudaStatus = cudaMallocManaged(&dev_c, rowCount * sizeof(int*));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMallocManaged failed!");
         goto Error;
     }

     // One contiguous device buffer holds every row of the result.
     cudaStatus = cudaMalloc((void**)&dev_c0, size * sizeof(int) * rowCount);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Point each row at its slice of the flat buffer.
     dev_c[0] = dev_c0;
     dev_c[1] = dev_c0 + size;

     // Allocate GPU buffers for the two input vectors.
     cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Copy input vectors from host memory to GPU buffers.
     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     // Launch a kernel on the GPU with one thread for each output element.
     addKernel<<<1, (unsigned int)(size * rowCount)>>>(dev_c, dev_a, dev_b);

     // A bad launch configuration is only reported through cudaGetLastError.
     cudaStatus = cudaGetLastError();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
         goto Error;
     }

     // cudaDeviceSynchronize waits for the kernel to finish, and returns
     // any errors encountered during execution. It replaces the deprecated
     // cudaThreadSynchronize.
     cudaStatus = cudaDeviceSynchronize();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
         goto Error;
     }

     // Copy output vector from GPU buffer to host memory.
     cudaStatus = cudaMemcpy(c, dev_c[0], size * sizeof(int), cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

 Error:
     // Pointers that were never allocated are still NULL here.
     cudaFree(dev_c0);
     cudaFree(dev_c);
     cudaFree(dev_a);
     cudaFree(dev_b);

     return cudaStatus;
 }

有了别人写得很好的代码，我就不再献丑了。下面贴一下Stack Overflow上的代码，非常直观易懂：

 //https://stackoverflow.com/questions/40388242/multidimensional-array-allocation-with-cuda-unified-memory-on-power-8

 #include <iostream>

 #include <assert.h>

 // Builds an a x b x c x d array in CUDA managed memory that can be indexed
 // as ary[i][j][k][l].
 //
 // Two allocations are made: one contiguous block of a*b*c*d elements, and
 // one pointer table laid out as
 //   [ a level-1 pointers | a*b level-2 pointers | a*b*c level-3 pointers ].
 // ary[0][0][0] is the start of the element block.
 //
 // Returns the table pointer, or NULL if either allocation fails. The
 // asserts still stop debug builds at the failing call; the explicit checks
 // cover builds compiled with NDEBUG.
 // NOTE(review): the loop start values were missing from the published
 // listing and are restored as 0 -- confirm.
 template<typename T>
 T**** create_4d_flat(int a, int b, int c, int d) {
     // Widen before multiplying so the size products are not computed in int.
     const size_t na = a, nb = b, nc = c, nd = d;

     T *base = NULL;
     cudaError_t err = cudaMallocManaged(&base, na * nb * nc * nd * sizeof(T));
     assert(err == cudaSuccess);
     if (err != cudaSuccess)
         return NULL;

     T ****ary = NULL;
     err = cudaMallocManaged(&ary, (na + na * nb + na * nb * nc) * sizeof(T*));
     assert(err == cudaSuccess);
     if (err != cudaSuccess) {
         cudaFree(base);  // do not leak the element block
         return NULL;
     }

     for (size_t i = 0; i < na; i++) {
         // Level-2 pointers for slice i start right after the a level-1 slots.
         ary[i] = (T ***)((ary + na) + i * nb);
         for (size_t j = 0; j < nb; j++) {
             // Level-3 pointers follow the a*b level-2 slots.
             ary[i][j] = (T **)((ary + na + na * nb) + i * nb * nc + j * nc);
             for (size_t k = 0; k < nc; k++)
                 ary[i][j][k] = base + ((i * nb + j) * nc + k) * nd;
         }
     }

     return ary;
 }

 // Releases an array produced by create_4d_flat: first the element block,
 // whose address is stored in ary[0][0][0], then the pointer table itself.
 // A NULL ary is ignored. The table is read on the host, so any kernel using
 // the array must have completed before this is called.
 // NOTE(review): the indices were missing from the published listing and are
 // restored as 0 -- confirm.
 template<typename T>
 void free_4d_flat(T**** ary) {
     if (!ary) return;  // test the table before dereferencing it

     if (ary[0][0][0]) cudaFree(ary[0][0][0]);
     cudaFree(ary);
 }

 // Stores 0, 1, 2, ... into data[i][j][k][l] in row-major order, so that
 // data[i][j][k][l] == ((i*b + j)*c + k)*d + l.
 //
 // Works with any 1-D launch configuration: the threads share the elements
 // through a loop strided by the total thread count, and a <<<1, 1>>> launch
 // reproduces the plain sequential fill. data must address an a x b x c x d
 // array whose pointers are valid on the device. Does nothing if any
 // dimension is not positive.
 // NOTE(review): the initial counter value was missing from the published
 // listing and is restored as 0 -- confirm.
 template<typename T>
 __global__ void fill(T**** data, int a, int b, int c, int d) {
     typedef unsigned long long int index_t;

     if (a <= 0 || b <= 0 || c <= 0 || d <= 0) return;

     const index_t total = (index_t)a * b * c * d;
     const index_t stride = (index_t)gridDim.x * blockDim.x;

     for (index_t val = (index_t)blockIdx.x * blockDim.x + threadIdx.x;
          val < total; val += stride) {
         // Split the flat position back into its four coordinates.
         const index_t l = val % d;
         const index_t k = (val / d) % c;
         const index_t j = (val / ((index_t)d * c)) % b;
         const index_t i = val / ((index_t)d * c * b);
         data[i][j][k][l] = val;
     }
 }

 // Prints the free and total device memory reported by cudaMemGetInfo to
 // standard output, or an error message to standard error if the query fails.
 void report_gpu_mem()
 {
     // Named to avoid shadowing the C library free().
     size_t freeBytes = 0, totalBytes = 0;

     cudaError_t err = cudaMemGetInfo(&freeBytes, &totalBytes);
     if (err != cudaSuccess) {
         std::cerr << "cudaMemGetInfo failed: " << cudaGetErrorString(err) << std::endl;
         return;
     }

     std::cout << "Free = " << freeBytes << " Total = " << totalBytes << std::endl;
 }

 // Allocates a 4-D managed array, fills it on the GPU with consecutive
 // integers, and checks on the host that the element block holds 0, 1, 2, ...
 // Returns 0 on success and -1 on an allocation, kernel, or validation failure.
 // NOTE(review): the dimensions and other literals were missing from the
 // published listing; 64 x 63 x 62 x 5 and a <<<1, 1>>> launch are
 // reconstructed and should be checked against the linked answer.
 int main() {
     const int dimA = 64, dimB = 63, dimC = 62, dimD = 5;
     const unsigned long long int total =
         (unsigned long long int)dimA * dimB * dimC * dimD;

     report_gpu_mem();

     unsigned long long int ****data2;
     std::cout << "allocating..." << std::endl;
     data2 = create_4d_flat<unsigned long long int>(dimA, dimB, dimC, dimD);
     if (!data2) {
         std::cerr << "allocation failed" << std::endl;
         return -1;
     }

     report_gpu_mem();

     fill<<<1, 1>>>(data2, dimA, dimB, dimC, dimD);

     // Launch errors come from cudaGetLastError, execution errors from the
     // synchronize. Checked explicitly because assert is a no-op under NDEBUG.
     cudaError_t err = cudaGetLastError();
     if (err == cudaSuccess)
         err = cudaDeviceSynchronize();
     if (err != cudaSuccess) {
         std::cerr << "fill failed: " << cudaGetErrorString(err) << std::endl;
         return -1;
     }

     std::cout << "validating..." << std::endl;
     const unsigned long long int *flat = data2[0][0][0];
     int status = 0;
     for (unsigned long long int i = 0; i < total; i++) {
         if (flat[i] != i) {
             std::cout << "mismatch at " << i << " was " << flat[i] << std::endl;
             status = -1;
             break;
         }
     }

     // Release the array on both the success and the mismatch path.
     free_4d_flat(data2);

     return status;
 }