0_Simple__cudaOpenMP

▶ 在OpenMP的多线程程序中，各线程分别调用CUDA进行计算。OpenMP的简单示例。
▶ 源代码，OpenMP 出了点问题，没有正确输出结果
 #include <stdio.h>

 #include <omp.h>

 #include <cuda.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_cuda.h>

 /*
  * Adds the constant b to one element of g_a per thread.
  *
  * Each thread picks its element from its flat 1D global index
  * (blockIdx.x * blockDim.x + threadIdx.x). The kernel performs no bounds
  * check, so the launch must not create more threads than g_a has elements.
  */
 __global__ void kernelAddConstant(int *g_a, const int b)
 {
     const int element = threadIdx.x + blockDim.x * blockIdx.x;
     g_a[element] = g_a[element] + b;
 }

 /*
  * Adds b to `count` ints of host memory using the calling thread's current
  * CUDA device.
  *
  * host_slice : first host element of the slice; updated in place.
  * count      : number of ints in the slice (0 is allowed and is a no-op).
  * b          : constant handed to kernelAddConstant.
  *
  * Returns the first CUDA error encountered, or cudaSuccess.
  *
  * kernelAddConstant has no bounds check, so the device buffer is rounded up
  * to a whole number of blocks and zero-filled; threads past `count` then only
  * touch padding, and only `count` ints are copied back to the host.
  */
 static cudaError_t addConstantOnCurrentDevice(int *host_slice, size_t count, int b)
 {
     // 128 threads per block, as in NVIDIA's cudaOpenMP sample.
     const unsigned int threads_per_block = 128;

     if (count == 0)
         return cudaSuccess;

     const unsigned int blocks = (unsigned int)((count + threads_per_block - 1) / threads_per_block);
     const size_t slice_bytes = count * sizeof(int);
     const size_t padded_bytes = (size_t)blocks * threads_per_block * sizeof(int);

     int *d_a = NULL;
     cudaError_t status = cudaMalloc((void **)&d_a, padded_bytes);
     if (status != cudaSuccess)
         return status;

     status = cudaMemset(d_a, 0, padded_bytes);
     if (status == cudaSuccess)
         status = cudaMemcpy(d_a, host_slice, slice_bytes, cudaMemcpyHostToDevice);
     if (status == cudaSuccess)
     {
         kernelAddConstant<<<blocks, threads_per_block>>>(d_a, b);
         // A launch does not return a status; fetch it once (this also clears it).
         status = cudaGetLastError();
     }
     if (status == cudaSuccess)
     {
         // Blocking copy: it also waits for the kernel and reports execution errors.
         status = cudaMemcpy(host_slice, d_a, slice_bytes, cudaMemcpyDeviceToHost);
     }

     // Always release the buffer; report the free error only if nothing failed earlier.
     const cudaError_t free_status = cudaFree(d_a);
     return (status != cudaSuccess) ? status : free_status;
 }

 /*
  * Fills a host array with 0..n-1, lets a team of OpenMP threads each add b to
  * its own slice on a CUDA device, then verifies a[i] == i + b on the host.
  *
  * Returns 0 when every thread succeeded and the result verified, 1 otherwise.
  *
  * NOTE(review): the problem size (8192 ints per GPU), b = 3 and the block size
  * follow NVIDIA's cudaOpenMP sample - confirm they are the intended values.
  */
 int main(int argc, char *argv[])
 {
     const int cpu_threads = 8;        // team size requested for the parallel region
     const int ints_per_gpu = 8192;
     const int b = 3;

     // Ask the runtime how many devices exist, so cudaSetDevice below cannot be
     // handed an ordinal that is not present on this machine.
     int num_gpus = 0;
     const cudaError_t count_status = cudaGetDeviceCount(&num_gpus);
     if (count_status != cudaSuccess)
     {
         printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(count_status));
         return 1;
     }
     if (num_gpus < 1)
     {
         printf("no CUDA capable device found\n");
         return 1;
     }

     const unsigned int n = (unsigned int)num_gpus * ints_per_gpu;
     const size_t nbytes = sizeof(int) * (size_t)n;

     int *a = (int *)malloc(nbytes);
     if (a == NULL)
     {
         printf("couldn't allocate CPU memory\n");
         return 1;
     }
     for (unsigned int i = 0; i < n; i++)
         a[i] = (int)i;

     int failed_threads = 0;           // shared; only updated inside the critical section

     // The num_threads clause sets the team size. When there are more CPU
     // threads than GPUs, the modulo below makes several threads share a device.
 #pragma omp parallel num_threads(cpu_threads)
     {
         const unsigned int thread_size = omp_get_num_threads();
         const unsigned int thread_rank = omp_get_thread_num();

         int gpu_id = -1;
         cudaError_t status = cudaSetDevice(thread_rank % num_gpus);
         if (status == cudaSuccess)
             status = cudaGetDevice(&gpu_id);
         printf("CPU thread %d (of %d) uses CUDA device %d\n", thread_rank, thread_size, gpu_id);

         // Split the host array into one contiguous slice per thread. Taking
         // both bounds from rank * n / size covers every element even when the
         // actual team size does not divide n.
         const size_t first = (size_t)thread_rank * n / thread_size;
         const size_t last = ((size_t)thread_rank + 1) * n / thread_size;

         if (status == cudaSuccess)
             status = addConstantOnCurrentDevice(a + first, last - first, b);

         if (status != cudaSuccess)
         {
             // Serialise reporting so messages do not interleave and the counter is not raced.
 #pragma omp critical
             {
                 printf("CPU thread %d: %s\n", thread_rank, cudaGetErrorString(status));
                 failed_threads++;
             }
         }
     }

     // Check the result on the host; stop at the first mismatch.
     int mismatch = 0;
     for (unsigned int i = 0; i < n; i++)
     {
         if (a[i] != (int)i + b)
         {
             printf("Error at i == %d, a[i] == %d\n", (int)i, a[i]);
             mismatch = 1;
             break;
         }
     }

     printf("finish!\n");
     free(a);
     getchar();                        // keep the console window open until a key is pressed
     return (failed_threads != 0 || mismatch) ? 1 : 0;
 }