参考stackoverflow一篇帖子的处理方法:https://stackoverflow.com/questions/26913683/different-way-to-index-threads-in-cuda-c

代码中cuda_gridsize函数参考yolo。

代码如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h" #include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <ctime> using namespace std;
#define BLOCK 512 dim3 cuda_gridsize(size_t n){
size_t k = (n - ) / BLOCK + ;
unsigned int x = k;
unsigned int y = ;
if (x > ){
x = ceil(sqrt(k));
y = (n - ) / (x*BLOCK) + ;
}
dim3 d = { x, y, };
//printf("%ld %ld %ld %ld\n", n, x, y, x*y*BLOCK);
return d;
} __global__ void gpuCalc(unsigned char *img,long H,long W)
{
long threadId_2D = threadIdx.x + threadIdx.y*blockDim.x;
long blockId_2D = blockIdx.x + blockIdx.y*gridDim.x;
long i = threadId_2D + (blockDim.x*blockDim.y)*blockId_2D; //另一种索引方式
//long i = (gridDim.x*blockDim.x)*(threadIdx.y + blockDim.y*blockIdx.y) + (threadIdx.x + blockDim.x*blockIdx.x); while (i < H*W){
img[i] = - img[i];
i += (gridDim.x*blockDim.x)*(gridDim.y*blockDim.y);
}
} void addWithCuda(unsigned char *img, long H,long W)
{
unsigned char *dev_a = ; cudaSetDevice(); cudaMalloc((void**)&dev_a, H*W * sizeof(unsigned char));
cudaMemcpy(dev_a, img, H*W * sizeof(unsigned char), cudaMemcpyHostToDevice); gpuCalc<<<cuda_gridsize(H*W),BLOCK>> >(dev_a, H, W); cudaMemcpy(img, dev_a, H*W * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cudaFree(dev_a); cudaGetLastError();
} void cpuCalc(unsigned char *img,long W, long H)
{
for (long i = ; i < H*W; i++)
img[i] = - img[i];
} int main()
{
long W = ;
long H = ; unsigned char *img = new unsigned char[W*H];
unsigned char *cmp = new unsigned char[W*H]; for (long i = ; i < H*W; i++)
img[i] = rand() % ; memcpy(cmp, img, H*W); cpuCalc(img, W, H);
printf("cpu calc end\n"); addWithCuda(img, W,H);
printf("gpu calc end\n"); bool flag = true;
for (long i = ; i < H*W; i++)
{
if (img[i] != cmp[i])
{
printf("no pass\n");
flag = false;
break;
}
}
if (flag)
printf("pass"); delete[] cmp;
delete[] img;
getchar(); return ;
}
05-11 14:48