▶ 本章介绍了手动实现原子操作。重构了第五章向量点积的过程。核心是通过定义结构Lock及其运算,实现锁定,读写,解锁的过程。
● 章节代码
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "D:\Code\CUDA\book\common\book.h" #define imin(a,b) (a<b?a:b)
#define sum_squares(x) (x*(x+1)*(2*x+1)/6)
#define N 33 * 1024 * 1024
#define THREADSIZE 256
#define BLOCKSIZE imin(32, (N + THREADSIZE - 1) / THREADSIZE) struct Lock
{
int *mutex;
Lock(void)
{
int state = ;
cudaMalloc((void **)&mutex, sizeof(int));
cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
}
~Lock(void)
{
cudaFree(mutex);
}
__device__ void lock(void)
{
while (atomicCAS(mutex, , ) != );
//atomicCAS(a, b, c)将判断变量a是否等于b,
//若相等,则用c的值去替换a,并返回c的值;若不相等,则返回a的值
//函数lock()中,线程不断尝试判断mutex是否为0,
//若为0则改写为1 ,表明“占用”,禁止其他线程进行访问
//若为1则继续尝试判断
}
__device__ void unlock(void)
{
atomicExch(mutex, );
//atomicExch(a, b)返回第一个变量的值,并将两个变量的值进行交换
//这里使用原子操作只是与上面的atomicCAS统一,否则可以直接用赋值语句
//线程操作完成,将mutex改写回0,允许其他线程进行访问
}
}; __global__ void dot(Lock lock, float *a, float *b, float *c)
{
__shared__ float share[THREADSIZE];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
float temp = ; while (tid < N)
{
temp += a[tid] * b[tid];
tid += blockDim.x * gridDim.x;
} share[cacheIndex] = temp;
__syncthreads(); int i = blockDim.x / ;
while (i != )
{
if (cacheIndex < i)
share[cacheIndex] += share[cacheIndex + i];
__syncthreads();
i /= ;
}
if (cacheIndex == )
{
lock.lock();// 等待可写入的机会,锁上,写入,再解锁
*c += share[];
lock.unlock();
}
} int main(void)
{
float *a, *b, c = ;
float *dev_a, *dev_b, *dev_c; a = (float*)malloc(N * sizeof(float));
b = (float*)malloc(N * sizeof(float)); cudaMalloc((void**)&dev_a, N * sizeof(float));
cudaMalloc((void**)&dev_b, N * sizeof(float));
cudaMalloc((void**)&dev_c, sizeof(float)); for (int i = ; i < N; i++)
{
a[i] = i;
b[i] = i * ;
} cudaMemcpy(dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_c, &c, sizeof(float), cudaMemcpyHostToDevice); Lock lock;
dot << <BLOCKSIZE, THREADSIZE >> > (lock, dev_a, dev_b, dev_c); cudaMemcpy(&c, dev_c, sizeof(float), cudaMemcpyDeviceToHost); printf("\n\tAnswer:\t\t%.6g\n\tGPU value:\t%.6g\n", * sum_squares((float)(N - )), c); free(a);
free(b);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
getchar();
return ;
}