问题描述
我想在 CUDA 中实现这个原子函数:
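__device__ float lowest;   // 全局变量
__device__ int lowIdx;     // 全局变量
float realNum;             // 线程寄存器变量
int index;                 // 线程寄存器变量
if (realNum < lowest) {
    lowest = realNum;      // 新的最小值
    lowIdx = index;        // 更新"最小值"对应的索引
}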
我认为现有的任何原子函数都无法做到这一点。我需要在执行几条指令的期间锁定几个全局内存位置。我可以用 PTX(汇编)代码来实现吗?
正如我在上面的第二条评论中所述,可以将两个 32 位的量合并成一个以原子方式管理的 64 位量,并以这种方式处理该问题。然后,我们以"任意原子操作"示例(基于 atomicCAS 的循环)作为粗略参考,对这个 64 位量进行原子更新。显然,这个思路无法扩展到两个以上的 32 位量。下面是一个例子:
#include #define DSIZE 5000#define nTPB 256#define cudaCheckErrors(msg) 做 { cudaError_t __err = cudaGetLastError();if (__err != cudaSuccess) { fprintf(stderr, "致命错误: %s (%s at %s:%d)
", 味精,cudaGetErrorString(__err),\__FILE__, __LINE__);fprintf(stderr, *** 失败 - 正在中止
");退出(1);} } 而 (0)类型定义联合{浮动浮动[2];//浮点数[0] = 最低int ints[2];//ints[1] = lowIdx无符号长长整数 ulong;//原子更新我的原子;__device__ my_atomics 测试;__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2){my_atomics loc, loctest;loc.floats[0] = val1;loc.ints[1] = val2;loctest.ulong = *地址;而 (loctest.floats[0] > val1)loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);返回loctest.ulong;}__global__ void min_test(const float* data){int idx = (blockDim.x * blockIdx.x) + threadIdx.x;如果(idx
这里是一个类似的例子,它对 2 个 float 量进行原子更新。
I'd like to implement this atomic function in CUDA:
// Desired behaviour: the comparison and both assignments below should take effect as one atomic step.
__device__ float lowest; // global variable: lowest value found so far
__device__ int lowIdx; // global variable: index paired with `lowest`
float realNum; // per-thread candidate value (described by the author as a register variable)
int index; // per-thread index paired with realNum
if(realNum < lowest) {
lowest= realNum; // the candidate becomes the new lowest value
lowIdx= index; // record the index that belongs to the new lowest value
}
I don't believe I can do this with any of the built-in atomic functions. I need to lock a couple of global memory locations for the duration of a few instructions. Might I be able to implement this with PTX (assembly) code?
As I stated in my second comment above, it's possible to combine your two 32-bit quantities into a single 64-bit quantity that is managed atomically, and deal with the problem that way. We then update the 64-bit quantity with an atomicCAS loop, using the "arbitrary atomic" example from the CUDA programming guide as a rough guide. Obviously you can't extend this idea beyond two 32-bit quantities, since both values have to fit in the single 64-bit word that atomicCAS swaps. Here's an example:
#include <stdio.h>
#include <stdlib.h>
// Number of input floats searched for the minimum.
#define DSIZE 5000
// Threads per block used when launching the kernel.
#define nTPB 256
// Fetches the most recent CUDA runtime error via cudaGetLastError(). If it is
// not cudaSuccess, prints `msg`, the CUDA error string and the file/line of the
// macro invocation to stderr, then terminates the process with exit status 1.
// Every line of the body ends in a backslash so the whole do/while(0) stays
// part of the macro definition; do/while(0) makes the macro usable as a single
// statement followed by a semicolon.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Packs the running minimum and its index into one 64-bit word so that both
// can be replaced together by a single 64-bit atomicCAS.
// NOTE(review): relies on float and int each being 4 bytes, and on reading a
// union member other than the one last written -- confirm for the target compiler.
typedef union {
float floats[2]; // floats[0] holds the lowest value; floats[1] is never accessed through this view in the code shown
int ints[2]; // ints[1] holds lowIdx, the index paired with that value; ints[0] is never accessed through this view in the code shown
unsigned long long int ulong; // the whole pair viewed as one 64-bit word, used for the atomic update
} my_atomics;
// Device-global (value, index) pair: updated by the kernel through my_atomicMin,
// and written/read by the host through its symbol.
__device__ my_atomics test;
// Atomically lowers the packed (value, index) pair stored at `address`.
//
// address: 64-bit word laid out like my_atomics (floats[0] = value, ints[1] = index).
// val1:    candidate value.
// val2:    index that accompanies val1.
//
// The stored pair is replaced by (val1, val2) when val1 is smaller than the
// stored value, or when the values compare equal and val2 is smaller than the
// stored index. The tie-break makes the resulting index independent of thread
// scheduling: among equal minima the lowest index always wins, which is what a
// sequential first-match scan produces.
//
// Returns the last 64-bit word observed at `address`, i.e. the pair that made
// the loop stop (one that the candidate does not improve on).
// A NaN on either side makes both comparisons false, so no update happens.
__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
    my_atomics loc, loctest;
    // Build the replacement word: both halves are written, so all 64 bits are defined.
    loc.floats[0] = val1;
    loc.ints[1] = val2;
    // Plain read as a first guess; atomicCAS below validates it before any write.
    loctest.ulong = *address;
    // atomicCAS returns the word that was in memory. After a successful swap that
    // is the old (worse) pair, so the loop runs once more; that CAS fails against
    // the new contents and returns them, which ends the loop.
    while ((loctest.floats[0] > val1) ||
           (loctest.floats[0] == val1 && loctest.ints[1] > val2))
        loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong);
    return loctest.ulong;
}
// Kernel: every thread whose global index is below DSIZE offers data[index],
// together with that index, to the device-global pair `test` via my_atomicMin.
// Indexing uses only the x dimension of the block and grid.
__global__ void min_test(const float* data)
{
    const int gid = threadIdx.x + (blockIdx.x * blockDim.x);
    // Threads past the end of the input have nothing to contribute.
    if (gid >= DSIZE)
        return;
    my_atomicMin(&test.ulong, data[gid], gid);
}
// Host driver: fills DSIZE random floats, finds the minimum and its index on
// the device with min_test, repeats the search on the host, and prints both
// results for comparison. Returns 0 on success and 1 if the host allocation
// fails; CUDA failures terminate the process through cudaCheckErrors.
int main() {
    float *d_data, *h_data;
    my_atomics my_init;
    // Seed pair: 10.0f is above every input (inputs are rand()/(float)RAND_MAX),
    // and DSIZE is one past the last valid index.
    my_init.floats[0] = 10.0f;
    my_init.ints[1] = DSIZE;
    h_data = (float *)malloc(DSIZE * sizeof(float));
    if (h_data == 0) {printf("malloc fail\n"); return 1;}
    cudaMalloc((void **)&d_data, DSIZE * sizeof(float));
    cudaCheckErrors("cm1 fail");
    // create random floats between 0 and 1
    for (int i = 0; i < DSIZE; i++) h_data[i] = rand()/(float)RAND_MAX;
    cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cmcp1 fail");
    // Initialise the device-global pair with the seed before the kernel runs.
    cudaMemcpyToSymbol(test, &(my_init.ulong), sizeof(unsigned long long int));
    cudaCheckErrors("cmcp2 fail");
    // Ceiling division so the grid covers all DSIZE elements.
    min_test<<<(DSIZE+nTPB-1)/nTPB, nTPB>>>(d_data);
    // Wait for the kernel so that execution errors are visible to the check below.
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    cudaMemcpyFromSymbol(&(my_init.ulong), test, sizeof(unsigned long long int));
    cudaCheckErrors("cmcp3 fail");
    printf("device min result = %f\n", my_init.floats[0]);
    printf("device idx result = %d\n", my_init.ints[1]);
    // Host reference: sequential scan that keeps the first occurrence of the minimum.
    float host_val = 10.0f;
    int host_idx = DSIZE;
    for (int i=0; i<DSIZE; i++)
        if (h_data[i] < host_val){
            host_val = h_data[i];
            host_idx = i;
        }
    printf("host min result = %f\n", host_val);
    printf("host idx result = %d\n", host_idx);
    // Release the device and host buffers before exiting.
    cudaFree(d_data);
    cudaCheckErrors("cudaFree fail");
    free(h_data);
    return 0;
}
Here is a similar example that does an atomic update of 2 float quantities.
这篇关于如何实现涉及多个变量的自定义原子函数?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!