CUDA 5, device compute capability 3.5, VS 2012, 64-bit Windows Server 2012.
There is no shared memory access between threads; every thread is standalone.
I am using pinned memory with zero-copy. From the host, I can only read the pinned memory the device has written after I issue a cudaDeviceSynchronize on the host.
I tried calling __threadfence_system and __threadfence after each device write, but neither flushed the data to the host.
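For context, here is roughly how mapped (zero-copy) pinned memory gets set up. This is only a minimal sketch with illustrative pointer names, not my exact code; on 64-bit with a compute capability 2.0+ device, UVA lets the cudaHostAlloc pointer be passed to the kernel directly (as my samples below do), but cudaHostGetDevicePointer is shown for the general case:
// Minimal zero-copy setup sketch (pointer names are illustrative).
float* hPtr = NULL;  // host-visible pointer
float* dPtr = NULL;  // device-visible alias of the same physical pages
cudaSetDeviceFlags(cudaDeviceMapHost);  // must run before the CUDA context is created
cudaHostAlloc((void**)&hPtr, 2 * sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&dPtr, (void*)hPtr, 0);
// Kernel writes through dPtr land in the same pages the host reads through hPtr.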
Below is a full sample CUDA code that demonstrates my problem:
#include <conio.h>
#include <cstdio>
#include <cstdlib>  // for system()
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Kernel %u: Before Writing in Kernel\n", tid);
    hResult[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
    printf("Kernel %u: After Writing in Kernel\n", tid);
    // time waster for-loop (sleep)
    for (int timeWater = 0; timeWater < 100000000; timeWater++);
}
int main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);
    int filledElementsCounter = 0;
    // naive sequential polling; could also be implemented on
    // another host thread
    while (filledElementsCounter < (int)blocks)
    {
        // blocks until the value changes; this moves sequentially
        // while threads have no order (fine for this sample).
        while (hResult[filledElementsCounter] == 0);
        printf("%f\n", hResult[filledElementsCounter]);
        filledElementsCounter++;
    }
    cudaFreeHost((void*)hResult);
    system("pause");
    return 0;
}
Currently this sample waits indefinitely, as nothing written by the device becomes visible on the host unless I issue cudaDeviceSynchronize. The sample below works, but it is NOT what I want, as it defeats the purpose of asynchronous copying:
int main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);
    cudaError_t error = cudaDeviceSynchronize();
    if (error != cudaSuccess) { throw; }
    for (int i = 0; i < (int)blocks; i++)
    {
        printf("%f\n", hResult[i]);
    }
    cudaFreeHost((void*)hResult);
    system("pause");
    return 0;
}
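What I am after is closer to the sketch below: poll the mapped buffer for values as they arrive and use cudaStreamQuery to detect kernel completion, with no blocking synchronization up front. This is only a hypothetical illustration; it assumes the device's writes actually become visible to the polling host thread, which is exactly what fails on my system:
int readCount = 0;
while (readCount < (int)blocks)
{
    if (hResult[readCount] != 0)  // the value for this slot has arrived
    {
        printf("%f\n", hResult[readCount]);
        readCount++;
    }
    else if (cudaStreamQuery(0) == cudaSuccess)
    {
        break;  // kernel finished, but this slot is still 0: nothing more is coming
    }
}
cudaDeviceSynchronize();  // settle the device before freeing the buffer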
I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090, and can conclude this:
The fact that it does not work on your system must be a driver issue, and I suggest that you get the TCC drivers.
I attached my code, which runs fine and does what you want. The values appear on the host side before the kernel ends. As you can see, I added some compute code to prevent the for loop from being removed by compiler optimizations, and I added a stream and a callback that is executed after all work in the stream has finished. The program outputs 1 and 2, then does nothing for a long time until "stream finished..." is printed to the console.
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )
bool checkCall(cudaError_t result, char const* const func, const char* const file, int const line)
{
    if (result != cudaSuccess)
    {
        std::cout << "CUDA (runtime api) error: " << func << " failed! "
                  << cudaGetErrorString(result) << " (" << result << ") "
                  << file << ":" << line << std::endl;
    }
    return result != cudaSuccess;
}
class Callback
{
public:
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void* userData);
private:
    void call();
};

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void* userData)
{
    Callback* cb = (Callback*)userData;
    cb->call();
}

void Callback::call()
{
    std::cout << "stream finished..." << std::endl;
}
__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hResult[tid] = tid + 1;
    __threadfence_system();
    float A = 0;
    for (int timeWater = 0; timeWater < 100000000; timeWater++)
    {
        A = sin(cos(log(hResult[0] * hResult[1]))) + A;
        A = sqrt(A);
    }
}
int main(int argc, char* argv[])
{
    size_t blocks = 2;
    volatile float* hResult;
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped));
    cudaStream_t stream;
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    Callback obj;
    // third launch parameter is the dynamic shared memory size, so 0 (not NULL)
    Kernel<<<1, blocks, 0, stream>>>(hResult);
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));
    int filledElementsCounter = 0;
    while (filledElementsCounter < (int)blocks)
    {
        while (hResult[filledElementsCounter] == 0);
        std::cout << hResult[filledElementsCounter] << std::endl;
        filledElementsCounter++;
    }
    SEC_CUDA_CALL(cudaStreamDestroy(stream));
    SEC_CUDA_CALL(cudaFreeHost((void*)hResult));
    return 0;
}
No call returned an error and cuda-memcheck didn't find any problems. This works as intended. You should really try the TCC driver.
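For reference, a build line along these lines should work (the file name is hypothetical; sm_20 matches the M2090 I tested on, so use sm_35 for your compute capability 3.5 card):
nvcc -arch=sm_20 -o zerocopy zerocopy.cu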