CUDA 5, device compute capability 3.5, VS 2012, 64-bit Windows Server 2012.
There is no shared memory access between threads; every thread is standalone.
I am using pinned memory with zero-copy. From the host, I can only read the pinned memory the device has written after I issue a cudaDeviceSynchronize on the host.
I tried calling __threadfence_system and __threadfence after each device write, but neither flushed the data to the host.
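For context, here is roughly how mapped (zero-copy) pinned memory gets set up. This is only a minimal sketch with illustrative pointer names, not my exact code; on 64-bit with a compute capability 2.0+ device, UVA lets the cudaHostAlloc pointer be passed to the kernel directly (as my samples below do), but cudaHostGetDevicePointer is shown for the general case:
// Minimal zero-copy setup sketch (pointer names are illustrative).
float* hPtr = NULL;  // host-visible pointer
float* dPtr = NULL;  // device-visible alias of the same physical pages
cudaSetDeviceFlags(cudaDeviceMapHost);  // must run before the CUDA context is created
cudaHostAlloc((void**)&hPtr, 2 * sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&dPtr, (void*)hPtr, 0);
// Kernel writes through dPtr land in the same pages the host reads through hPtr.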
Below is a full sample CUDA code that demonstrates my problem:
#include <conio.h>
#include <cstdio>
#include <cstdlib>  // for system()
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Kernel %u: Before Writing in Kernel\n", tid);
    hResult[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
    printf("Kernel %u: After Writing in Kernel\n", tid);
    // time waster for-loop (sleep)
    for (int timeWater = 0; timeWater < 100000000; timeWater++);
}
int main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);
    int filledElementsCounter = 0;
    // naive sequential polling; could also be implemented on
    // another host thread
    while (filledElementsCounter < (int)blocks)
    {
        // blocks until the value changes; this moves sequentially
        // while threads have no order (fine for this sample).
        while (hResult[filledElementsCounter] == 0);
        printf("%f\n", hResult[filledElementsCounter]);
        filledElementsCounter++;
    }
    cudaFreeHost((void*)hResult);
    system("pause");
    return 0;
}
Currently this sample waits indefinitely, as nothing written by the device becomes visible on the host unless I issue cudaDeviceSynchronize. The sample below works, but it is NOT what I want, as it defeats the purpose of asynchronous copying:
int main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);
    cudaError_t error = cudaDeviceSynchronize();
    if (error != cudaSuccess) { throw; }
    for (int i = 0; i < (int)blocks; i++)
    {
        printf("%f\n", hResult[i]);
    }
    cudaFreeHost((void*)hResult);
    system("pause");
    return 0;
}
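What I am after is closer to the sketch below: poll the mapped buffer for values as they arrive and use cudaStreamQuery to detect kernel completion, with no blocking synchronization up front. This is only a hypothetical illustration; it assumes the device's writes actually become visible to the polling host thread, which is exactly what fails on my system:
int readCount = 0;
while (readCount < (int)blocks)
{
    if (hResult[readCount] != 0)  // the value for this slot has arrived
    {
        printf("%f\n", hResult[readCount]);
        readCount++;
    }
    else if (cudaStreamQuery(0) == cudaSuccess)
    {
        break;  // kernel finished, but this slot is still 0: nothing more is coming
    }
}
cudaDeviceSynchronize();  // settle the device before freeing the buffer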
I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090, and can conclude this:
The fact that it does not work on your system must be a driver issue, and I suggest that you get the TCC drivers.
I attached my code, which runs fine and does what you want. The values appear on the host side before the kernel ends. As you can see, I added some compute code to prevent the for loop from being removed by compiler optimizations, and I added a stream and a callback that is executed after all work in the stream has finished. The program outputs 1 and 2, then does nothing for a long time until "stream finished..." is printed to the console.
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )
bool checkCall(cudaError_t result, char const* const func, const char* const file, int const line)
{
    if (result != cudaSuccess)
    {
        std::cout << "CUDA (runtime api) error: " << func << " failed! "
                  << cudaGetErrorString(result) << " (" << result << ") "
                  << file << ":" << line << std::endl;
    }
    return result != cudaSuccess;
}
class Callback
{
public:
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void* userData);
private:
    void call();
};

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void* userData)
{
    Callback* cb = (Callback*)userData;
    cb->call();
}

void Callback::call()
{
    std::cout << "stream finished..." << std::endl;
}
__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hResult[tid] = tid + 1;
    __threadfence_system();
    float A = 0;
    for (int timeWater = 0; timeWater < 100000000; timeWater++)
    {
        A = sin(cos(log(hResult[0] * hResult[1]))) + A;
        A = sqrt(A);
    }
}
int main(int argc, char* argv[])
{
    size_t blocks = 2;
    volatile float* hResult;
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped));
    cudaStream_t stream;
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    Callback obj;
    // third launch parameter is the dynamic shared memory size, so 0 (not NULL)
    Kernel<<<1, blocks, 0, stream>>>(hResult);
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));
    int filledElementsCounter = 0;
    while (filledElementsCounter < (int)blocks)
    {
        while (hResult[filledElementsCounter] == 0);
        std::cout << hResult[filledElementsCounter] << std::endl;
        filledElementsCounter++;
    }
    SEC_CUDA_CALL(cudaStreamDestroy(stream));
    SEC_CUDA_CALL(cudaFreeHost((void*)hResult));
    return 0;
}
No call returned an error and cuda-memcheck didn't find any problems. This works as intended. You should really try the TCC driver.
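For reference, a build line along these lines should work (the file name is hypothetical; sm_20 matches the M2090 I tested on, so use sm_35 for your compute capability 3.5 card):
nvcc -arch=sm_20 -o zerocopy zerocopy.cu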