cudaMemcpy error when copying from device to host after a __device__ class member function changes the value of a device variable

This article looks at a cudaMemcpy error that occurs when copying from device to host after a __device__ class member function has changed the value of a device variable. It may be a useful reference if you run into the same problem.

Problem Description



I am confused as to the behavior of the CUDA code I have written. I am in the midst of writing tests for my __device__ functions in a class called DimmedGridGPU. This class is templated on an int DIM and the function I have trouble with is meant to return the value of the grid at the point nearest the input value, x. I have this kernel namespace for unit testing purposes, to call each __device__ function in isolation.

The desired behavior of this code would be to return the value 3.0 from the do_get_value(x, grid_) call, and set d_target[0] to this value, then transfer it back to the host side for unit test assertions. The whole of the kernel seems to function properly, but when I do the final transfer back to the host side, I receive a cudaErrorInvalidValue error, and I do not understand why.

Here is a minimal example of the code, preserving the structure of the class and its features:

#include <cuda_runtime.h>
#include <fstream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
     fprintf(stderr,"GPUassert: \"%s\": %s %s %d\n", cudaGetErrorName(code), cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}


__host__ __device__ int gpu_int_floor(double number); // forward declaration: defined below, called inside the class template

template <int DIM>
class DimmedGridGPU{

public:
  size_t grid_size_;//total size of grid
  int b_derivatives_;//if derivatives are going to be used
  int b_interpolate_;//if interpolation should be used on the grid
  double* grid_;//the grid values
  double* grid_deriv_;//derivatives
  double dx_[DIM];//grid spacing
  double min_[DIM];//grid minimum
  double max_[DIM];//maximum
  int grid_number_[DIM];//number of points on grid
  int b_periodic_[DIM];//if a dimension is periodic
  int* d_b_interpolate_;
  int* d_b_derivatives_;


  DimmedGridGPU(const double* min,
        const double* max,
        const double* bin_spacing,
        const int* b_periodic,
        int b_derivatives,
        int b_interpolate) :   b_derivatives_(b_derivatives), b_interpolate_(b_interpolate), grid_(NULL), grid_deriv_(NULL){

    size_t i;

    for(i = 0; i < DIM; i++) {
      min_[i] = min[i];
      max_[i] = max[i];
      b_periodic_[i] = b_periodic[i];

      grid_number_[i] = (int) ceil((max_[i] - min_[i]) / bin_spacing[i]);
      dx_[i] = (max_[i] - min_[i]) / grid_number_[i];
      //add one to grid points if
      grid_number_[i] = b_periodic_[i] ? grid_number_[i] : grid_number_[i] + 1;
      //increment dx to compensate
      if(!b_periodic_[i])
    max_[i] += dx_[i];
    }

    grid_size_ = 1;
    for(i = 0; i < DIM; i++)
      grid_size_ *= grid_number_[i];
    gpuErrchk(cudaMallocManaged(&grid_, grid_size_ * sizeof(double)));
    if(b_derivatives_) {
      gpuErrchk(cudaMallocManaged(&grid_deriv_, DIM * grid_size_ * sizeof(double)));
      if(!grid_deriv_) {
    printf("Out of memory!! gpugrid.cuh:initialize");
      }
    }

    gpuErrchk(cudaMalloc((void**)&d_b_interpolate_, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_b_interpolate_, &b_interpolate, sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMalloc((void**)&d_b_derivatives_, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_b_derivatives_, &b_derivatives, sizeof(int), cudaMemcpyHostToDevice));
  }

  ~DimmedGridGPU(){
    gpuErrchk(cudaDeviceSynchronize());
    if(grid_ != NULL){
      gpuErrchk(cudaFree(grid_));
      grid_ = NULL;//need to do this so DimmedGrid's destructor functions properly
    }

    if(grid_deriv_ != NULL){
      gpuErrchk(cudaFree(grid_deriv_));
      grid_deriv_ = NULL;
    }

    gpuErrchk(cudaDeviceReset());
  }
//gets the value of the grid closest to x
  __host__ __device__ double do_get_value( double* x, double* grid_) {

    size_t index[DIM];
    get_index(x, index);
    printf("do_get_value was called on the GPU!, and index[0] is now %d\n", index[0]);
    printf("but multi2one(index) gives us %d\n", multi2one(index));
    double value = grid_[multi2one(index)];
    printf("and value to be returned is %f\n", value);
    return value;
  }
//gets grid's 1D index from an array of coordinates
   __host__ __device__ void get_index(const double* x, size_t result[DIM]) const {
    size_t i;
    double xi;
    printf("get_index was called on the GPU in %i dimension(s)\n", DIM);
    for(i = 0; i < DIM; i++) {
      xi = x[i];
      printf("xi is now %f, min_[i] is %f and dx_[i] is %f\n",xi, min_[i], dx_[i]);
      if(b_periodic_[i]){
    xi -= (max_[i] - min_[i]) * gpu_int_floor((xi - min_[i]) / (max_[i] - min_[i]));
      }
      result[i] = (size_t) floor((xi - min_[i]) / dx_[i]);
    }
  }
//takes a multidimensional index to a 1D index
  __host__ __device__ size_t multi2one(const size_t index[DIM]) const {
    size_t result = index[DIM-1];

    size_t i;
    for(i = DIM - 1; i > 0; i--) {
      result = result * grid_number_[i-1] + index[i-1];
    }

    return result;

  }

};

__host__ __device__ int gpu_int_floor(double number) {
  return (int) number < 0.0 ? -ceil(fabs(number)) : floor(number);
}


namespace kernels{
  template <int DIM>
  __global__ void get_value_kernel(double* x, double* target_arr, double* grid_, DimmedGridGPU<DIM>  g){
    target_arr[0] = g.do_get_value(x, grid_);
    printf("get_value_kernel has set target[0] to be %f\n", target_arr[0]);//check if the value is set correctly
    return;
  }
}


int main(){
  using namespace kernels;
  double min[] = {0};
  double max[] = {10};
  double bin_spacing[] = {1};
  int periodic[] = {0};
  DimmedGridGPU<1> g (min, max, bin_spacing, periodic, 0, 0);
  for(int i = 0; i < 11; i++){
    g.grid_[i] = i;
    printf("g.grid_[%d] is now %f\n", i, g.grid_[i]);
  }
  gpuErrchk(cudaDeviceSynchronize());
  double x[] = {3.5};

  double* d_x;
  gpuErrchk(cudaMalloc(&d_x, sizeof(double)));
  gpuErrchk(cudaMemcpy(d_x, x, sizeof(double), cudaMemcpyHostToDevice));
  double target[] = {5.0};
  double* d_target;
  gpuErrchk(cudaMalloc((void**)&d_target, sizeof(double)));
  gpuErrchk(cudaMemcpy(d_target, target, sizeof(double), cudaMemcpyHostToDevice));
  gpuErrchk(cudaDeviceSynchronize());
  get_value_kernel<1><<<1,1>>>(d_x, d_target, g.grid_, g);
  gpuErrchk(cudaDeviceSynchronize());
  gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
  printf("and after GPU stuff, target[0] is now %f\n", target[0]);
  return(0);
}

So, why does this line (the last cudaMemcpy) throw the error cudaErrorInvalidValue, when the print statements I have included clearly demonstrate that the correct values are being used on the device, and the value returned by the do_get_value(x, grid_) call is correct?

I have already tried using cudaMemcpyFromSymbol, thinking that perhaps the assignment was creating a symbol instead of passing and changing a value somehow, but that is not the case, as d_target is not a valid symbol.

Here is sample output from my code:

Solution

The problem revolves around your destructor:

  ~DimmedGridGPU(){

The destructor is getting called in places you probably aren't expecting. To convince yourself of this, add a printf statement to the destructor. A minimal sketch of that instrumentation (placing the printf first is just one reasonable choice):
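
  ~DimmedGridGPU(){
    printf("Destructor!\n");   // added instrumentation; everything else is unchanged
    gpuErrchk(cudaDeviceSynchronize());
    if(grid_ != NULL){
      gpuErrchk(cudaFree(grid_));
      grid_ = NULL;
    }
    if(grid_deriv_ != NULL){
      gpuErrchk(cudaFree(grid_deriv_));
      grid_deriv_ = NULL;
    }
    gpuErrchk(cudaDeviceReset());
  }

With that in place, note where "Destructor!" appears in the printout: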

$ ./t955
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
Destructor!
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument t955.cu 167

Given that, it should be pretty evident that calling cudaDeviceReset() in that destructor is a bad idea. cudaDeviceReset() wipes out all device allocations, so when you then attempt to do this:

gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));

d_target is no longer a valid allocation on the device, so when you attempt to use it as the device target for cudaMemcpy, the runtime checks this pointer value (which is not changed by the device reset) and determines that the pointer value no longer corresponds to a valid allocation, and throws a runtime error.
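
The failure can be reproduced in isolation, apart from the class in this question. A minimal sketch (the variable names here are mine, for illustration only):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  double *d_p = NULL;
  cudaMalloc((void **)&d_p, sizeof(double));  // a valid device allocation
  cudaDeviceReset();                          // wipes out all device allocations
  double h = 0.0;
  // d_p still holds its old address, but that address no longer names a valid
  // allocation, so the runtime rejects it:
  cudaError_t err = cudaMemcpy(&h, d_p, sizeof(double), cudaMemcpyDeviceToHost);
  printf("%s: %s\n", cudaGetErrorName(err), cudaGetErrorString(err));
  return 0;
}

On a typical setup this prints cudaErrorInvalidValue: invalid argument, the same error as in the question.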

Just as in ordinary C++, when you pass an object to a function (or, in this case, a kernel) by value, the copy constructor for that object gets called. It stands to reason that when that object copy goes out of scope, its destructor will be called.
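
The C++ mechanism can be seen without any CUDA at all. In this sketch, an ordinary host function stands in for the kernel launch:

#include <cstdio>

struct Probe {
  ~Probe() { printf("Destructor!\n"); }
};

void take_by_value(Probe p) { }  // stand-in for a kernel taking its argument by value

int main() {
  Probe p;
  take_by_value(p);   // a copy of p is made; the copy's destructor runs when this returns
  printf("back in main\n");
  return 0;           // p's own destructor runs here, printing a second time
}

This prints "Destructor!" once before "back in main" and once after. The same thing happens to the copy of g passed to get_value_kernel: that copy is destroyed shortly after the launch, and its destructor runs cudaDeviceReset() long before main() is finished with d_target.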

I would suggest that putting such global-scope affecting functions as cudaDeviceReset() in an object destructor might be a fragile programming paradigm, but that is perhaps a matter of opinion. I assume you now have enough information to go about fixing the issue.

To avoid the next possible question, simply commenting out that call to cudaDeviceReset() in your destructor may not be sufficient to make all problems disappear (although this particular one will). Now that you know that this destructor is being called at least twice in the ordinary execution of this program, you may want to think carefully about what else is going on in that destructor, and perhaps strip more things out of it, or else rearchitect your class altogether.

For example, note that cudaDeviceReset() is not the only function that can cause trouble in a destructor for objects used this way. Similarly, cudaFree() may have unintended consequences on the original object, when used in a destructor called on the object-copy.
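
One possible rearchitecture, sketched here on a deliberately stripped-down class (DeviceBuffer and its owns_ flag are hypothetical, not part of the original code), is to make copies non-owning, so that a destructor running on a pass-by-value copy neither frees device memory nor resets the device. This assumes, as the question's code already does, that the host-side copy constructor and destructor run for by-value kernel arguments:

#include <cstdio>
#include <cuda_runtime.h>

class DeviceBuffer {
public:
  double *data_;
  bool owns_;  // true only for the originally constructed object

  DeviceBuffer(size_t n) : data_(NULL), owns_(true) {
    cudaMalloc((void **)&data_, n * sizeof(double));
  }
  // copies (such as a by-value kernel argument) share the pointer but do not own it
  DeviceBuffer(const DeviceBuffer &other) : data_(other.data_), owns_(false) { }
  ~DeviceBuffer() {
    if (owns_ && data_ != NULL) {
      cudaFree(data_);  // only the owner frees
      data_ = NULL;
    }
    // and no cudaDeviceReset() here; if you want one, call it at the very end of main()
  }
};

__global__ void write_kernel(DeviceBuffer buf) { buf.data_[0] = 3.0; }

int main() {
  DeviceBuffer buf(1);
  write_kernel<<<1,1>>>(buf);  // the copy's destructor runs after launch, but frees nothing
  cudaDeviceSynchronize();
  double h = 0.0;
  cudaMemcpy(&h, buf.data_, sizeof(double), cudaMemcpyDeviceToHost);  // still a valid pointer
  printf("h = %f\n", h);  // prints h = 3.000000
  return 0;
}

Here the final cudaMemcpy succeeds because nothing has freed buf.data_ or reset the device; the allocation is released exactly once, when the original buf goes out of scope at the end of main().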
