I get an unknown error when cudaMemcpy is called from cudaT.h.
When compiling I get this warning, repeated several times for lines 46, 50 and 52:
./gameoflife.cu(46): Warning: Cannot tell what pointer points to, assuming global memory space
main.cu
#include "gameoflife.cu"
#include <iostream>
using namespace std;
#define DIM 20
__global__ void Func(CudaMem<GameOfLife<DIM, DIM> > * golDevice)
{
    golDevice->m_t->Step();
}

int main()
{
    GameOfLife<DIM, DIM> golHost;

    golHost.CreateCell(3,3);
    golHost.CreateCell(4,3);
    golHost.CreateCell(5,3);
    golHost.CreateCell(5,2);
    golHost.CreateCell(4,1);

    golHost.CreateCell(1,7);
    golHost.CreateCell(2,7);
    golHost.CreateCell(3,7);

    cout << golHost << endl;

    CudaMem<GameOfLife<DIM, DIM> > golDevice;
    // GameOfLife<DIM, DIM>* golDevice;
    // cudaMalloc((void **)&golDevice, sizeof(GameOfLife<DIM, DIM>));

    CudaMem<GameOfLife<DIM, DIM> >::CudaMemcpyHostToDevice(golDevice, &golHost);
    // cudaMemcpy(golDevice, &golHost, sizeof(GameOfLife<DIM, DIM>), cudaMemcpyHostToDevice);

    for(int i = 0 ; i < 1 ; ++i)
    {
        // cout << "Press anykey to exit.";
        // cin.ignore();
        // cin.get();

        Func<<<DIM, DIM>>>(&golDevice);

        CudaMem<GameOfLife<DIM, DIM> >::CudaMemcpyDeviceToHost(golDevice, &golHost);
        // cudaMemcpy(&golHost, golDevice, sizeof(GameOfLife<DIM, DIM>), cudaMemcpyDeviceToHost);

        cout << golHost << endl;
    }

    // cudaFree(golDevice);
    cudaDeviceReset();
}
gameoflife.cu
#include <cuda_runtime.h>
#include "cudaT.h"
#include <cstddef>
#include <iostream>
using namespace std;
template<size_t ROWS, size_t COLUMNS>
class GameOfLife
{
public:
    __host__ GameOfLife()
    {
        memset(m_dots, 0, sizeof(m_dots));
        memset(m_dots, 0, sizeof(m_temp));
    }

    __host__ __device__ ~GameOfLife()
    {
    }

    __host__ void CreateCell(size_t _row, size_t _column)
    {
        // need to check overflow
        m_dots[_row + 1][_column + 1] = 1;
    }

    __host__ void KillCell(size_t _row, size_t _column)
    {
        // need to check overflow
        m_dots[_row + 1][_column + 1] = 0;
    }

    __device__ void Step()
    {
        int liveNeighbours = 0;

        if(threadIdx.x > ROWS || blockIdx.x > COLUMNS )
        {
            return;
        }

        m_temp[threadIdx.x + 1][blockIdx.x + 1] = m_dots[threadIdx.x + 1][blockIdx.x + 1];
        __syncthreads();

        liveNeighbours = CalcLiveNeighbours() % 9;

        m_dots[threadIdx.x + 1][blockIdx.x + 1] = ( m_temp[threadIdx.x + 1][blockIdx.x + 1] && (liveNeighbours > 1 && liveNeighbours < 4) )
                                               || ( !m_temp[threadIdx.x + 1][blockIdx.x + 1] && liveNeighbours == 3) ;
    }

    template<size_t R, size_t C>
    __host__ friend ostream& operator<<(ostream& os, GameOfLife<R, C>& gol);

private:
    __device__ __host__ GameOfLife(const GameOfLife& other);

    __device__ int CalcLiveNeighbours()
    {
        return m_temp[threadIdx.x + 0][blockIdx.x + 0] +
               m_temp[threadIdx.x + 0][blockIdx.x + 1] +
               m_temp[threadIdx.x + 0][blockIdx.x + 2] +
               m_temp[threadIdx.x + 1][blockIdx.x + 0] +
               m_temp[threadIdx.x + 1][blockIdx.x + 2] +
               m_temp[threadIdx.x + 2][blockIdx.x + 0] +
               m_temp[threadIdx.x + 2][blockIdx.x + 1] +
               m_temp[threadIdx.x + 2][blockIdx.x + 2] ;
    }

    int m_temp[ROWS + 2][COLUMNS + 2];
    int m_dots[ROWS + 2][COLUMNS + 2];
};

template<size_t R, size_t C>
__host__ ostream& operator<<(ostream& os, GameOfLife<R, C>& gol)
{
    for(int i = 1 ; i < (R+1) ; ++i)
    {
        for(int j = 1 ; j < (C+1) ; ++j)
        {
            os << gol.m_dots[i][j] << " ";
        }
        os << endl;
    }
    return os;
}
cudaT.h
#pragma once
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include "cudaException.h"
#include <iostream>
using namespace std;
template<typename T>
class CudaMem
{
public:
    CudaMem():m_t(0)
    {
        cudaError_t err = cudaSuccess;
        err = cudaMalloc((void **)&m_t, sizeof(T));
        if (err != cudaSuccess)
        {
            throw CudaException(err);
        }
    }

    // CudaMem(T& copyFrom):m_t(0)
    explicit CudaMem(const CudaMem<T>& other):m_t(0)
    {
        cout << "CudaMem copy ctor" << endl;
        cudaError_t err = cudaSuccess;
        err = cudaMalloc((void **)&m_t, sizeof(T));
        if (err != cudaSuccess)
        {
            throw CudaException(err);
        }
        // need to copy the memory
    }

    static void CudaMemcpyHostToDevice(CudaMem<T>& _deviceMem, T* _hostMem)
    {
        cudaError_t err = cudaSuccess;
        err = cudaMemcpy(_deviceMem.m_t, _hostMem, sizeof(T), cudaMemcpyHostToDevice);
        if(err != cudaSuccess)
        {
            throw CudaException(err);
        }
    }

    static void CudaMemcpyDeviceToHost(CudaMem<T>& _deviceMem, T* _hostMem)
    {
        cudaError_t err = cudaSuccess;
        err = cudaMemcpy(_hostMem, _deviceMem.m_t, sizeof(T), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess)
        {
            throw CudaException(err);
        }
    }

    ~CudaMem()
    {
        cout << "CudaMem dtor" << endl;
        cudaFree(m_t);
    }

    const T* Get()
    {
        return m_t;
    }

    operator void*()
    {
        return m_t;
    }

    operator T*()
    {
        return m_t;
    }

    T* operator->()
    {
        return m_t;
    }

    T* const m_t;

private:
};
cudaException.h
#pragma once
#include <string>
#include <sstream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using std::string;
using std::stringstream;
class CudaException : public std::exception
{
public:
    CudaException(cudaError_t _err):m_str(cudaGetErrorString(_err))
    {
    }

    CudaException(cudaError_t _err, string _file, int _line):m_str(cudaGetErrorString(_err))
    {
        string s;
        stringstream out;
        out << _line;
        s = out.str();
        m_str += " At file: " + _file + " At line: " + s;
    }

    virtual const char* what() const throw ()
    {
        return m_str.c_str();
    }

    virtual ~CudaException() throw (){}

private:
    string m_str;
};
Basically, what I am trying to do is wrap cudaMalloc / cudaFree in a class.
When I don't use this CudaMem class, everything works fine.
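For reference, a minimal sketch of the plain-API path that works without CudaMem (essentially the commented-out lines in main.cu, assuming the kernel is declared to take GameOfLife<DIM, DIM>* directly):

// Sketch of the raw cudaMalloc/cudaMemcpy path from the commented-out lines in main.cu.
// The kernel is assumed here to be: __global__ void Func(GameOfLife<DIM, DIM>* gol) { gol->Step(); }
GameOfLife<DIM, DIM>* golDevice = 0;
cudaMalloc((void **)&golDevice, sizeof(GameOfLife<DIM, DIM>));
cudaMemcpy(golDevice, &golHost, sizeof(GameOfLife<DIM, DIM>), cudaMemcpyHostToDevice);

Func<<<DIM, DIM>>>(golDevice);    // golDevice really is a device pointer here

cudaMemcpy(&golHost, golDevice, sizeof(GameOfLife<DIM, DIM>), cudaMemcpyDeviceToHost);
cudaFree(golDevice);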
Best answer
The problem is with this kernel launch:
Func<<<DIM, DIM>>>(&golDevice);
I was passing a host memory pointer (the address of the CudaMem wrapper, which lives on the host) instead of a device memory pointer.
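One way to apply the fix (a sketch, assuming it is acceptable to change the kernel signature): let the kernel take the device-side object directly and launch it with the device pointer that CudaMem already owns (its public member m_t), rather than with &golDevice:

// Sketch of the corrected launch: the kernel receives the device pointer held by CudaMem.
__global__ void Func(GameOfLife<DIM, DIM>* gol)
{
    gol->Step();
}

// in main(), instead of Func<<<DIM, DIM>>>(&golDevice);
Func<<<DIM, DIM>>>(golDevice.m_t);    // m_t points to the device memory allocated by cudaMalloc

With &golDevice the kernel dereferences a host address and faults; since kernel launches are asynchronous, the failure is only reported by the next CUDA call, which is presumably why it surfaces as an unknown error from cudaMemcpy inside cudaT.h.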