I get an unknown error when cudaMemcpy is called from cudaT.h.
When compiling I get this warning, repeated several times for lines 46, 50 and 52:
./gameoflife.cu(46): Warning: Cannot tell what pointer points to, assuming global memory space
main.cu
#include "gameoflife.cu"
#include <iostream>
using namespace std;
#define DIM 20
__global__ void Func(CudaMem<GameOfLife<DIM, DIM> > * golDevice)
{
    golDevice->m_t->Step();
}

int main()
{
    GameOfLife<DIM, DIM> golHost;

    golHost.CreateCell(3,3);
    golHost.CreateCell(4,3);
    golHost.CreateCell(5,3);
    golHost.CreateCell(5,2);
    golHost.CreateCell(4,1);

    golHost.CreateCell(1,7);
    golHost.CreateCell(2,7);
    golHost.CreateCell(3,7);

    cout << golHost << endl;

    CudaMem<GameOfLife<DIM, DIM> > golDevice;
    // GameOfLife<DIM, DIM>* golDevice;
    // cudaMalloc((void **)&golDevice, sizeof(GameOfLife<DIM, DIM>));

    CudaMem<GameOfLife<DIM, DIM> >::CudaMemcpyHostToDevice(golDevice, &golHost);
    // cudaMemcpy(golDevice, &golHost, sizeof(GameOfLife<DIM, DIM>), cudaMemcpyHostToDevice);

    for(int i = 0 ; i < 1 ; ++i)
    {
        // cout << "Press anykey to exit.";
        // cin.ignore();
        // cin.get();

        Func<<<DIM, DIM>>>(&golDevice);

        CudaMem<GameOfLife<DIM, DIM> >::CudaMemcpyDeviceToHost(golDevice, &golHost);
        // cudaMemcpy(&golHost, golDevice, sizeof(GameOfLife<DIM, DIM>), cudaMemcpyDeviceToHost);

        cout << golHost << endl;
    }

    // cudaFree(golDevice);
    cudaDeviceReset();
}
gameoflife.cu
#include <cuda_runtime.h>
#include "cudaT.h"
#include <cstddef>
#include <iostream>
using namespace std;
template<size_t ROWS, size_t COLUMNS>
class GameOfLife
{
public:
    __host__ GameOfLife()
    {
        memset(m_dots, 0, sizeof(m_dots));
        memset(m_dots, 0, sizeof(m_temp));
    }

    __host__ __device__ ~GameOfLife()
    {
    }

    __host__ void CreateCell(size_t _row, size_t _column)
    {
        // need to check overflow
        m_dots[_row + 1][_column + 1] = 1;
    }

    __host__ void KillCell(size_t _row, size_t _column)
    {
        // need to check overflow
        m_dots[_row + 1][_column + 1] = 0;
    }

    __device__ void Step()
    {
        int liveNeighbours = 0;

        if(threadIdx.x > ROWS || blockIdx.x > COLUMNS )
        {
            return;
        }

        m_temp[threadIdx.x + 1][blockIdx.x + 1] = m_dots[threadIdx.x + 1][blockIdx.x + 1];
        __syncthreads();

        liveNeighbours = CalcLiveNeighbours() % 9;

        m_dots[threadIdx.x + 1][blockIdx.x + 1] = ( m_temp[threadIdx.x + 1][blockIdx.x + 1] && (liveNeighbours > 1 && liveNeighbours < 4) )
                                               || ( !m_temp[threadIdx.x + 1][blockIdx.x + 1] && liveNeighbours == 3) ;
    }

    template<size_t R, size_t C>
    __host__ friend ostream& operator<<(ostream& os, GameOfLife<R, C>& gol);

private:
    __device__ __host__ GameOfLife(const GameOfLife& other);

    __device__ int CalcLiveNeighbours()
    {
        return m_temp[threadIdx.x + 0][blockIdx.x + 0] +
               m_temp[threadIdx.x + 0][blockIdx.x + 1] +
               m_temp[threadIdx.x + 0][blockIdx.x + 2] +
               m_temp[threadIdx.x + 1][blockIdx.x + 0] +
               m_temp[threadIdx.x + 1][blockIdx.x + 2] +
               m_temp[threadIdx.x + 2][blockIdx.x + 0] +
               m_temp[threadIdx.x + 2][blockIdx.x + 1] +
               m_temp[threadIdx.x + 2][blockIdx.x + 2] ;
    }

    int m_temp[ROWS + 2][COLUMNS + 2];
    int m_dots[ROWS + 2][COLUMNS + 2];
};

template<size_t R, size_t C>
__host__ ostream& operator<<(ostream& os, GameOfLife<R, C>& gol)
{
    for(int i = 1 ; i < (R+1) ; ++i)
    {
        for(int j = 1 ; j < (C+1) ; ++j)
        {
            os << gol.m_dots[i][j] << " ";
        }
        os << endl;
    }
    return os;
}
cudaT.h
#pragma once
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include "cudaException.h"
#include <iostream>
using namespace std;
template<typename T>
class CudaMem
{
public:
    CudaMem():m_t(0)
    {
        cudaError_t err = cudaSuccess;
        err = cudaMalloc((void **)&m_t, sizeof(T));
        if (err != cudaSuccess)
        {
            throw CudaException(err);
        }
    }

    // CudaMem(T& copyFrom):m_t(0)
    explicit CudaMem(const CudaMem<T>& other):m_t(0)
    {
        cout << "CudaMem copy ctor" << endl;
        cudaError_t err = cudaSuccess;
        err = cudaMalloc((void **)&m_t, sizeof(T));
        if (err != cudaSuccess)
        {
            throw CudaException(err);
        }
        // need to copy the memory
    }

    static void CudaMemcpyHostToDevice(CudaMem<T>& _deviceMem, T* _hostMem)
    {
        cudaError_t err = cudaSuccess;
        err = cudaMemcpy(_deviceMem.m_t, _hostMem, sizeof(T), cudaMemcpyHostToDevice);
        if(err != cudaSuccess)
        {
            throw CudaException(err);
        }
    }

    static void CudaMemcpyDeviceToHost(CudaMem<T>& _deviceMem, T* _hostMem)
    {
        cudaError_t err = cudaSuccess;
        err = cudaMemcpy(_hostMem, _deviceMem.m_t, sizeof(T), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess)
        {
            throw CudaException(err);
        }
    }

    ~CudaMem()
    {
        cout << "CudaMem dtor" << endl;
        cudaFree(m_t);
    }

    const T* Get()
    {
        return m_t;
    }

    operator void*()
    {
        return m_t;
    }

    operator T*()
    {
        return m_t;
    }

    T* operator->()
    {
        return m_t;
    }

    T* const m_t;

private:
};
cudaException.h
#pragma once
#include <string>
#include <sstream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using std::string;
using std::stringstream;
class CudaException : public std::exception
{
public:
    CudaException(cudaError_t _err):m_str(cudaGetErrorString(_err))
    {
    }

    CudaException(cudaError_t _err, string _file, int _line):m_str(cudaGetErrorString(_err))
    {
        string s;
        stringstream out;
        out << _line;
        s = out.str();
        m_str += " At file: " + _file + " At line: " + s;
    }

    virtual const char* what() const throw ()
    {
        return m_str.c_str();
    }

    virtual ~CudaException() throw (){}

private:
    string m_str;
};
Basically, what I am trying to do is wrap cudaMalloc / cudaFree in a class.
When I don't use this CudaMem class, everything works fine.
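For reference, a minimal sketch of the plain-API path that works without CudaMem (essentially the commented-out lines in main.cu, assuming the kernel is declared to take GameOfLife<DIM, DIM>* directly):

// Sketch of the raw cudaMalloc/cudaMemcpy path from the commented-out lines in main.cu.
// The kernel is assumed here to be: __global__ void Func(GameOfLife<DIM, DIM>* gol) { gol->Step(); }
GameOfLife<DIM, DIM>* golDevice = 0;
cudaMalloc((void **)&golDevice, sizeof(GameOfLife<DIM, DIM>));
cudaMemcpy(golDevice, &golHost, sizeof(GameOfLife<DIM, DIM>), cudaMemcpyHostToDevice);

Func<<<DIM, DIM>>>(golDevice);    // golDevice really is a device pointer here

cudaMemcpy(&golHost, golDevice, sizeof(GameOfLife<DIM, DIM>), cudaMemcpyDeviceToHost);
cudaFree(golDevice);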
Best answer
The problem is with this kernel launch:
Func<<<DIM, DIM>>>(&golDevice);
I was passing a host memory pointer (the address of the CudaMem wrapper, which lives on the host) instead of a device memory pointer.
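One way to apply the fix (a sketch, assuming it is acceptable to change the kernel signature): let the kernel take the device-side object directly and launch it with the device pointer that CudaMem already owns (its public member m_t), rather than with &golDevice:

// Sketch of the corrected launch: the kernel receives the device pointer held by CudaMem.
__global__ void Func(GameOfLife<DIM, DIM>* gol)
{
    gol->Step();
}

// in main(), instead of Func<<<DIM, DIM>>>(&golDevice);
Func<<<DIM, DIM>>>(golDevice.m_t);    // m_t points to the device memory allocated by cudaMalloc

With &golDevice the kernel dereferences a host address and faults; since kernel launches are asynchronous, the failure is only reported by the next CUDA call, which is presumably why it surfaces as an unknown error from cudaMemcpy inside cudaT.h.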