我正在处理一个优化问题,该问题包含各种形式相似的数学函数,因此我将它们以FunctionObj
进行变形
template <typename T>
struct FunctionObj
{
T a;
FunctionObj(): a(1)
{
}
};
并定义一个
FuncEval
进行评估template <typename T>
__host__ __device__ inline T FuncEval(const FunctionObj<T> &f_obj, T x)
{
return f_obj.a+x;
}
我真正想做的是
sum {func(x)}
,所以我定义了一个FuncEvalF
仿函数来使用thrust::tranform_reduce
template <typename T>
struct FuncEvalF
{
const FunctionObj<T>& f_obj;
__host__ __device__ inline FuncEvalF(const FunctionObj<T>& in_f_obj) :f_obj(in_f_obj)
{
}
__host__ __device__ inline T operator()(T x)
{
return FuncEval(f_obj, x);
}
};
template <typename T>
__host__ __device__ inline T BatchFuncEval(const FunctionObj<T> &f_obj, int size, const T *x_in);
template<>
inline float BatchFuncEval< float>(const FunctionObj<float> &f_obj, int size, const float *x_in)
{
return thrust::transform_reduce(thrust::device, thrust::device_pointer_cast(x_in), thrust::device_pointer_cast(x_in + size), FuncEvalF<float>(f_obj), static_cast<float>(0), thrust::plus<float>());
}
最后在
main.cu
中,我将其称为transform_reduce
auto func = FuncEvalF<float>(FunctionObj<float>());
float result = 0;
try
{
result = thrust::transform_reduce(thrust::device, thrust::device_pointer_cast(dev_a), thrust::device_pointer_cast(dev_a + 10000), func, static_cast<float>(0), thrust::plus<float>());
}
catch (std::exception e)
{
printf("%s in thurst \n ", e.what());
}
异常(exception)是:
bulk_kernel_by_value
,即使我将10000更改为10。只有将FuncEval
的定义更改为,事情才会变得更好。return x;
该程序将输出正确但无意义的答案。我不禁要问我的代码有什么问题?感谢您的关注。
下面的完整代码,CUDA 7.0 sm_20
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <thrust/transform_reduce.h>
#include <thrust/transform.h>
#include <stdio.h>
template <typename T>
struct FunctionObj
{
T a;
FunctionObj(): a(1)
{
}
};
template <typename T>
__host__ __device__ inline T FuncEval(const FunctionObj<T> &f_obj, T x)
{
return f_obj.a+x;
}
template <typename T>
struct FuncEvalF
{
const FunctionObj<T>& f_obj;
__host__ __device__ inline FuncEvalF(const FunctionObj<T>& in_f_obj) :f_obj(in_f_obj)
{
}
__host__ __device__ inline T operator()(T x)
{
return FuncEval(f_obj, x);
}
};
template <typename T>
__host__ __device__ inline T BatchFuncEval(const FunctionObj<T> &f_obj, int size, const T *x_in);
template<>
inline float BatchFuncEval< float>(const FunctionObj<float> &f_obj, int size, const float *x_in)
{
return thrust::transform_reduce(thrust::device, thrust::device_pointer_cast(x_in), thrust::device_pointer_cast(x_in + size), FuncEvalF<float>(f_obj), static_cast<float>(0), thrust::plus<float>());
}
int main()
{
cudaError_t cudaE;
float a[10000] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
float* dev_a;
cudaE = cudaMalloc((void**)(&dev_a), sizeof(float) * 10000);
cudaE = cudaMemcpy(dev_a, a, sizeof(float) * 10000, cudaMemcpyHostToDevice);
auto func = FuncEvalF<float>(FunctionObj<float>());
float result = 0;
try
{
result = thrust::transform_reduce(thrust::device, thrust::device_pointer_cast(dev_a), thrust::device_pointer_cast(dev_a + 10000), func, static_cast<float>(0), thrust::plus<float>());
}
catch (std::exception e)
{
printf("%s in thurst \n ", e.what());
}
printf("the gpu float result is %f\n", result);
cudaFree(dev_a);
}
最佳答案
问题是f_obj
中的struct FuncEvalF
是const FunctionObj<T>&
。
在主机FunctionObj<float>()
上将其实例化为临时文件,但以后对其无效。
解决此问题的一种方法是创建它的副本,而不是保留对其的引用:
template <typename T>
struct FuncEvalF
{
FunctionObj<T> f_obj;
....
}