


I have the following simple CUDA Thrust code, which adds 10 to each element of a device vector, but the function is being called on the host side instead of on the device.

#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <stdio.h>
#include <thrust/device_vector.h>

// Returns x + 10.
// Compiled for both host and device. __CUDA_ARCH__ is only defined while the
// device version is being compiled, so the message printed reveals which side
// actually executed the call.
__host__ __device__ int add(int x){
    #if defined(__CUDA_ARCH__)
     printf("In device\n");
    #else
     printf("In host\n");
    #endif

    return x+10;
}

// Fills a four-element vector with 10, copies it to the device, and applies
// add() to every element in place.
int main(void)
{
    // Host-side vector of four elements, all set to 10.
    thrust::host_vector<int> H(4);
    H[0] = H[1] = H[2] = H[3] = 10;

    // Construct the device vector from the host vector.
    thrust::device_vector<int> data=H;

    // This is the call the question is about: std::transform is a plain
    // standard-library algorithm, so add() is invoked from host code here.
    std::transform(data.begin(), data.end(), data.begin(),add);
    return 0;
}





It looks like you have several issues, some already pointed out.

  1. 如果要使用 Thrust,应使用 thrust::transform 而不是 std::transform。std::transform 不了解 GPU、CUDA 或 Thrust,并且会调用 add 函数的主机版本。我不确定当您将 thrust::device_vector 传递给它时究竟会发生什么。

  1. If you want to use thrust, you should use thrust::transform, not std::transform. std::transform has no knowledge of the GPU or CUDA or thrust, and will dispatch the host version of your add function. I'm not sure what that would do exactly when you pass a thrust::device_vector to it.

  2. Thrust 算法需要使用函数对象(functor),而不是裸的 CUDA __device__ 函数,原因正如 Jared 所指出的(源代码中的 Thrust 算法实际上是主机代码,而该主机代码无法获得裸 __device__ 函数的地址)。经过此修复,您可以相当确定在处理设备向量时 Thrust 会调度设备代码路径。

  2. Thrust algorithms need to use function objects (functors) rather than bare CUDA __device__ functions, for the reason indicated by Jared: the thrust algorithm in your source code is actually host code, and that host code cannot discover the address of a bare __device__ function. With this fix, you can be pretty certain that thrust will dispatch the device code path when working on device vectors.


$ cat t856.cu
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

// Function object that returns its argument plus 10.
// operator() is compiled for both host and device. __CUDA_ARCH__ is only
// defined while the device version is being compiled, so the message printed
// reveals which side actually executed the call.
struct my_func {

__host__ __device__
  int operator()(int x) const {   // const: the functor carries no state
    #if defined(__CUDA_ARCH__)
     printf("In device, x is %d\n", x);
    #else
     printf("In host, x is %d\n", x);
    #endif

    return x+10;
  }
};

// Fills a four-element vector with 10, copies it to the device, and applies
// my_func to every element in place through thrust::transform.
int main(void)
{
    // Host-side vector of four elements, all set to 10.
    thrust::host_vector<int> H(4);
    H[0] = H[1] = H[2] = H[3] = 10;

    // Construct the device vector from the host vector.
    thrust::device_vector<int> data=H;

    // Input and output ranges are the same device vector, so each element is
    // overwritten with the functor's result.
    thrust::transform(data.begin(), data.end(), data.begin(),my_func());
    return 0;
}
$ nvcc -o t856 t856.cu
$ ./t856
In device, x is 10
In device, x is 10
In device, x is 10
In device, x is 10


09-10 22:16