c++ - 简单的Thrust代码的执行速度是我的朴素cuda内核的一半。我使用Thrust错误吗？

我对Cuda和Thrust还是很陌生，但是我的印象是，如果使用得当，Thrust应该比单纯编写的Cuda内核提供更好的性能。我是否以次佳的方式使用Thrust？下面是一个完整的最小示例，该示例采用长度为u的数组N+2，并且为i和1之间的每个N计算平均值0.5*(u[i-1] + u[i+1])并将结果放入uNew[i]。（uNew[0]设置为u[0]，u[N+1]设置为u[N+1]，以便边界项不变）。该代码会多次平均执行此操作，以获得合理的时间进行时序测试。在我的硬件上，Thrust计算所需的时间大约是朴素代码的两倍。有没有办法改善我的推力代码？

#include <iostream>
#include <thrust/device_vector.h>
#include <boost/timer.hpp>
#include <thrust/device_malloc.h>

typedef double numtype;

template <typename T> class NeighborAverageFunctor{
	int N;
public:
	NeighborAverageFunctor(int _N){
		N = _N;
	}
	template <typename Tuple>
	__host__ __device__ void operator()(Tuple t){
		T uL = thrust::get<0>(t);
		T uR = thrust::get<1>(t);

		thrust::get<2>(t) = 0.5*(uL + uR);
	}

	int getN(){
		return N;
	}
};

template <typename T> void thrust_sweep(thrust::device_ptr<T> u, thrust::device_ptr<T> uNew, NeighborAverageFunctor<T>& op){
	int N = op.getN();
	thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(u, u + 2, uNew + 1)), thrust::make_zip_iterator(thrust::make_tuple(u + N, u + N+2, uNew + N+1)), op);
	// Propagate boundary values without changing them
	uNew[0] = u[0];
	uNew[N+1] = u[N+1];
}


template <typename T> __global__ void initialization_kernel(int n, T* u){
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if(i < n+2){
		if(i == 0){
			u[i] = 1.0;
		}
		else{
			u[i] = 0.0;
		}
	}
}

template <typename T> __global__ void sweep_kernel(int n, T, T* u, T* uNew){
	const int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i >= 1 && i < n-1){
		uNew[i] = 0.5*(u[i+1] + u[i-1]);
	}
	else if(i == 0 || i == n+1){
		uNew[i] = u[i];
	}
}

int main(void){
	int sweeps = 2000;
	int N = 4096*2048;
	numtype h = 1.0/N;
	numtype hSquared = pow(h, 2);

	NeighborAverageFunctor<numtype> op(N);

	thrust::device_ptr<numtype> u_d = thrust::device_malloc<numtype>(N+2);
	thrust::device_ptr<numtype> uNew_d = thrust::device_malloc<numtype>(N+2);
	thrust::device_ptr<numtype> uTemp_d;

	thrust::fill(u_d, u_d + (N+2), 0.0);
	u_d[0] = 1.0;

	boost::timer::timer timer1;

	for(int k = 0; k < sweeps; k++){
		thrust_sweep<numtype>(u_d, uNew_d, op);
		uTemp_d = u_d;
		u_d = uNew_d;
		uNew_d = uTemp_d;
	}

	double thrust_time = timer1.elapsed();

	thrust::host_vector<numtype> u_h(N+2);
	thrust::copy(u_d, u_d + N+2, u_h.begin());
	for(int i = 0; i < 10; i++){
		std::cout << u_h[i] << " ";
	}
	std::cout << std::endl;

	thrust::device_free(u_d);
	thrust::device_free(uNew_d);

	numtype * u_raw_d, * uNew_raw_d, * uTemp_raw_d;
	cudaMalloc(&u_raw_d, (N+2)*sizeof(numtype));
	cudaMalloc(&uNew_raw_d, (N+2)*sizeof(numtype));

	numtype * u_raw_h = (numtype*)malloc((N+2)*sizeof(numtype));

	int block_size = 256;
	int grid_size = ((N+2) + block_size - 1) / block_size;

	initialization_kernel<numtype><<<grid_size, block_size>>>(N, u_raw_d);

	boost::timer::timer timer2;

	for(int k = 0; k < sweeps; k++){
		sweep_kernel<numtype><<<grid_size, block_size>>>(N+2, hSquared, u_raw_d, uNew_raw_d);
		uTemp_raw_d = u_raw_d;
		u_raw_d = uNew_raw_d;
		uNew_raw_d = uTemp_raw_d;
	}

	double raw_time = timer2.elapsed();

	cudaMemcpy(u_raw_h, u_raw_d, (N+2)*sizeof(numtype), cudaMemcpyDeviceToHost);

	for(int i = 0; i < 10; i++){
		std::cout << u_raw_h[i] << " ";
	}
	std::cout << std::endl;

	std::cout << "Thrust: " << thrust_time << " s" << std::endl;
	std::cout << "Raw: " << raw_time << " s" << std::endl;

	free(u_raw_h);

	cudaFree(u_raw_d);
	cudaFree(uNew_raw_d);

	return 0;
}

最佳答案

根据我的测试，这些行：

uNew[0] = u[0];
uNew[N+1] = u[N+1];

相对于内核方法，正在破坏您的推力性能。当我消除它们时，结果似乎没有什么不同。与内核处理边界情况相比，推力代码使用了非常昂贵的方法（cudaMemcpy操作，在后台）执行边界处理。

由于推力函子实际上从未写入边界位置，因此仅一次写入这些值而不是循环写入就足够了。

通过更好地处理边界情况，可以显着提高推力性能。

关于c++ - 简单的Thrust代码的执行速度是我的朴素cuda内核的一半。我使用Thrust错误吗？，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/29421482/