问题描述
我昨天已经发布了这个问题,但没有很好的收到,虽然我现在有固体的,请承担与我。以下是系统规格:
- Tesla K20m带331.67驱动程序,
- CUDA 6.0,
- Linux机器。
现在我有一个全局内存读应用程序,所以我试图优化它
__ ldg
在我读取全局内存的每个地方的指令。但是, __ ldg
根本没有提高性能,运行时间大约减少了4倍。所以我的问题是,如何用 __ ldg(glob_mem + index)
替换 glob_mem [index]
性能下降?这是我的问题的原始版本,供您重现: MAKE
CPP = g ++
CPPFLAGS = -Wall -O4 -std = c ++ 0x -lcudart -lcurand
LIBDIRS = / usr / local / cuda / lib64
NVCC = nvcc
NVCCINCLUDE = / usr / local / cuda / include
NVCC_COMPILER_FLAGS = -Iinclude / -O4 -arch compute_35 -code sm_35 -c
TARGET = example
.PHONY:所有清除清除
all:$(TARGET)
$(TARGET):kernel.o main.cpp
@echo链接可执行文件$(TARGET)...
@ $(CPP)$(CPPFLAGS)$(addprefix -I,$(NVCCINCLUDE))$(addprefix -L,$(LIBDIRS))-o $ @ $ ^
kernel.o:kernel.cu
@echo编译$ @...
$(NVCC)$(addprefix -I,$(NVCCINCLUDE))$ (NVCC_COMPILER_FLAGS)$< -o $ @
clean:clear
清除:
@echo删除对象文件...
- @ rm -f * .o
purge:clear
@echo删除可执行文件...
- @ rm -f $(TARGET)
main.cpp
#include< ;计时器>
#include< cstdio>
#includekernel.cuh
using namespace std;
int main()
{
auto start = chrono :: high_resolution_clock :: now();
double result = GetResult();
auto elapsed = chrono :: high_resolution_clock :: now() - start;
printf(%。3f,elapsed time:%.3f \\\
,result,(double)chrono :: duration_cast< std :: chrono :: microseconds> ));
return 0;
}
kernel.cuh
$ b
#ifndef kernel_cuh
#define kernel_cuh
#includecuda_runtime.h
#include device_launch_parameters.h
double GetResult();
#endif
kernel.cu / p>
#includekernel.cuh
class DeviceClass
{
double * d_a;
public:
__device__ DeviceClass(double * a)
:d_a(a){}
__device__ void foo(double * b,const int count)
{
int tid = threadIdx.x +(blockDim.x * blockIdx.x);
double result = 0.0;
for(int i = 0; i {
result + = d_a [i];
// result + = __ldg(d_a + i);
}
b [tid] = result;
}
};
__global__ void naive_kernel(double * c,const int count,DeviceClass ** deviceClass)
{
(* deviceClass) - > foo(c,count);
}
__global__ void create_device_class(double * a,DeviceClass ** deviceClass)
{
(* deviceClass)= new DeviceClass(a);
}
double GetResult()
{
const int aSize = 8388608;
const int gridSize = 8;
const int blockSize = 1024;
double * h_a = new double [aSize];
for(int i = 0; i {
h_a [i] = aSize-i;
}
double * d_a;
cudaMalloc((void **)& d_a,aSize * sizeof(double));
cudaMemcpy(d_a,h_a,aSize * sizeof(double),cudaMemcpyHostToDevice);
double * d_b;
cudaMalloc((void **)& d_b,gridSize * blockSize * sizeof(double));
DeviceClass ** d_devicesClasses;
cudaMalloc(& d_devicesClasses,sizeof(DeviceClass **));
create_device_class<<< 1,1>>>>(d_a,d_devicesClasses);
naive_kernel<<< gridSize,blockSize>>>(d_b,aSize,d_devicesClasses);
cudaDeviceSynchronize();
double h_b;
cudaMemcpy(& h_b,d_b,sizeof(double),cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
return h_b;
}
这是什么关系...在我的应用程序中,我有一些全局
- 使用make创建此数据库然后执行./example,
- 运行此示例,得到:35184376283136.000,elapsed time:2054676.000。
- 在kernel.cu和注释掉它的行右上方的结果变成:35184376283136.000,已用时间:3288975.000
- 所以使用__ldg性能相当显着,即使我使用它,没有任何问题在不同的场合。可能的原因是什么?
版本使用 __ ldg
越慢是因为NVCC编译器无法在此特定场景中正确执行循环展开优化。该问题已提交给NVIDIA,ID为1605303. NVIDIA团队最近的回复如下:
I posted this issue already yesterday, but wasnt well received, though I have solid repro now, please bear with me. Here are system specs:
- Tesla K20m with 331.67 driver,
- CUDA 6.0,
- Linux machine.
Now I have a global memory read heavy application therefore I tried to optimize it using __ldg
instruction on every single place where I am reading global memory. However, __ldg
did not improve performance at all, running time decreased roughly 4x. So my question is, how comes that replacing glob_mem[index]
with __ldg(glob_mem + index)
can possibly result into decreased performance? Here is a primitive version of my problem for you to reproduce:
MAKE
CPP=g++
CPPFLAGS=-Wall -O4 -std=c++0x -lcudart -lcurand
LIBDIRS=/usr/local/cuda/lib64
NVCC=nvcc
NVCCINCLUDE=/usr/local/cuda/include
NVCC_COMPILER_FLAGS=-Iinclude/ -O4 -arch compute_35 -code sm_35 -c
TARGET=example
.PHONY: all clear clean purge
all: $(TARGET)
$(TARGET): kernel.o main.cpp
@echo Linking executable "$(TARGET)" ...
@$(CPP) $(CPPFLAGS) $(addprefix -I,$(NVCCINCLUDE)) $(addprefix -L,$(LIBDIRS)) -o $@ $^
kernel.o: kernel.cu
@echo Compiling "$@" ...
$(NVCC) $(addprefix -I,$(NVCCINCLUDE)) $(NVCC_COMPILER_FLAGS) $< -o $@
clean: clear
clear:
@echo Removing object files ...
-@rm -f *.o
purge: clear
@echo Removing executable ...
-@rm -f $(TARGET)
main.cpp
#include <chrono>
#include <cstdio>
#include "kernel.cuh"
using namespace std;
int main()
{
auto start = chrono::high_resolution_clock::now();
double result = GetResult();
auto elapsed = chrono::high_resolution_clock::now() - start;
printf("%.3f, elapsed time: %.3f \n", result, (double)chrono::duration_cast<std::chrono::microseconds>(elapsed).count());
return 0;
}
kernel.cuh
#ifndef kernel_cuh
#define kernel_cuh
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
double GetResult();
#endif
kernel.cu
#include "kernel.cuh"
class DeviceClass
{
double* d_a;
public:
__device__ DeviceClass(double* a)
: d_a(a) {}
__device__ void foo(double* b, const int count)
{
int tid = threadIdx.x + (blockDim.x * blockIdx.x);
double result = 0.0;
for (int i = 0; i < count; ++i)
{
result += d_a[i];
//result += __ldg(d_a + i);
}
b[tid] = result;
}
};
__global__ void naive_kernel(double* c, const int count, DeviceClass** deviceClass)
{
(*deviceClass)->foo(c, count);
}
__global__ void create_device_class(double* a, DeviceClass** deviceClass)
{
(*deviceClass) = new DeviceClass(a);
}
double GetResult()
{
const int aSize = 8388608;
const int gridSize = 8;
const int blockSize = 1024;
double* h_a = new double[aSize];
for (int i = 0; i <aSize; ++i)
{
h_a[i] = aSize - i;
}
double* d_a;
cudaMalloc((void**)&d_a, aSize * sizeof(double));
cudaMemcpy(d_a, h_a, aSize * sizeof(double), cudaMemcpyHostToDevice);
double* d_b;
cudaMalloc((void**)&d_b, gridSize * blockSize * sizeof(double));
DeviceClass** d_devicesClasses;
cudaMalloc(&d_devicesClasses, sizeof(DeviceClass**));
create_device_class<<<1,1>>>(d_a, d_devicesClasses);
naive_kernel<<<gridSize, blockSize>>>(d_b, aSize, d_devicesClasses);
cudaDeviceSynchronize();
double h_b;
cudaMemcpy(&h_b, d_b, sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
return h_b;
}
So what is it all about... In my application I have some global data pointed to by member variable of class DeviceClass which is created on device, exactly as new/delete CUDA demo shows.
- Build this using make and then execute ./example,
- Running this example as is yields: "35184376283136.000, elapsed time: 2054676.000".
- After I uncomment line 17 in kernel.cu and comment out line right above it the result becomes: "35184376283136.000, elapsed time: 3288975.000"
- so using __ldg decreases performance quite significantly even though I was using it up until now without any issues on different occasions. What could be the cause?
The reason for the version using __ldg
being slower is the fact that the NVCC compiler is not able to perform loop unrolling optimizations correctly in this particular scenario. The issue was submitted to NVIDIA with ID 1605303. The most recent response from NVIDIA team is as follows:
这篇关于__ldg在某些情况下导致执行时间较慢的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!