我正在使用Tesla C1060 GPU,计算能力1.3,Ubuntu 12.04上的Cuda 5。在我的内核中,每个线程都会计算(私有)本地浮点数组locArr[]
的值。
然后,使用var
计算浮点变量locArr[]
的值。
__global__ void gpuKernel
(
float *src, float *out,
size_t memPitchAux, int w
)
{
float locArr[256];
float var=0.0f;
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
int tidy = blockIdx.y * blockDim.y + threadIdx.y;
int idx = tidy * memPitchAux/4 + tidx;
for(int ic=0; ic<256; ic++)
{
locArr[ic] = 0.0f;
}
for(int is=0; is<255; is++)
{
int ic = fabs(src[tidy*w +tidx]);
locArr[ic] += 1.0f;
}
for(int ic=0; ic<255; ic++)
{
var += locArr[ic];
}
out[idx] = var;
}
每个线程没有足够的寄存器,因此
locArr[]
会溢出到全局内存中。执行此内核的时间约为18毫秒,但如果将locArr[]
乘以零,则该时间将降至1.1毫秒。for(int ic=0; ic<255; ic++)
{
var += locArr[ic] * 0.0f;
}
我不明白为什么,每个线程无论如何都应该读取全局内存中所需的
locArr[]
值,然后将其乘以零。时间应相同。相反,就好像线程已经知道不需要读取数据一样,因为输出始终为零。谁能告诉我发生了什么事?
编辑:如果相反,我有
for(int ic=0; ic<255; ic++)
{
var += locArr[ic] * locArr2[ic];
}
其中,
locArr2[]
是零的本地数组(溢出到全局内存),优化可以在运行时完成吗?编辑2:我的makefile
################################################################################
#
# Makefile project only supported on Mac OSX and Linux Platforms)
#
################################################################################
# OS Name (Linux or Darwin)
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
# Flags to detect 32-bit or 64-bit OS platform
OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/")
OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/")
# These flags will override any settings
ifeq ($(i386),1)
OS_SIZE = 32
OS_ARCH = i686
endif
ifeq ($(x86_64),1)
OS_SIZE = 64
OS_ARCH = x86_64
endif
# Flags to detect either a Linux system (linux) or Mac OSX (darwin)
DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
# Location of the CUDA Toolkit binaries and libraries
CUDA_PATH ?= /usr/local/cuda-5.0
CUDA_INC_PATH ?= $(CUDA_PATH)/include
CUDA_BIN_PATH ?= $(CUDA_PATH)/bin
ifneq ($(DARWIN),)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
ifeq ($(OS_SIZE),32)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64
endif
endif
# Common binaries
NVCC ?= $(CUDA_BIN_PATH)/nvcc
GCC ?= g++
# Extra user flags
EXTRA_NVCCFLAGS ?=
EXTRA_LDFLAGS ?=
EXTRA_CCFLAGS ?=
# CUDA code generation flags
GENCODE_SM10 := -gencode arch=compute_10,code=sm_10
GENCODE_SM20 := -gencode arch=compute_20,code=sm_20
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30)
# OS-specific build flags
ifneq ($(DARWIN),)
LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -arch $(OS_ARCH)
else
ifeq ($(OS_SIZE),32)
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m32
else
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m64
endif
endif
# OS-architecture specific flags
ifeq ($(OS_SIZE),32)
NVCCFLAGS := -m32
else
NVCCFLAGS := -m64
endif
# OpenGL specific libraries
ifneq ($(DARWIN),)
# Mac OSX specific libraries and paths to include
LIBPATH_OPENGL := -L../../common/lib/darwin -L/System/Library/Frameworks/OpenGL.framework/Libraries -framework GLUT -lGL -lGLU ../../common/lib/darwin/libGLEW.a
else
# Linux specific libraries and paths to include
LIBPATH_OPENGL := -L../../common/lib/linux/$(OS_ARCH) -L/usr/X11R6/lib -lGL -lGLU -lX11 -lXi -lXmu -lglut -lGLEW -lrt
endif
# Debug build flags
ifeq ($(dbg),1)
CCFLAGS += -g
NVCCFLAGS += -g -G
TARGET := debug
else
TARGET := release
endif
# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_INC_PATH) -I. -I.. -I../../common/inc
LDFLAGS += $(LIBPATH_OPENGL)
# Target rules
all: build
build: stackOverflow
stackOverflow.o: stackOverflow.cu
$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
stackOverflow: stackOverflow.o
$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
mkdir -p ./bin/$(OSLOWER)/$(TARGET)
cp $@ ./bin/$(OSLOWER)/$(TARGET)
run: build
./stackOverflow
clean:
rm -f stackOverflow.o stackOverflow *.pgm
最佳答案
逆风的假设可能是正确的。
我已经分解了原始的__global__
函数和乘以0.f
的版本。我已将256
更改为4
,以便缩短反汇编的代码。如您所见,原始代码的FADD
更改为FFMA
,这可能会更快,并且是您观察到的加速的原因。
原始版本
MOV R1, c[0x1][0x100];
NOP;
ISUB R1, R1, 0x10;
S2R R0, SR_CTAID.Y;
S2R R2, SR_TID.Y;
IMAD R2, R0, c[0x0][0xc], R2;
S2R R3, SR_CTAID.X;
S2R R4, SR_TID.X;
IMAD R3, R3, c[0x0][0x8], R4;
MOV32I R0, 0x4;
IMAD R4, R2, c[0x0][0x38], R3;
IMAD R6.CC, R4, R0, c[0x0][0x20];
IMAD.HI.X R7, R4, R0, c[0x0][0x24];
IMUL.U32.U32 R11.CC, R2, c[0x0][0x30];
LD.E R4, [R6];
STL.64 [R1], RZ;
SHR.U32 R11, R11, 0x2;
STL.64 [R1+0x8], RZ;
IMAD.U32.U32.HI.X R5.CC, R2, c[0x0][0x30], RZ;
F2I.S32.F32.TRUNC R4, |R4|;
ISCADD R9, R4, R1, 0x2;
LDL R4, [R9];
FADD R4, R4, 1;
FADD R4, R4, 1;
FADD R4, R4, 1;
FADD R8, R4, 1;
BFE R4, R2, 0x11f;
STL [R9], R8;
IMAD.U32.U32.X R10, R4, c[0x0][0x30], R5;
LDL R7, [R1];
IMAD.U32.U32 R10, R2, c[0x0][0x34], R10;
LDL R6, [R1+0x4];
ISCADD R8, R10, R11, 0x1e;
LDL.64 R4, [R1+0x8];
IADD R3, R8, R3;
F2F.F32.F32 R2, R7;
FADD R2, R2, R6;
IMAD.U32.U32 R6.CC, R3, R0, c[0x0][0x28];
FADD R2, R2, R4;
IMAD.HI.X R7, R3, R0, c[0x0][0x2c];
FADD R0, R2, R5;
ST.E [R6], R0;
EXIT ;
0
的乘法MOV R1, c[0x1][0x100];
NOP;
ISUB R1, R1, 0x10;
S2R R0, SR_CTAID.Y;
S2R R2, SR_TID.Y;
IMAD R2, R0, c[0x0][0xc], R2;
S2R R3, SR_CTAID.X;
S2R R4, SR_TID.X;
IMAD R3, R3, c[0x0][0x8], R4;
MOV32I R0, 0x4;
IMAD R4, R2, c[0x0][0x38], R3;
IMAD R6.CC, R4, R0, c[0x0][0x20];
IMAD.HI.X R7, R4, R0, c[0x0][0x24];
IMUL.U32.U32 R11.CC, R2, c[0x0][0x30];
LD.E R4, [R6];
STL.64 [R1], RZ;
SHR.U32 R11, R11, 0x2;
STL.64 [R1+0x8], RZ;
IMAD.U32.U32.HI.X R5.CC, R2, c[0x0][0x30], RZ;
F2I.S32.F32.TRUNC R4, |R4|;
ISCADD R9, R4, R1, 0x2;
LDL R4, [R9];
FADD R4, R4, 1;
FADD R4, R4, 1;
FADD R4, R4, 1;
FADD R8, R4, 1;
BFE R4, R2, 0x11f;
STL [R9], R8;
IMAD.U32.U32.X R10, R4, c[0x0][0x30], R5;
LDL R6, [R1];
IMAD.U32.U32 R10, R2, c[0x0][0x34], R10;
LDL R7, [R1+0x4];
ISCADD R8, R10, R11, 0x1e;
LDL.64 R4, [R1+0x8];
IADD R3, R8, R3;
FFMA R2, R6, RZ, RZ;
FFMA R2, R7, RZ, R2;
IMAD.U32.U32 R6.CC, R3, R0, c[0x0][0x28];
FFMA R2, R4, RZ, R2;
IMAD.HI.X R7, R3, R0, c[0x0][0x2c];
FFMA R0, R5, RZ, R2;
ST.E [R6], R0;
EXIT ;
关于c++ - 如果将本地数组乘以零,为什么时序会减少?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/21259216/