验证4个SMSP是否是串行访问ShareMemory的
原以为4个smsp中的warp在没有bank冲突的情况下,是可以并行访问共享内存的
通过下面的测试发现,其实是串行的,share memory每个cycle只能处理一个请求
测试过程
tee shm_kernel.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
__global__ void shm_kernel(float *input,float *output) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
__shared__ float shm_data[0xc000/4];
float vals;
clock_t t0=clock64();
vals=shm_data[tid];
__syncthreads();
clock_t t1=clock64();
vals*=(tid);
output[tid]=vals;
if(tid==0)
{
printf("ts:%lld\n",t1-t0);
}
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx shm_kernel.cu -o shm_kernel.ptx
/usr/local/cuda/bin/nvcc -arch=sm_86 shm_kernel.ptx -cubin -o shm_kernel.cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 shm_kernel.cubin -fatbin -o shm_kernel.fatbin
/usr/local/cuda/bin/cuobjdump --dump-sass shm_kernel.fatbin
tee shm_kernel_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>
int main(int argc,char *argv[])
{
CUresult error;
CUdevice cuDevice;
cuInit(0);
int deviceCount = 0;
error = cuDeviceGetCount(&deviceCount);
error = cuDeviceGet(&cuDevice, 0);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in get device!\n");
}
CUcontext cuContext;
error = cuCtxCreate(&cuContext, 0, cuDevice);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in create context!\n");
}
CUmodule module;
CUfunction function;
const char* module_file = "shm_kernel.fatbin";
const char* kernel_name = "_Z10shm_kernelPfS_";
error = cuModuleLoad(&module, module_file);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in load moudle %d!\n",error);
}
error = cuModuleGetFunction(&function, module, kernel_name);
if(error!=CUDA_SUCCESS)
{
printf("get function error!\n");
}
int thread_size_conf[3]={32,32*4,32*4*4};
for(int k=0;k<3;k++)
{
int block_size=1;
int thread_size=thread_size_conf[k];
int data_size=sizeof(float)*thread_size*block_size;
float *output_ptr=nullptr;
float *input_ptr=nullptr;
int cudaStatus=0;
cudaStatus = cudaMalloc((void**)&input_ptr, data_size);
if(cudaStatus)
{
printf("cudaMalloc input_ptr Failed\n");
}
cudaStatus= cudaMalloc((void**)&output_ptr, data_size);
if(cudaStatus)
{
printf("cudaMalloc output_ptr Failed\n");
}
void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};
auto ret=cuLaunchKernel(function,
block_size, 1, 1,
thread_size, 1, 1,
0,0,kernelParams, 0);
cudaError_t cudaerr = cudaDeviceSynchronize();
if (cudaerr != cudaSuccess)
printf("kernel launch failed with error \"%s\".\n",cudaGetErrorString(cudaerr));
cudaFree(output_ptr);
cudaFree(input_ptr);
}
cuModuleUnload(module);
cuCtxDestroy(cuContext);
return 0;
}
EOF
g++ shm_kernel_main.cpp -o shm_kernel_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,\
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum,\
smsp__sass_inst_executed_op_shared_ld.sum,\
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.peak_sustained,\
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained,\
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum ./shm_kernel_main
输出
ts:33
ts:1551
0%....50%....100% - 3 passes
==PROF== Profiling "shm_kernel(float *, float *)" - 1: ts:39
ts:39
ts:1622
0%....50%....100% - 3 passes
==PROF== Profiling "shm_kernel(float *, float *)" - 2: ts:64
ts:57
ts:1706
0%....50%....100% - 3 passes
==PROF== Disconnected from process 657443
[657443] shm_kernel_main@127.0.0.1
shm_kernel(float *, float *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
---------------------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
---------------------------------------------------------------------- ----------- ------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained 1/cycle 1
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum 1
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.peak_sustained 1/cycle 28
smsp__sass_inst_executed_op_shared_ld.sum inst 1
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum 0
---------------------------------------------------------------------- ----------- ------------
shm_kernel(float *, float *) (1, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
---------------------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
---------------------------------------------------------------------- ----------- ------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained 1/cycle 1
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum 4
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.peak_sustained 1/cycle 28
smsp__sass_inst_executed_op_shared_ld.sum inst 4
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum 0
---------------------------------------------------------------------- ----------- ------------
shm_kernel(float *, float *) (1, 1, 1)x(512, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
---------------------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
---------------------------------------------------------------------- ----------- ------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained 1/cycle 1
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum 16
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.peak_sustained 1/cycle 28
smsp__sass_inst_executed_op_shared_ld.sum inst 16
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum 0
---------------------------------------------------------------------- ----------- ------------