Compare the efficiency of data copies, and of data copies combined with kernel launches, when using a single stream versus multiple streams (4 of them).
▶ Source code

#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>      // assumes 64-bit Windows and a 64-bit target; support for other systems has been removed

#define MEMORY_ALIGNMENT 4096                                        // align host memory to 4 KB
#define ALIGN_UP(x,size) (((size_t)x + (size - 1)) & (~(size - 1)))  // round x up to the next multiple of size

__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < num_iterations; i++)
        g_data[idx] += *factor;
}

bool check(int *a, const int nArray, const int c)
{
    for (int i = 0; i < nArray; i++)
    {
        if (a[i] != c)
        {
            printf("\nArray\tError at i = %d, %d, %d\n", i, a[i], c);
            return false;
        }
    }
    return true;
}

inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nByte)
{
    if (bPinGenericMemory)  // allocate generic page-aligned memory and pin it
    {
        printf("\nVirtualAlloc(), %4.2f MB (generic page-aligned system memory)\n", (float)nByte / 1048576.0f);
        *pp_a = (int *)VirtualAlloc(NULL, (nByte + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
        *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);
        cudaHostRegister(*ppAligned_a, nByte, cudaHostRegisterMapped);  // pin the memory; required for asynchronous copies
    }
    else
    {
        printf("\ncudaMallocHost(), %4.2f MB\n", (float)nByte / 1048576.0f);
        cudaMallocHost((void **)pp_a, nByte);  // already page-locked at allocation
        *ppAligned_a = *pp_a;
    }
}

int main()  // uses fixed default parameters instead of reading them from the command line
{
    printf("\n\tStart\n");
    int nreps = 10;        // number of times each timing experiment is repeated
    int niterations = 5;   // number of iterations inside the kernel
    int nstreams = 4;      // number of streams to use
    float elapsed_time;
    bool bPinGenericMemory;

    cudaSetDevice(0);      // the device-selection logic of the original sample has been removed
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    if (deviceProp.canMapHostMemory)  // check whether the GPU can map host memory; otherwise generic memory cannot be used
        bPinGenericMemory = true;
    else
    {
        printf("\nDevice does not support mapping of generic host memory, using cudaMallocHost() instead\n");
        bPinGenericMemory = false;
    }

    // reduce the workload when the GPU has fewer than 32 CUDA cores in total
    // (the original sample did not shrink nByte accordingly; fixed here)
    float scale_factor = max(32.0f / float(_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount), 1.0f);
    int nArray = (int)rint((float)(16 * 1024 * 1024) / scale_factor);  // number of elements in the test array
    int nByte = nArray * sizeof(int);                                  // size of the test array in bytes
    printf("\nWorkload *= %1.4f, array_size = %d\n", 1.0f / scale_factor, nArray);

    // blocking synchronization reduces CPU usage while waiting
    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | (bPinGenericMemory ? cudaDeviceMapHost : 0));

    int *h_a = NULL, *hAligned_a = NULL;
    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nByte);  // allocate host memory using the selected method

    int c = 5, *d_a = NULL, *d_c = NULL;
    cudaMalloc((void **)&d_a, nByte);
    cudaMemset(d_a, 0x0, nByte);
    cudaMalloc((void **)&d_c, sizeof(int));
    cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice);

    cudaEvent_t start_event, stop_event;
    cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);

    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
    for (int i = 0; i < nstreams; i++)
        cudaStreamCreate(&(streams[i]));

    printf("\n\tStart test\n");

    // asynchronous copy test
    cudaEventRecord(start_event, 0);
    cudaMemcpyAsync(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost, streams[0]);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("memcopy:\t%.2f\n", elapsed_time);

    // kernel test
    dim3 threads = dim3(512, 1);
    dim3 blocks = dim3(nArray / threads.x);
    cudaEventRecord(start_event, 0);
    init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("kernel:\t\t%.2f\n", elapsed_time);

    // serial (non-streamed) test
    cudaEventRecord(start_event, 0);
    for (int k = 0; k < nreps; k++)
    {
        init_array<<<blocks, threads>>>(d_a, d_c, niterations);
        cudaMemcpy(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

    // multi-stream test
    blocks = dim3(nArray / (nstreams * threads.x), 1);
    memset(hAligned_a, 255, nByte);
    cudaMemset(d_a, 0, nByte);
    cudaEventRecord(start_event, 0);
    for (int k = 0; k < nreps; k++)  // distribute the kernels and the copies back to the host across the streams
    {
        for (int i = 0; i < nstreams; i++)
            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * nArray / nstreams, d_c, niterations);
        for (int i = 0; i < nstreams; i++)
            cudaMemcpyAsync(hAligned_a + i * nArray / nstreams, d_a + i * nArray / nstreams, nByte / nstreams, cudaMemcpyDeviceToHost, streams[i]);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);

    // check the result and clean up
    printf("\n\tResult: %s\n", check(hAligned_a, nArray, c * nreps * niterations) ? "Passed" : "Failed");
    cudaFree(d_a);
    cudaFree(d_c);
    if (bPinGenericMemory)
    {
        cudaHostUnregister(hAligned_a);
        VirtualFree(h_a, 0, MEM_RELEASE);
    }
    else
        cudaFreeHost(h_a);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    for (int i = 0; i < nstreams; i++)
        cudaStreamDestroy(streams[i]);
    getchar();
    return 0;
}

▶ Output

    Start

Workload *= 1.0000, array_size = 16777216

VirtualAlloc(), 64.00 MB (generic page-aligned system memory)

    Start test
memcopy: 5.34
kernel: 5.15
non-streamed: 9.95
4 streams: 5.24

    Result: Passed
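
Reading the numbers (times are in milliseconds, as reported by cudaEventElapsedTime()): the non-streamed loop costs roughly memcopy + kernel (5.34 + 5.15), while the 4-stream version costs about as much as the slower of the two operations alone, because the copies issued in one stream overlap with the kernels running in the others. Whether a device can overlap copies with kernel execution at all can be queried up front; the following is a minimal sketch of such a check (not part of the sample program above):

 // Hypothetical capability check, not in the original program: overlapping a
// cudaMemcpyAsync() with a kernel requires pinned host memory plus at least
// one asynchronous copy engine on the device.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("deviceOverlap    = %d\n", prop.deviceOverlap);    // legacy flag: 1 if copy/kernel overlap is supported
printf("asyncEngineCount = %d\n", prop.asyncEngineCount); // 0: no overlap, 1: one direction, 2: both directions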

▶ Takeaways

● Related macros and function prototypes

 // driver_types.h
#define cudaStreamPerThread ((cudaStream_t)0x2)

#define cudaEventDefault        0x00  // Default event flag
#define cudaEventBlockingSync   0x01  // Event uses blocking synchronization
#define cudaEventDisableTiming  0x02  // Event will not record timing data
#define cudaEventInterprocess   0x04  // Event is suitable for interprocess use. cudaEventDisableTiming must be set

#define cudaDeviceScheduleAuto          0x00  // Device flag - Automatic scheduling
#define cudaDeviceScheduleSpin          0x01  // Device flag - Spin default scheduling
#define cudaDeviceScheduleYield         0x02  // Device flag - Yield default scheduling
#define cudaDeviceScheduleBlockingSync  0x04  // Device flag - Use blocking synchronization
#define cudaDeviceBlockingSync          0x04  // Device flag - Use blocking synchronization
                                              // \deprecated This flag was deprecated as of CUDA 4.0 and
                                              // replaced with ::cudaDeviceScheduleBlockingSync
#define cudaDeviceScheduleMask          0x07  // Device schedule flags mask
#define cudaDeviceMapHost               0x08  // Device flag - Support mapped pinned allocations
#define cudaDeviceLmemResizeToMax       0x10  // Device flag - Keep local memory allocation after launch
#define cudaDeviceMask                  0x1f  // Device flags mask

#define cudaArrayDefault           0x00  // Default CUDA array allocation flag
#define cudaArrayLayered           0x01  // Must be set in cudaMalloc3DArray to create a layered CUDA array
#define cudaArraySurfaceLoadStore  0x02  // Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array
#define cudaArrayCubemap           0x04  // Must be set in cudaMalloc3DArray to create a cubemap CUDA array
#define cudaArrayTextureGather     0x08  // Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array

#define cudaIpcMemLazyEnablePeerAccess 0x01  // Automatically enable peer access between remote devices as needed

#define cudaMemAttachGlobal  0x01  // Memory can be accessed by any stream on any device
#define cudaMemAttachHost    0x02  // Memory cannot be accessed by any stream on any device
#define cudaMemAttachSingle  0x04  // Memory can only be accessed by a single stream on the associated device

#define cudaOccupancyDefault                 0x00  // Default behavior
#define cudaOccupancyDisableCachingOverride  0x01  // Assume global caching is enabled and cannot be automatically turned off

#define cudaCpuDeviceId      ((int)-1)  // Device id that represents the CPU
#define cudaInvalidDeviceId  ((int)-2)  // Device id that represents an invalid device

// cuda_runtime_api.h
extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags);
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);

// memoryapi.h
WINBASEAPI _Ret_maybenull_ _Post_writable_byte_size_(dwSize) LPVOID WINAPI VirtualAlloc
(
    _In_opt_ LPVOID lpAddress,
    _In_ SIZE_T dwSize,
    _In_ DWORD flAllocationType,
    _In_ DWORD flProtect
);

WINBASEAPI BOOL WINAPI VirtualFree
(
    _Pre_notnull_ _When_(dwFreeType == MEM_DECOMMIT, _Post_invalid_) _When_(dwFreeType == MEM_RELEASE, _Post_ptr_invalid_) LPVOID lpAddress,
    _In_ SIZE_T dwSize,
    _In_ DWORD dwFreeType
);

// winnt.h
#define PAGE_READWRITE 0x04
#define MEM_COMMIT     0x1000
#define MEM_RESERVE    0x2000

● Steps for using generic page-aligned pinned host memory

 #define CEIL(x,y) (((x) - 1) / (y) + 1)

 int sizeByte = sizeof(int) * 16 * 1024 * 1024;
int align = 4096;
int *p, *pAlign;
p = (int *)VirtualAlloc(NULL, (sizeByte + align), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
pAlign = (int *)(CEIL((size_t)p, align) * align);  // round the pointer up to the next multiple of align
cudaHostRegister(pAlign, sizeByte, cudaHostRegisterMapped);
...
cudaHostUnregister(pAlign);
VirtualFree(p, 0, MEM_RELEASE);
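
The snippet above is Windows-only (VirtualAlloc()/VirtualFree()). On Linux the same idea is usually built on an anonymous mmap(); the following is a rough sketch under that assumption, reusing nByte, MEMORY_ALIGNMENT, and ALIGN_UP from the main program (not taken from the post itself):

 #include <sys/mman.h>  // Linux-only sketch: mmap/munmap replace VirtualAlloc/VirtualFree

int *p = (int *)mmap(NULL, nByte + MEMORY_ALIGNMENT, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);   // anonymous mapping, already page-aligned
int *pAligned = (int *)ALIGN_UP(p, MEMORY_ALIGNMENT);       // kept for symmetry with the Windows path
cudaHostRegister(pAligned, nByte, cudaHostRegisterMapped);  // pin it so asynchronous copies are possible
// ... use pAligned ...
cudaHostUnregister(pAligned);
munmap(p, nByte + MEMORY_ALIGNMENT);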

● Timing with cudaEventCreateWithFlags(), which differs slightly from the cudaEventCreate() used before: with the cudaEventBlockingSync flag, the host thread sleeps inside cudaEventSynchronize() instead of busy-waiting, so CPU usage stays low while the GPU works.

 float elapsed_time = 0.0f;
cudaEvent_t start_event, stop_event;
cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);
cudaEventRecord(start_event, 0);
...
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
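
For reference, a plain cudaEventCreate() behaves like cudaEventCreateWithFlags() with cudaEventDefault: cudaEventSynchronize() then busy-waits (spins) on the CPU until the event completes, whereas cudaEventBlockingSync puts the waiting host thread to sleep. A small illustrative sketch of the available flags (an assumption about usage, not code from the post):

 cudaEvent_t ev_default, ev_blocking, ev_sync_only;
cudaEventCreate(&ev_default);                                    // same as flags = cudaEventDefault: polling wait
cudaEventCreateWithFlags(&ev_blocking, cudaEventBlockingSync);   // waiting host thread sleeps, low CPU usage
cudaEventCreateWithFlags(&ev_sync_only, cudaEventDisableTiming); // records no timestamp; lighter-weight when the
                                                                 // event is only used for synchronization
cudaEventDestroy(ev_default);
cudaEventDestroy(ev_blocking);
cudaEventDestroy(ev_sync_only);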
