Compare the efficiency of data copies alone, and of data copies combined with kernel launches, using a single stream versus multiple streams (4 streams).
▶ Source code
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h> // assumes a 64-bit Windows target with 64-bit object code; support for other platforms has been removed
#define MEMORY_ALIGNMENT 4096 // align memory to 4 KB
#define ALIGN_UP(x,size) (((size_t)x+(size-1))&(~(size-1))) // round x up to the next multiple of size

__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < num_iterations; i++)
g_data[idx] += *factor;
}

bool check(int *a, const int nArray, const int c)
{
for (int i = 0; i < nArray; i++)
{
if (a[i] != c)
{
printf("\nArray\tError at i = %d, %d, %d\n", i, a[i], c);
return false;
}
}
return true;
}

inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nByte)
{
if (bPinGenericMemory) // allocate generic page-aligned system memory and pin it
{
printf("\nVirtualAlloc(), %4.2f MB (generic page-aligned system memory)\n", (float)nByte/1048576.0f);
*pp_a = (int *) VirtualAlloc(NULL, (nByte + MEMORY_ALIGNMENT), MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
*ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);
cudaHostRegister(*ppAligned_a, nByte, cudaHostRegisterMapped); // page-locks the memory, required for asynchronous copies
}
else
{
printf("\ncudaMallocHost(), %4.2f MB\n", (float)nByte/1048576.0f);
cudaMallocHost((void **)pp_a, nByte); // already page-locked at allocation time
*ppAligned_a = *pp_a;
}
}

int main() // uses fixed default parameters instead of reading them from the command line
{
printf("\n\tStart\n");
int nreps = 10;      // number of times each test is repeated
int niterations = 5; // number of iterations inside the kernel
int nstreams = 4;    // number of streams used
float elapsed_time;
bool bPinGenericMemory;

cudaSetDevice(0); // device-selection code removed
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
if (deviceProp.canMapHostMemory) // generic pinned memory is usable only if the GPU can map host memory
bPinGenericMemory = true;
else
{
printf("\nDevice not support mapping of generic host memory, use cudaMallocHost() instead\n");
bPinGenericMemory = false;
}

// reduce the test workload when the GPU has fewer than 32 multiprocessors (the original sample did not shrink nByte accordingly; improved here)
float scale_factor = max(32.0f / float(_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount), 1.0f);
int nArray = (int)rint((float)(16 * 1024 * 1024) / scale_factor); // number of elements in the test array
int nByte = nArray * sizeof(int);                                  // size of the test array in bytes
printf("\nWorkload *= %1.4f, array_size = %d\n", 1.0f / scale_factor, nArray);

cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | (bPinGenericMemory ? cudaDeviceMapHost : 0)); // blocking synchronization reduces CPU usage

int *h_a = 0, *hAligned_a = 0;
AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nByte); // allocate host memory using the chosen method
int c = 5, *d_a = 0, *d_c = 0;
cudaMalloc((void **)&d_a, nByte);
cudaMemset(d_a, 0x0, nByte);
cudaMalloc((void **)&d_c, sizeof(int));
cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice);
cudaEvent_t start_event, stop_event;
cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);
cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
cudaStreamCreate(&(streams[i]));

printf("\n\tStart test\n");
// asynchronous copy test
cudaEventRecord(start_event, 0);
cudaMemcpyAsync(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost, streams[0]);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
printf("memcopy:\t%.2f\n", elapsed_time);

// kernel test
dim3 threads = dim3(512);
dim3 blocks = dim3(nArray / threads.x);
cudaEventRecord(start_event, 0);
init_array << <blocks, threads, 0, streams[0] >> > (d_a, d_c, niterations);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
printf("kernel:\t\t%.2f\n", elapsed_time);

// serial (non-streamed) test
cudaEventRecord(start_event, 0);
for (int k = 0; k < nreps; k++)
{
init_array << <blocks, threads >> > (d_a, d_c, niterations);
cudaMemcpy(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost);
}
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

// multi-stream test
blocks = dim3(nArray / (nstreams * threads.x), 1);
memset(hAligned_a, 255, nByte);
cudaMemset(d_a, 0, nByte);
cudaEventRecord(start_event, 0);
for (int k = 0; k < nreps; k++) // split the kernel launches and the device-to-host copies across the streams
{
for (int i = 0; i < nstreams; i++)
init_array << <blocks, threads, 0, streams[i] >> > (d_a + i * nArray / nstreams, d_c, niterations);
for (int i = 0; i < nstreams; i++)
cudaMemcpyAsync(hAligned_a + i * nArray / nstreams, d_a + i * nArray / nstreams, nByte / nstreams, cudaMemcpyDeviceToHost, streams[i]);
}
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);

// check the result and clean up
printf("\n\tResult: %s\n", check(hAligned_a, nArray, c*nreps*niterations)?"Passed":"Failed");
cudaFree(d_a);
cudaFree(d_c);
if (bPinGenericMemory)
{
cudaHostUnregister(hAligned_a);
VirtualFree(h_a, 0, MEM_RELEASE);
}
else
cudaFreeHost(h_a);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
for (int i = 0; i < nstreams; i++)
cudaStreamDestroy(streams[i]);

getchar();
return 0;
}
▶ Output
	Start

Workload *= 1.0000, array_size = 16777216

VirtualAlloc(), 64.00 MB (generic page-aligned system memory)

	Start test
memcopy: 5.34
kernel: 5.15
non-streamed: 9.95
4 streams: 5.24

	Result: Passed
▶ Takeaways
● Macros and function prototypes involved
// driver_types.h
#define cudaStreamPerThread ((cudaStream_t)0x2)

#define cudaEventDefault 0x00 // Default event flag
#define cudaEventBlockingSync 0x01 // Event uses blocking synchronization
#define cudaEventDisableTiming 0x02 // Event will not record timing data
#define cudaEventInterprocess 0x04 // Event is suitable for interprocess use. cudaEventDisableTiming must be set

#define cudaDeviceScheduleAuto 0x00 // Device flag - Automatic scheduling
#define cudaDeviceScheduleSpin 0x01 // Device flag - Spin default scheduling
#define cudaDeviceScheduleYield 0x02 // Device flag - Yield default scheduling
#define cudaDeviceScheduleBlockingSync 0x04 // Device flag - Use blocking synchronization
#define cudaDeviceBlockingSync 0x04 // Device flag - Use blocking synchronization
// (deprecated as of CUDA 4.0, replaced with ::cudaDeviceScheduleBlockingSync)
#define cudaDeviceScheduleMask 0x07 // Device schedule flags mask
#define cudaDeviceMapHost 0x08 // Device flag - Support mapped pinned allocations
#define cudaDeviceLmemResizeToMax 0x10 // Device flag - Keep local memory allocation after launch
#define cudaDeviceMask 0x1f // Device flags mask

#define cudaArrayDefault 0x00 // Default CUDA array allocation flag
#define cudaArrayLayered 0x01 // Must be set in cudaMalloc3DArray to create a layered CUDA array
#define cudaArraySurfaceLoadStore 0x02 // Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array
#define cudaArrayCubemap 0x04 // Must be set in cudaMalloc3DArray to create a cubemap CUDA array
#define cudaArrayTextureGather 0x08 // Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array

#define cudaIpcMemLazyEnablePeerAccess 0x01 // Automatically enable peer access between remote devices as needed

#define cudaMemAttachGlobal 0x01 // Memory can be accessed by any stream on any device
#define cudaMemAttachHost 0x02 // Memory cannot be accessed by any stream on any device
#define cudaMemAttachSingle 0x04 // Memory can only be accessed by a single stream on the associated device

#define cudaOccupancyDefault 0x00 // Default behavior
#define cudaOccupancyDisableCachingOverride 0x01 // Assume global caching is enabled and cannot be automatically turned off

#define cudaCpuDeviceId ((int)-1) // Device id that represents the CPU
#define cudaInvalidDeviceId ((int)-2) // Device id that represents an invalid device

// cuda_runtime_api.h
extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags);

extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);

// memoryapi.h
WINBASEAPI _Ret_maybenull_ _Post_writable_byte_size_(dwSize) LPVOID WINAPI VirtualAlloc
(
_In_opt_ LPVOID lpAddress,
_In_ SIZE_T dwSize,
_In_ DWORD flAllocationType,
_In_ DWORD flProtect
);

WINBASEAPI BOOL WINAPI VirtualFree
(
_Pre_notnull_ _When_(dwFreeType == MEM_DECOMMIT, _Post_invalid_) _When_(dwFreeType == MEM_RELEASE, _Post_ptr_invalid_) LPVOID lpAddress,
_In_ SIZE_T dwSize,
_In_ DWORD dwFreeType
);

// winnt.h
#define PAGE_READWRITE 0x04
#define MEM_COMMIT 0x1000
#define MEM_RESERVE 0x2000
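A minimal standalone sketch (not part of the sample above) of how the device flags fit together: they are set before the device's context is first initialized, and cudaGetDeviceFlags() (available in newer CUDA runtimes) can read the effective flags back.

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    // Set the flags before any call that initializes the device
    // (older toolkits return cudaErrorSetOnActiveProcess if a context already exists).
    cudaError_t err = cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | cudaDeviceMapHost);
    printf("cudaSetDeviceFlags: %s\n", cudaGetErrorString(err));

    unsigned int flags = 0;
    cudaGetDeviceFlags(&flags); // read the effective device flags back
    printf("schedule bits = 0x%x, map host = %s\n",
        flags & cudaDeviceScheduleMask, (flags & cudaDeviceMapHost) ? "yes" : "no");
    return 0;
}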
● Steps for using generic page-aligned, page-locked (pinned) memory
#define CEIL(x,y) (((x) - 1) / (y) + 1)

int sizeByte = sizeof(int) * 16 * 1024 * 1024; // buffer size, 64 MB here to match the sample above
int align = 4096;                              // page size to align to
int *p, *pAlign;
p = (int *)VirtualAlloc(NULL, (sizeByte + align), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
pAlign = (int *)(CEIL((size_t)p, align) * align); // round the pointer up to the next align-byte boundary
cudaHostRegister(pAlign, sizeByte, cudaHostRegisterMapped);

...

cudaHostUnregister(pAlign);
VirtualFree(p, 0, MEM_RELEASE);
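Because the buffer is registered with cudaHostRegisterMapped (and cudaDeviceMapHost was set), the device-side alias of the pinned buffer can also be queried. This step is not used in the sample above; a minimal sketch continuing the fragment:

int *dAlign = NULL;
cudaHostGetDevicePointer((void **)&dAlign, pAlign, 0); // device pointer aliasing the mapped host buffer
// dAlign can now be passed directly to a kernel (zero-copy access),
// as an alternative to explicit cudaMemcpy/cudaMemcpyAsync transfers.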
● Timing with cudaEventCreateWithFlags() and related calls, which differs slightly from the cudaEventCreate() used before.
float elapsed_time = 0.0f;
cudaEvent_t start_event, stop_event;
cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);
cudaEventRecord(start_event, 0);

...

cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);

cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
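As a sketch of how the two event flags differ (based on the documented runtime behavior): cudaEventBlockingSync only changes how the host waits on the event, while cudaEventDisableTiming makes the event unusable with cudaEventElapsedTime().

cudaEvent_t evBlocking, evNoTiming;
cudaEventCreateWithFlags(&evBlocking, cudaEventBlockingSync);  // host sleeps in cudaEventSynchronize() instead of spin-waiting
cudaEventCreateWithFlags(&evNoTiming, cudaEventDisableTiming); // lighter-weight event that records no timestamp

cudaEventRecord(evBlocking, 0);
cudaEventRecord(evNoTiming, 0);
cudaEventSynchronize(evNoTiming);

float ms = 0.0f;
cudaError_t err = cudaEventElapsedTime(&ms, evBlocking, evNoTiming); // fails: evNoTiming carries no timing data
printf("cudaEventElapsedTime: %s\n", cudaGetErrorString(err));       // expected: cudaErrorInvalidResourceHandle

cudaEventDestroy(evBlocking);
cudaEventDestroy(evNoTiming);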