cub BlockRadixSort: how to deal with large tile size or sort multiple tiles?

Problem Description
When using cub::BlockRadixSort to do the sorting within a block, if the number of elements is too large, how do we deal with that? If we set the tile size too large, the shared memory for the temporary storage will soon be unable to hold it. If we split the data into multiple tiles, how do we post-process the tiles after we have sorted each one?
- Caveat: I am not a cub expert (far from it).
- You might want to review this question/answer, as I'm building on some of the work I did there.
- Certainly, if the problem size is large enough, then a device-wide sort would seem to be something you might want to consider. But your question seems focused on block sorting.
From my testing, cub doesn't really have requirements around where your original data is located, or where you place the temp storage. Therefore, one possible solution is simply to place your temp storage in global memory. To analyze this, I created a code that has 3 different test cases:
- Test a version of cub block sort with the temp storage in global memory (the core declarations are previewed right after this list).
- Test the original version of cub block sort adapted from the example here.
- Test a version of cub block sort derived from my previous answer, where there is no copying of data to/from global memory, i.e. it is assumed that the data is already resident "on-chip", i.e. in shared memory.
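As a preview of the core idea in case 1, these three lines are lifted verbatim from the full listing below: both the data being sorted and the collective's TempStorage can be declared in global memory rather than shared memory.

typedef BlockRadixSort<int, nTPB, ELEMS_PER_THREAD> my_block_sort;
__device__ int my_val[DSIZE];                                  // data to sort, in global memory
__device__ typename my_block_sort::TempStorage sort_temp_stg;  // temp storage, also in global memory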
None of this is extensively tested, but since I am building on cub building blocks, and testing my results in the first two cases, hopefully I have not made any grievous errors. Here's the full test code, and I will make additional comments below:
$ cat t10.cu
#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#define nTPB 512
#define ELEMS_PER_THREAD 2
#define RANGE (nTPB*ELEMS_PER_THREAD)
#define DSIZE (nTPB*ELEMS_PER_THREAD)
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

using namespace cub;

// GLOBAL CUB BLOCK SORT KERNEL
// Specialize BlockRadixSort collective types
typedef BlockRadixSort<int, nTPB, ELEMS_PER_THREAD> my_block_sort;
__device__ int my_val[DSIZE];
__device__ typename my_block_sort::TempStorage sort_temp_stg;
// Block-sorting CUDA kernel (nTPB threads each owning ELEMS_PER_THREAD integers)
__global__ void global_BlockSortKernel()
{
    // Collectively sort the keys; the cast presents this thread's slice of
    // my_val as the int[ELEMS_PER_THREAD] array reference that Sort expects
    my_block_sort(sort_temp_stg).Sort(*static_cast<int(*)[ELEMS_PER_THREAD]>(static_cast<void*>(my_val+(threadIdx.x*ELEMS_PER_THREAD))));
}

// ORIGINAL CUB BLOCK SORT KERNEL
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
    // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
    typedef cub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE> BlockLoadT;
    typedef cub::BlockStore<int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreT;
    typedef cub::BlockRadixSort<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
    // Allocate type-safe, repurposable shared memory for collectives
    __shared__ union {
        typename BlockLoadT::TempStorage load;
        typename BlockStoreT::TempStorage store;
        typename BlockRadixSortT::TempStorage sort;
    } temp_storage;
    // Obtain this block's segment of consecutive keys (blocked across threads)
    int thread_keys[ITEMS_PER_THREAD];
    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
    BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys);
    __syncthreads();        // Barrier for smem reuse
    // Collectively sort the keys
    BlockRadixSortT(temp_storage.sort).Sort(thread_keys);
    __syncthreads();        // Barrier for smem reuse
    // Store the sorted segment
    BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys);
}

// SHARED MEM CUB BLOCK SORT KERNEL
// Block-sorting CUDA kernel (nTPB threads each owning ELEMS_PER_THREAD integers)
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(int *d_out)
{
    __shared__ int my_val[BLOCK_THREADS*ITEMS_PER_THREAD];
    // Specialize BlockRadixSort collective types
    typedef BlockRadixSort<int, BLOCK_THREADS, ITEMS_PER_THREAD> my_block_sort;
    // Allocate shared memory for collectives
    __shared__ typename my_block_sort::TempStorage sort_temp_stg;
    // need to extend synthetic data for ELEMS_PER_THREAD > 2
    my_val[threadIdx.x*ITEMS_PER_THREAD]   = (threadIdx.x + 5);                 // synth data
    my_val[threadIdx.x*ITEMS_PER_THREAD+1] = (threadIdx.x + BLOCK_THREADS + 5); // synth data
    __syncthreads();
    // printf("thread %d data = %d\n", threadIdx.x, my_val[threadIdx.x*ITEMS_PER_THREAD]);
    // Collectively sort the keys
    my_block_sort(sort_temp_stg).Sort(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(my_val+(threadIdx.x*ITEMS_PER_THREAD))));
    __syncthreads();
    // printf("thread %d sorted data = %d\n", threadIdx.x, my_val[threadIdx.x*ITEMS_PER_THREAD]);
    if (threadIdx.x == clock()){ // dummy to prevent compiler optimization
        d_out[threadIdx.x*ITEMS_PER_THREAD]   = my_val[threadIdx.x*ITEMS_PER_THREAD];
        d_out[threadIdx.x*ITEMS_PER_THREAD+1] = my_val[threadIdx.x*ITEMS_PER_THREAD+1];}
}

int main(){
    int *h_data, *h_result;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    h_data   = (int *)malloc(DSIZE*sizeof(int));
    h_result = (int *)malloc(DSIZE*sizeof(int));
    if (h_data == 0)   {printf("malloc fail\n"); return 1;}
    if (h_result == 0) {printf("malloc fail\n"); return 1;}
    for (int i = 0; i < DSIZE; i++) h_data[i] = rand()%RANGE;
    // first test sorting directly out of global memory
    global_BlockSortKernel<<<1,nTPB>>>(); // warm up run
    cudaDeviceSynchronize();
    cudaMemcpyToSymbol(my_val, h_data, DSIZE*sizeof(int));
    cudaCheckErrors("memcpy to symbol fail");
    cudaEventRecord(start);
    global_BlockSortKernel<<<1,nTPB>>>(); // timing run
    cudaEventRecord(stop);
    cudaDeviceSynchronize();
    cudaCheckErrors("cub 1 fail");
    cudaEventSynchronize(stop);
    float et;
    cudaEventElapsedTime(&et, start, stop);
    cudaMemcpyFromSymbol(h_result, my_val, DSIZE*sizeof(int));
    cudaCheckErrors("memcpy from symbol fail");
    if (!thrust::is_sorted(h_result, h_result+DSIZE)) {printf("sort 1 fail!\n"); return 1;}
    printf("global Elapsed time: %fms\n", et);
    printf("global Kkeys/s: %d\n", (int)(DSIZE/et));
    // now test original CUB block sort copying global to shared
    int *d_in, *d_out;
    cudaMalloc((void **)&d_in,  DSIZE*sizeof(int));
    cudaMalloc((void **)&d_out, DSIZE*sizeof(int));
    cudaCheckErrors("cudaMalloc fail");
    BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_in, d_out); // warm up run
    cudaMemcpy(d_in, h_data, DSIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaEventRecord(start);
    BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_in, d_out); // timing run
    cudaEventRecord(stop);
    cudaDeviceSynchronize();
    cudaCheckErrors("cub 2 fail");
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&et, start, stop);
    cudaMemcpy(h_result, d_out, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy D to H fail");
    if (!thrust::is_sorted(h_result, h_result+DSIZE)) {printf("sort 2 fail!\n"); return 1;}
    printf("CUB Elapsed time: %fms\n", et);
    printf("CUB Kkeys/s: %d\n", (int)(DSIZE/et));
    // now test shared memory-only version of block sort
    shared_BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_out); // warm-up run
    cudaEventRecord(start);
    shared_BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_out); // timing run
    cudaEventRecord(stop);
    cudaDeviceSynchronize();
    cudaCheckErrors("cub 3 fail");
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&et, start, stop);
    printf("shared Elapsed time: %fms\n", et);
    printf("shared Kkeys/s: %d\n", (int)(DSIZE/et));
    return 0;
}
$ nvcc -O3 -arch=sm_20 -o t10 t10.cu
$ ./t10
global Elapsed time: 0.236960ms
global Kkeys/s: 4321
CUB Elapsed time: 0.042816ms
CUB Kkeys/s: 23916
shared Elapsed time: 0.040192ms
shared Kkeys/s: 25477
$
For this test, I am using CUDA 6.0RC, cub v1.2.0 (which is pretty recent), RHEL5.5/gcc4.1.2, and a Quadro5000 GPU (cc2.0, 11 SMs, approximately 40% slower than a GTX480). Here are some observations that occur to me:
- The speed ratio of the original cub sort (2) to the global memory sort (1) is approximately 6:1, which is approximately the bandwidth ratio of shared memory (~1TB/s) to global memory (~150GB/s).
- The original cub sort (2) has a throughput that, when scaled for the number of SMs (11), yields 263 MKeys/s (23916 Kkeys/s x 11 SMs), a sizeable fraction of the best device-wide sort I have seen on this device (thrust sort, yielding ~480 MKeys/s).
- The shared-memory-only sort is not much faster than the original cub sort, which copies input/output from/to global memory, indicating that the copy from global memory to the cub temp storage is not a large fraction of the overall processing time.
The 6:1 penalty is a large one to pay. So my recommendation would be, if possible, to use a device-wide sort on problem sizes larger than what can be handled easily by cub block sorting. This allows you to tap into the expertise of some of the best GPU code writers for your sorting, and achieve throughputs much closer to what the device as a whole is capable of.
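For reference, a minimal sketch of the device-wide route, using the thrust sort mentioned in the observations above; d_data and num_items are placeholder names, not part of the original code. cub also provides a device-wide radix sort (cub::DeviceRadixSort), whose exact API varies by cub version.

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

// d_data: device pointer to num_items int keys (hypothetical names)
void device_wide_sort(int *d_data, int num_items)
{
    thrust::device_ptr<int> dp = thrust::device_pointer_cast(d_data);
    thrust::sort(dp, dp + num_items);   // device-wide sort of all the keys
}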
Note that, so I could test under similar conditions, the problem size here (512 threads, 2 elements per thread) does not exceed what you can do in a CUB block sort. But it's not difficult to extend the data set size to larger values (say, 1024 elements per thread) that can only be handled (in this context, among these choices) using the first approach. If I do larger problem sizes like that, on my GPU I observe a throughput of around 6 MKeys/s for the global memory block sort on my cc2.0 device.
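To give a concrete sense of that scale-up, a sketch of the only changes needed in the listing above (the exact value is illustrative; you would also need to extend the synthetic-data setup). Only the global-memory version (case 1) can accept it, because my_val and sort_temp_stg live in global memory, so the 48KB shared memory capacity of a cc2.0 device is no longer the limit:

#define nTPB 512
#define ELEMS_PER_THREAD 1024            // was 2; now 512K keys in a single block sort
#define DSIZE (nTPB*ELEMS_PER_THREAD)    // 524288 ints = 2MB, far beyond 48KB of shared memory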