c++ - 进行少量插入/移位的并行算法

假设我有一个由8个数字组成的数组a，我有另一个由数字组成的数组b来确定a中的数字应该向右移动多少个位置
A 3，6，7，8，1，2，3，5
B 0，1，0，0，0，0，0，0，0
0表示有效，1表示该数字应在后1位，输出数组应在后3位之间插入0，输出数组C应为：
C:3,0,6,7,8,1,2,3
是否插入0或其他内容并不重要，关键是3之后的所有数字都被移了一个位置。出站号码将不再在数组中。
另一个例子：
A 3，6，7，8，1，2，3，5
B 0，1，0，0，2，0，0，0
C 3，0，6，7，8，0，1，2
…………
A 3，6，7，8，1，2，3，5
B 0，1，0，0，1，0，0，0
C 3，0，6，7，8，1，2，3
我正在考虑使用scan/prefix sum或类似的方法来解决这个问题而且这个数组很小，我应该能够将数组放在一个warp（

最佳答案

一种可能的方法。
由于移位的模糊性（0, 1, 0, 1，0, 1, 1, 1和0, 1, 0 ,0都会产生相同的数据偏移模式，例如），因此不可能只创建移位模式的前缀和来产生每个位置的相对偏移但是，我们可以观察到，如果移位模式中的每个零都被其左侧的第一个非零移位值替换，则将创建有效的偏移模式：

0, 1, 0, 0   (shift pattern)
0, 1, 1, 1   (offset pattern)

或

0, 2, 0, 2   (shift pattern)
0, 2, 2, 2   (offset pattern)

那么怎么做呢？假设我们有第二个测试用例转移模式：

      0, 1, 0, 0, 2, 0, 0, 0

我们希望的偏移模式是：

      0, 1, 1, 1, 2, 2, 2, 2

对于给定的移位模式，创建一个二进制值，其中如果移位模式中对应索引处的值为零，则每个位为一，否则为零我们可以使用warp vote指令，称为__ballot()每个车道将从选票中获得相同的值：

  1  0  1  1  0  1  1  1  (this is a single binary 8-bit value in this case)

现在，每个扭曲通道都将接受该值，并为其添加一个在扭曲通道位置有1位的值。将车道1用于示例的其余部分：

+ 0  0  0  0  0  0  1  0  (the only 1 bit in this value will be at the lane index)
= 1  0  1  1  1  0  0  1

我们现在采用步骤2的结果，并使用步骤1的结果按位异或：

= 0  0  0  0  1  1  1  0

现在，我们计算这个值中的1位数（这个值有一个__popc()intrinsic），然后从结果中减去1。因此，对于上面的lane 1示例，该步骤的结果将是2，因为设置了3位这将使用到我们左边第一个值的距离，该值在原始移位模式中是非零的。所以对于1号车道的例子，1号车道左边的第一个非零值高出2号车道，即3号车道。
对于每个车道，我们使用步骤4的结果来获取该车道的适当偏移值。我们可以使用__shfl_down()warp shuffle指令一次处理所有车道。

  0, 1, 1, 1, 2, 2, 2, 2

从而产生我们想要的“偏移模式”。
一旦我们有了所需的偏移模式，让每个扭曲通道使用其偏移值来适当地移动其数据项的过程就很简单了。
下面是一个完整的例子，使用您的3个测试用例上面的步骤1-4包含在__device__函数mydelta中。其余的内核正在执行步骤5洗牌，适当地索引到数据，并复制数据。由于warp shuffle指令的使用，我们必须为cc3.0或更高的gpu编译它。（但是，用其他索引代码替换经卷洗牌指令并不困难，这将允许对cc2.0或更大的设备进行操作。）而且，由于使用的各种本质，这个函数不能为32个以上的数据项工作，但这是在您的问题中陈述的先决条件。

$ cat t475.cu
#include <stdio.h>
#define DSIZE 8

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


__device__ int mydelta(const int shift){
  unsigned nz = __ballot(shift == 0);
  unsigned mylane = (threadIdx.x & 31);
  unsigned lanebit = 1<<mylane;
  unsigned temp = nz + lanebit;
  temp = nz ^ temp;
  unsigned delta = __popc(temp);
  return delta-1;
}
__global__ void mykernel(const int *data, const unsigned *shift, int *result, const int limit){ // limit <= 32
  if (threadIdx.x < limit){
    unsigned lshift = shift[(limit - 1) - threadIdx.x];
    unsigned delta = mydelta(lshift);
    unsigned myshift = __shfl_down(lshift, delta);
    myshift = __shfl(myshift, ((limit -1) - threadIdx.x)); // reverse offset pattern
    result[threadIdx.x] = 0;
    if ((myshift + threadIdx.x) < limit)
    result[threadIdx.x + myshift] = data[threadIdx.x];
  }
}

int main(){
  int A[DSIZE]         = {3, 6, 7, 8, 1, 2, 3, 5};
  unsigned tc1B[DSIZE] = {0, 1, 0, 0, 0, 0, 0, 0};
  unsigned tc2B[DSIZE] = {0, 1, 0, 0, 2, 0, 0, 0};
  unsigned tc3B[DSIZE] = {0, 1, 0, 0, 1, 0, 0, 0};

  int *d_data, *d_result, *h_result;
  unsigned *d_shift;
  h_result = (int *)malloc(DSIZE*sizeof(int));
  if (h_result == NULL) { printf("malloc fail\n"); return 1;}
  cudaMalloc(&d_data, DSIZE*sizeof(int));
  cudaMalloc(&d_shift, DSIZE*sizeof(unsigned));
  cudaMalloc(&d_result, DSIZE*sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  cudaMemcpy(d_data, A, DSIZE*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_shift, tc1B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMempcyH2D fail");
  mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMempcyD2H fail");
  printf("index: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", i);
  printf("\nA:     ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", A[i]);
  printf("\ntc1 B: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", tc1B[i]);
  printf("\ntc1 C: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", h_result[i]);
  cudaMemcpy(d_shift, tc2B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMempcyH2D fail");
  mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMempcyD2H fail");
  printf("\ntc2 B: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", tc2B[i]);
  printf("\ntc2 C: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", h_result[i]);
  cudaMemcpy(d_shift, tc3B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMempcyH2D fail");
  mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMempcyD2H fail");
  printf("\ntc3 B: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", tc3B[i]);
  printf("\ntc2 C: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", h_result[i]);
  printf("\n");
  return 0;
}
$ nvcc -arch=sm_35 -o t475 t475.cu
$ ./t475
index: 0, 1, 2, 3, 4, 5, 6, 7,
A:     3, 6, 7, 8, 1, 2, 3, 5,
tc1 B: 0, 1, 0, 0, 0, 0, 0, 0,
tc1 C: 3, 0, 6, 7, 8, 1, 2, 3,
tc2 B: 0, 1, 0, 0, 2, 0, 0, 0,
tc2 C: 3, 0, 6, 7, 8, 0, 1, 2,
tc3 B: 0, 1, 0, 0, 1, 0, 0, 0,
tc2 C: 3, 0, 6, 7, 8, 1, 2, 3,
$

关于c++ - 进行少量插入/移位的并行算法，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/24663819/