c++ - 在CUDA中处理4D张量的内核

我想编写一个内核来执行依赖于索引（ij | kl）的所有唯一四进制的计算。在主机上生成所有唯一四重奏的代码如下：#include <iostream>using namespace std;int main(int argc, char** argv){ unsigned int i,j,k,l,ltop; unsigned int nao=7; for(i=0;i<nao;i++) { for(j=0;j<=i;j++) { for(k=0;k<=i;k++) { ltop=k; if(i==k) { ltop=j; } for(l=0;l<=ltop; l++) { printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l); } } } } int m = nao*(nao+1)/2; int eris_size = m*(m+1)/2; cout<<"The total size of the sack of ERIs is: "<<eris_size<<endl; return 0;}输出：computing the ERI (0,0|0,0)computing the ERI (1,0|0,0)computing the ERI (1,0|1,0)computing the ERI (1,1|0,0)computing the ERI (1,1|1,0)computing the ERI (1,1|1,1)computing the ERI (2,0|0,0)computing the ERI (2,0|1,0)computing the ERI (2,0|1,1)computing the ERI (2,0|2,0)computing the ERI (2,1|0,0)computing the ERI (2,1|1,0)computing the ERI (2,1|1,1)computing the ERI (2,1|2,0)computing the ERI (2,1|2,1)computing the ERI (2,2|0,0)computing the ERI (2,2|1,0)computing the ERI (2,2|1,1)computing the ERI (2,2|2,0)computing the ERI (2,2|2,1)computing the ERI (2,2|2,2)computing the ERI (3,0|0,0)computing the ERI (3,0|1,0)computing the ERI (3,0|1,1)computing the ERI (3,0|2,0)computing the ERI (3,0|2,1)computing the ERI (3,0|2,2)computing the ERI (3,0|3,0)computing the ERI (3,1|0,0)computing the ERI (3,1|1,0)computing the ERI (3,1|1,1)computing the ERI (3,1|2,0)computing the ERI (3,1|2,1)computing the ERI (3,1|2,2)computing the ERI (3,1|3,0)computing the ERI (3,1|3,1)computing the ERI (3,2|0,0)computing the ERI (3,2|1,0)computing the ERI (3,2|1,1)computing the ERI (3,2|2,0)computing the ERI (3,2|2,1)computing the ERI (3,2|2,2)computing the ERI (3,2|3,0)computing the ERI (3,2|3,1)computing the ERI (3,2|3,2)computing the ERI (3,3|0,0)computing the ERI (3,3|1,0)computing the ERI (3,3|1,1)computing the ERI (3,3|2,0)computing the ERI (3,3|2,1)computing the ERI (3,3|2,2)computing the ERI (3,3|3,0)computing the ERI (3,3|3,1)computing the ERI (3,3|3,2)computing the ERI (3,3|3,3)我想使用CUDA内核恢复相同的四重奏集，但无法获取。我现在拥有的CUDA的代码如下#include <iostream>#include <stdio.h>using namespace std;#define ABS(x) (x<0)?-x:x__global__void test_kernel(int basis_size){ unsigned int i_idx = threadIdx.x + blockIdx.x * blockDim.x; unsigned int j_idx = threadIdx.y + blockIdx.y * blockDim.y; // Building the quartets unsigned int i_orbital, j_orbital, k_orbital, l_orbital, i__1,j__1; unsigned int i_primitive, j_primitive, k_primitive, l_primitive; i_orbital = (i_idx + 1)%basis_size /*+1*/; j_orbital = (i__1 = (i_idx) / basis_size, ABS(i__1)) /*+ 1*/; k_orbital = (j_idx+1)%basis_size /*+1*/; l_orbital = (j__1 = (j_idx) / basis_size, ABS(j__1)) /*+ 1*/; unsigned int ltop; ltop=k_orbital; if(i_orbital==k_orbital) { ltop=j_orbital; } if(i_orbital<basis_size && j_orbital<=i_orbital && k_orbital<=i_orbital && l_orbital<=ltop) printf("computing the ERI (%d,%d|%d,%d) \n", i_orbital, j_orbital,k_orbital,l_orbital);}int main(int argc, char *argv[]){ int nao = 7; cudaDeviceReset(); /* partitioning from blocks to grids */ int dimx = 8; int dimy = 8; dim3 block(dimx, dimy); // we will try blocks of 8x8 threads dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y); // The grids are shaped accordingly /* Launch the kernel */ test_kernel<<<grid,block>>>(nao); cudaDeviceReset(); return 0;}输出：computing the ERI (2,0|1,1)computing the ERI (3,1|3,1)computing the ERI (3,0|1,1)computing the ERI (1,1|1,1)computing the ERI (2,1|1,1)computing the ERI (3,1|1,1)computing the ERI (3,0|2,1)computing the ERI (2,1|2,1)computing the ERI (3,1|2,1)computing the ERI (1,0|1,0)computing the ERI (2,0|1,0)computing the ERI (3,0|1,0)computing the ERI (1,1|1,0)computing the ERI (2,1|1,0)computing the ERI (3,1|1,0)computing the ERI (2,0|2,0)computing the ERI (3,0|3,0)computing the ERI (3,0|2,0)computing the ERI (2,1|2,0)computing the ERI (3,1|3,0)computing the ERI (3,1|2,0)computing the ERI (1,0|0,0)computing the ERI (2,0|0,0)computing the ERI (3,0|0,0)computing the ERI (0,0|0,0)computing the ERI (1,1|0,0)computing the ERI (2,1|0,0)computing the ERI (3,1|0,0)四重奏将驱动斥力积分的计算，其输入参数存储在大小为nao的数组中，其大小为3 最佳答案如前所述，您的代码不会做很多事情。它生成索引并打印出来。此外，还不清楚nao=7是否实际上描述了最终问题的大小，还是仅用于演示目的。纯粹从索引生成的角度来看，有很多方法可以解决您的问题。这些中的某些自然会适合于GPU的高效使用，而有些则可能并非如此。但是，到目前为止您所显示的代码并不能真正确定GPU的有效使用。例如，CUDA程序的理想目标之一是从全局内存中合并访问。由于您的代码没有显示任何内容，因此在不知道所生成的访问模式可能是什么的情况下提出解决方案会有些冒险。因此，您试图解决的“实际问题”可能会在做出有关索引生成的最终决定（实际上等同于“将工作分配给给定线程”）之前具有一些应考虑的考虑因素。我会尝试提及其中一些。对您的代码的一些批评：您建议的实现建议使用2D线程块和网格结构来有效提供程序使用的i，j索引。但是，通过检查原始C源代码，我们看到j索引仅覆盖直到i索引的空间：for(i=0;i<nao;i++){ for(j=0;j<=i;j++)但是，将这种结构替换为2D网格/线程块结构会创建一个矩形空间，原始C代码中的嵌套for循环并不暗示该矩形空间。因此，发射这样的空间/网格将导致创建i，j索引超出范围；这些线程不需要做任何工作。我们可以通过“重新利用”超出范围的线程来覆盖一些额外的空间来解决此问题（也许这就是您的代码正在尝试做的事情），但这会导致相当复杂且难以理解的算法，并且看到下一个批评。您的内核中没有循环，因此很明显每个线程只能生成一行打印输出。因此，每个线程只能负责一个ERI条目。根据您提供的C源代码，对于nao大小为7，您预期会有406个ERI条目（对于您显示的代码，BTW您发布的打印输出似乎不完整）。如果每个线程有一个打印输出，并且需要覆盖406个ERI条目的空间，则最好至少有406个线程，否则我们提出的解决方案将无法正常工作。如果我们检查您的代码以确定线程的网格大小：int dimx = 8;int dimy = 8;dim3 block(dimx, dimy); // we will try blocks of 8x8 threadsdim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y); // The grids are shaped accordingly我们将得出一个结论，即您根本没有启动足够的线程。如果进行上述计算（如果不确定，则可能需要打印出block.x，.y和grid.x，.y值），您会发现您正在启动一个8x8线程块，即总共64个线程，对于 of7。64个线程（每线程一个打印输出）不能覆盖406个条目的ERI空间。实际上，假设您的nao受printf语句限制，则可能是每个线程的打印输出少于一个。可能的解决方案：一个相当简单的方法就是将C源代码中的外部两个for循环映射到2D网格的x，y索引。然后，我们将保留或多或少完整地保留C源代码的内部for循环结构作为我们的内核代码。这是相当容易编写的。这样做的缺点是它将启动一些不执行任何操作的线程（我们必须检查j> i的条件，并检查此类线程不执行操作的条件），但这可能是一个较小的考虑。一个更大的问题可能是我们正在生成多少实际线程。对于给定的if为7，这将启动7x7 = 49个线程（其中一些不起作用）。对于GPU来说，问题大小为49，甚至406个线程是很小的，并且由于线程数量有限，它无法达到峰值性能。此问题的“三角”性质（j linear-to-triangular index mapping没有“浪费”线程。但是，正如已经讨论的那样，比约50％的线程浪费更为重要的考虑因素是全球访问的合并。另一种可能的方法是使用@AngryLettuce建议的方法对以前的一个问题（可能现在已删除）进行评论。具体来说，生成一维网格和一维全局唯一线程索引，然后使用算术将一维索引转换为子索引来计算必要的子索引（i，j，k，l）。这样做的好处是，对于较小的问题，我们可以直接启动较大的内核。例如，对于您的406个ERI条目的问题空间，我们将生成（至少）406个线程，其唯一索引为0..405，并将该1-D索引转换为4个子索引（i，j，k， l）。查看您的内核代码，这可能就是您要尝试执行的操作。但是，由于您的空间是奇形的，因此我认为将线性索引（或任何矩形索引集）转换为奇形空间的算法将很复杂。如果您的实际问题空间很大（nao远远大于7），那么我个人会选择第一种方法，因为它是更具可读性的代码（IMO），并且易于编写/维护。对于较小的问题空间，由于已经讨论的原因，第二种方法可能更好。对于以上两种方法中的任何一种，我们都希望研究将产生的全局内存访问模式。这将取决于您未显示的数据组织。第一种方法可能更易于编写以生成合并访问，但是（在理论上）仔细使用第二种方法中的索引生成算法应该也可以使您实现所需的任何访问模式。这是方法1的完整代码：#include <stdio.h>#include <iostream>using namespace std;__global__ void kernel(unsigned int nao){ unsigned i = threadIdx.x+blockDim.x*blockIdx.x; unsigned j = threadIdx.y+blockDim.y*blockIdx.y; if (i >= nao) return; if (j > i) return; // modify if use of __syncthreads() after this unsigned int k,l,ltop; for(k=0;k<=i;k++) { ltop=k; if(i==k) { ltop=j; } for(l=0;l<=ltop; l++) { printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l); } }}int main(){ unsigned int nao = 7; dim3 block(4,4); dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y); kernel<<<grid,block>>>(nao); cudaDeviceSynchronize(); int m = nao*(nao+1)/2; int eris_size = m*(m+1)/2; cout<<"The total size of the sack of ERIs is: "<<eris_size<<endl; return 0;}请注意，为了简化本代码中的演示，我使用了条件返回语句（对于不起作用的线程）。但是，如果需要在内核中使用nao，则需要修改内核结构以避免有条件的返回，而是允许“越界”线程在不执行任何工作的情况下继续执行内核代码。关于__syncthreads()用法的许多其他SO问题也涵盖了此概念。还要注意，内核打印输出可以以任何顺序发生（就像线程可以以任何顺序执行一样），因此我仅验证了上述方法似乎可以工作并产生了所需的406行打印输出。更好的验证方法是避免使用printf，而是使用计数矩阵。对于第二种方法，由于您的多维空间“形状奇特”，因此从单个线性索引到多维空间（具有1：1映射并且没有浪费/未使用的索引）的转换相当复杂， ”。我没有将线性索引转换为子索引的“绝佳”方法。我必须根据i，j，k索引为__syncthreads()循环绘制各种问题空间尺寸大小：i=0, j=0, k=0, l=0 (total 1 iteration for i=0)l loop limit, for i=1: (total 5 iterations for i=1) j 0 1k 0 0 0 1 0 1l loop limit, for i=2: (total 15 iterations for i=2) j 0 1 2k 0 0 0 0 1 1 1 1 2 0 1 2l loop limit, for i=3: (total 34 iterations for i=3) j 0 1 2 3k 0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 0 1 2 3 etc.（请注意，另一种映射方法是，将上述每个矩阵的最后一行与其他行一样，视为等于l的常量，这将导致一些线程浪费，但下面将大大简化索引计算。我们将在下面讨论简要介绍了无浪费线程方法，但最终给出了一个使用浪费线程方法的示例，因为索引计算的算术和时间复杂度都大大简化了。通过上面的研究，我们看到映射到给定的k循环迭代的迭代次数为：[((i+2)*(i+1)/2)*(i+1)] - (i+1)*i/2要么：(((i+2)*(i+1)-i)/2)*(i+1)以上计算可用于基于线性索引确定i的索引。我知道使用上述断点解析线性索引的唯一方法是通过二进制搜索。然后，我们将重复上述方法（确定每个“级别”的迭代次数，并使用二进制搜索将线性索引映射到级别索引）来计算下一个索引j和k。剩余数量将是l指数。为了简化讨论的其余部分，我改用了修改后的映射版本：l loop limit, for i=1: (total 6 iterations for i=1) j 0 1k 0 0 0 1 1 1l loop limit, for i=2: (total 18 iterations for i=2) j 0 1 2k 0 0 0 0 1 1 1 1 2 2 2 2l loop limit, for i=3: (total 40 iterations for i=3) j 0 1 2 3k 0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 etc.从中我们将有一些浪费的线程，这些线程将在内核中使用if条件进行处理。上面每个i级的迭代次数仅为：(i+2)*(i+1)*(i+1)/2给定线性索引后，根据上述公式计算了i-index（在summation of the above polynomial上使用二进制搜索），然后可以通过将剩余空间划分为i个相等大小的片段来轻松计算下一个索引（j）。可以使用triangular mapping method找到下一个k索引，然后剩余空间成为我们的l循环范围。但是，我们必须记住要像前面讨论的那样对这个l索引进行条件处理。这是方法2的完整代码：#include <stdio.h>#include <iostream>// the closed-form summation of the polynomial (i+2)*(i+1)*(i+1)/2 from 0 to n__host__ __device__ unsigned my_func(unsigned i){ return 1+(2*i+(((i*(i+1))/2)*((i*(i+1))/2)) + (i*(i+1)*((2*i)+1)*2)/3 + ((5*i*(i+1))/2))/2; }// binary search__host__ __device__ unsigned bsearch_functional(const unsigned key, const unsigned len_a, unsigned (*func)(unsigned)){ unsigned lower = 0; unsigned upper = len_a; unsigned midpt; while (lower < upper){ midpt = (lower + upper)>>1; if (func(midpt) <= key) lower = midpt +1; else upper = midpt; } return lower;}// conversion of linear index to triangular matrix (row,col) index__host__ __device__ void linear_to_triangular(const unsigned i, const unsigned n, unsigned *trow, unsigned *tcol){ int c = i; int row = c/(n-1); int col = c%(n-1); if (col < row) { col = (n-2)-col; row = (n-1)-row; } *tcol = col+1; *trow = row;}__global__ void kernel(unsigned int nao, unsigned int eris_size){ unsigned idx = threadIdx.x+blockDim.x*blockIdx.x; if (idx < eris_size){ // compute i-index via binary search unsigned int i = bsearch_functional(idx, nao, my_func); // compute j-index via division of the space by i; unsigned int j = (idx==0)?0:(idx-my_func(i-1))/((my_func(i)-my_func(i-1))/(i+1)); unsigned k,l; linear_to_triangular((idx-my_func(i-1)-(j *((my_func(i)-my_func(i-1))/(i+1)))), (i+2), &k, &l); k = i-k; l = (i+1)-l; if (idx == 0) {k=0; l=0;} if (l <= ((i==k)?j:k)) printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l); }}int main(){ unsigned int nao = 7; int m = nao*(nao+1)/2; int eris_size = m*(m+1)/2; const int nTPB = 64; std::cout<<"The total size of the sack of ERIs is: "<<eris_size<<std::endl; int mod_eris_size = my_func(nao-1); kernel<<<mod_eris_size+nTPB-1/nTPB, nTPB>>>(nao, mod_eris_size); cudaDeviceSynchronize(); return 0;}需要明确的是，我不确切知道您的任务是什么，并且我不保证这些示例对于任何给定的用法都是没有错误的。我的目的不是给您一个黑匣子，而是向您解释一些可能的编程方法。我没有进行严格的验证，只是观察到每种方法都产生了406行ERI输出，与您的原始C代码相同。最后，为了简洁起见，我省略了proper cuda error checking，但是任何时候在使用cuda代码时遇到麻烦，我建议使用它并在i中运行代码。关于c++ - 在CUDA中处理4D张量的内核，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/34529387/