CUDA - Multiple sums in each vector element

This post looks at CUDA - multiple sums in each vector element; hopefully the question and its solution below are a useful reference for anyone facing a similar problem.

Problem description

The product of two series of Chebyshev polynomials with coefficients a and b can be represented by the formula below.
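(The formula image from the original question did not survive transcription. Reconstructed from the kernels that follow, so treat it as an inference rather than the original statement, the product coefficients are presumably:

$$c_k \;=\; \tfrac{1}{2}\Bigl(\sum_{j=0}^{k} a_j\,b_{k-j} \;+\; \sum_{j=1}^{n-k-1}\bigl(a_j\,b_{j+k} + a_{j+k}\,b_j\bigr)\Bigr), \qquad k = 0,\ldots,n-1.$$
)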
The problem is to parallelize this as much as possible.

I have managed to use CUDA to parallelize the formula above by simply applying one thread per vector element. Thus one thread performs the sums/multiplications.

#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>

__global__ void chebyprod(int n, float *a, float *b, float *c){
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   float sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j <= i; j++){
         sum += a[j]*b[j-i];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
   /*
   if (i < n)
      c[i] = a[i] + b[i];
   */
}

int main(void){
  clock_t tStart = clock();
  int N = 10000;
  float *a, *b, *c, *d_a, *d_b, *d_c;
  a = (float*)malloc(N*sizeof(float));
  b = (float*)malloc(N*sizeof(float));
  c = (float*)malloc(N*sizeof(float));

  cudaMalloc(&d_a, N*sizeof(float));
  cudaMalloc(&d_b, N*sizeof(float));
  cudaMalloc(&d_c, N*sizeof(float));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);

  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;
  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);
  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  printf("Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

  cudaMemcpy(c, d_c, N*sizeof(float), cudaMemcpyDeviceToHost);

  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout << "]\n";

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
}

In another code I have managed to use sum reduction to sum all the elements in a vector (someone else's code I copied from an NVIDIA presentation). The problem is now: how do I combine the two approaches? I want a bunch of threads to compute all of the sums/multiplications in every element of c. Any tips? Or perhaps a similar problem that I can learn from? Row reduction in a matrix can possibly be similar to this problem. However, I have multiple sums with different lengths and multiplications.
This is the code provided by an NVIDIA employee (I think):

template <unsigned int blockSize>
__global__ void parreduc(float *array_in, float *reduct, size_t array_len)
{
    extern volatile __shared__ float sdata[];
    size_t tid = threadIdx.x,
           gridSize = blockSize * gridDim.x,
           i = blockIdx.x * blockSize + tid;
    sdata[tid] = 0;
    while (i < array_len)
    { sdata[tid] += array_in[i];
      i += gridSize; }
    __syncthreads();
    if (blockSize >= 512)
    { if (tid < 256) sdata[tid] += sdata[tid + 256]; __syncthreads(); }
    if (blockSize >= 256)
    { if (tid < 128) sdata[tid] += sdata[tid + 128]; __syncthreads(); }
    if (blockSize >= 128)
    { if (tid < 64) sdata[tid] += sdata[tid + 64]; __syncthreads(); }
    if (tid < 32)
    { if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
      if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
      if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
      if (blockSize >= 8)  sdata[tid] += sdata[tid + 4];
      if (blockSize >= 4)  sdata[tid] += sdata[tid + 2];
      if (blockSize >= 2)  sdata[tid] += sdata[tid + 1]; }
    if (tid == 0) reduct[blockIdx.x] = sdata[0];
}

Solution

The code provided in the question is a sensible first step in realization. The thread strategy is the most common/typical: to assign one thread per output point (N output points here). Each thread must perform all calculations necessary to compute a particular output point. Motivations to improve performance of CUDA code should always address at least two CUDA optimization priorities:

1. Expose enough parallelism (roughly: create enough threads)
2. Make efficient use of memory (roughly: for global memory access, strive for coalescing)

With respect to item 1, the effectiveness of the code provided in the question will depend on the GPU. As a rough rule of thumb, we seek to launch at least 2048 threads (1024 on Turing) per SM in the GPU we are running on, to have a chance to "saturate" the GPU. For N = 10000, we can saturate a GPU with 5 SMs. For a Tesla V100, with 80 SMs, we have no hope of saturating that GPU with 10,000 threads.
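As a quick check of those numbers, using the 2048-threads-per-SM rule of thumb: $5 \times 2048 = 10240 \approx N$, so one thread per output point is just about enough for a 5-SM GPU, while a V100 would want roughly $80 \times 2048 = 163{,}840$ resident threads, far more than the 10,000 being launched.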
With respect to item 2, the code provided also falls short to some degree; it has issues when it comes to coalescing: adjacent threads in many cases are not reading adjacent values in memory. To pick just one example, the first global load I see there is a[j]. This loads the same value/location per thread, rather than adjacent values in adjacent threads.

Can we come up with an alternate realization that will possibly improve on both of these? We will consider the following change in thread strategy: assign one threadblock per output point, rather than one thread per output point. The calculations needed for each output point will be visualized as one "row" of a matrix. A threadblock will "stride" along the row, performing the calculations needed, and eventually doing a threadblock-level reduction to produce a single result for that row. This will allow us to address both items: adjacent threads in a warp will be able to read adjacent values from a and b, and we will also immediately be able to increase our total number of threads by a factor of up to 1024 (so, instead of 10 thousand threads, we could spin up ~10 million threads; 10 million is enough to saturate any current CUDA GPU). This thread strategy also has another nice feature: the "rows" of calculations mentioned above have varying length. The first and last rows will be the longest, with approximately N calculation elements, whereas the rows in the middle will have closer to N/2 calculation elements. By choosing a block-stride loop (conceptually similar to a grid-stride loop) we can efficiently handle varying row lengths. Each threadblock will "stride" along the row, only as far as needed, accumulating results.

Here is a worked example of that realization:

$ cat t1497.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   mt sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j <= i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}
// assume one threadblock per c_k coefficient
// assume a power-of-2 threadblock size
const int tpb_p2 = 8;
const int nTPB = 1<<tpb_p2;
const unsigned row_mask = ~((0xFFFFFFFFU>>tpb_p2)<<tpb_p2);

__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
#ifndef NO_WS
  __shared__ mt sd[32];
  if (threadIdx.x < 32) sd[threadIdx.x] = 0;
  __syncthreads();
#else
  __shared__ mt sd[nTPB];
#endif
  int k = blockIdx.x;
  mt sum = 0.0f;
  int row_width = (((k)>(n-k))?(k):(n-k))+1;
  int strides = (row_width>>tpb_p2) + ((row_width&row_mask)?1:0);
  int j = threadIdx.x;
  mt tmp_a;
  for (int s = 0; s < strides; s++){ // block-stride loop
    if (j < n) tmp_a = a[j];
    if (j <= k) sum += tmp_a*b[k-j];
    if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
    j += nTPB;
    }
#ifndef NO_WS
  // 1st warp-shuffle reduction
  int lane = threadIdx.x & (warpSize-1);
  int warpID = threadIdx.x >> 5; // assumes warpSize == 32
  unsigned mask = 0xFFFFFFFFU;
  for (int offset = warpSize>>1; offset > 0; offset >>= 1)
    sum += __shfl_down_sync(mask, sum, offset);
  if (lane == 0) sd[warpID] = sum;
  __syncthreads(); // put warp results in shared mem
  // hereafter, just warp 0
  if (warpID == 0){
    // reload val from shared mem if warp existed
    sum = sd[lane];
    // final warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(mask, sum, offset);
  }
#else
  sd[threadIdx.x] = sum;
  for (int s = nTPB>>1; s > 0; s>>=1){ // sweep reduction
    __syncthreads();
    if (threadIdx.x < s) sd[threadIdx.x] += sd[threadIdx.x+s];}
  if (!threadIdx.x) sum = sd[0];
#endif
  if (!threadIdx.x) c[k] = sum*0.5f;
}
int main(int argc, char *argv[]){
  int N = 10000;
  if (argc > 1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(N*sizeof(mt));
  b  = (mt*)malloc(N*sizeof(mt));
  c  = (mt*)malloc(N*sizeof(mt));
  ic = (mt*)malloc(N*sizeof(mt));

  cudaMalloc(&d_a, N*sizeof(mt));
  cudaMalloc(&d_b, N*sizeof(mt));
  cudaMalloc(&d_c, N*sizeof(mt));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);

  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;
  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);
  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  unsigned long long dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout << "]\n";
  dt = dtime_usec(0);
  chebyprod_imp<<< N, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << ic[k] << " ";
  std::cout << "]\n";
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
}
$ nvcc -arch=sm_52 -o t1497 t1497.cu
$ ./t1497
blockSize: 1024
gridSize: 10
Time taken: 0.001687s
no error
Vector c: [ 199.996 199.986 199.976 199.966 199.956 199.946 199.936 199.926 199.916 199.906 ]
Time taken: 0.000350s
no error
Vector c: [ 199.99 199.98 199.97 199.96 199.95 199.94 199.93 199.92 199.91 199.9 ]
Max error = 0.0137787
$

(Change the -arch switch to match your GPU.)

The above example shows that the modified algorithm runs about 5x faster (on a Tesla V100). Although there are numerical differences, these are due to floating point issues. To prove that the algorithm gives the correct result, switch the typedef from float to double. You will see that there is then essentially no longer any numerical difference in the results (suggesting that the algorithms are logically the same), and also that the improved algorithm version in float resolution gives answers for the first 10 elements that are numerically closer to the "more accurate" result produced with double arithmetic.

As discussed in the comments, this algorithm transformation may not be beneficial in every case. The principal benefit will come from exploiting GPUs with a larger thread capacity (larger than N threads). Relatively smaller GPUs (e.g. 8 SMs or less, perhaps, for N = 10000) may not benefit from this, and in fact the code may run slower than the original algorithm.

Although I mention coalescing, for N = 10000 the input data here is quite small (~80K bytes), which will fit in the L2 cache of most GPUs. Once the data is in the L2 cache, inefficient access patterns are much less of an issue. So the primary benefit of this algorithm in this case is probably due to item 1. If item 1 cannot be exploited, the algorithm shows little or no benefit.
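Whether the block-per-output-point strategy pays off therefore depends on the GPU's thread capacity relative to N. As a minimal sketch of how one might pick a kernel at run time (this is not part of the original answer; it assumes the chebyprod, chebyprod_imp, nTPB and mt definitions from the listing above, and uses the device's reported resident-thread capacity as a crude threshold):

#include <cuda_runtime.h>

// Hypothetical helper: choose a launch strategy based on how many threads
// the device can keep resident (SM count x max threads per SM).
void launch_chebyprod_auto(int n, const mt *d_a, const mt *d_b, mt *d_c)
{
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);  // query device 0
  int capacity = prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor;
  if (n >= capacity) {
    // one thread per output point already fills the device: keep the simple kernel
    int blockSize = 1024;
    int gridSize = (n + blockSize - 1)/blockSize;
    chebyprod<<<gridSize, blockSize>>>(n, d_a, d_b, d_c);
  } else {
    // expose more parallelism: one threadblock per output point
    chebyprod_imp<<<n, nTPB>>>(n, d_a, d_b, d_c);
  }
}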
For test purposes, I created another version using a warp-stride loop. However, it doesn't seem to be significantly faster on small GPUs and is actually slower on a V100:

#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   mt sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j <= i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}
// assume one warp per c_k coefficient
// assume a multiple-of-32 threadblock size
const int nTPB = 32*8;
const int warpSize_p2 = 5; // assumes warpSize == 32
const int nWarps = nTPB>>warpSize_p2;
const unsigned row_mask = ~((0xFFFFFFFFU>>warpSize_p2)<<warpSize_p2);

__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
  int warpID = threadIdx.x >> warpSize_p2;
  int k = blockIdx.x*(nWarps)+warpID;
  if (k < n){
    mt sum = 0.0f;
    int lane = threadIdx.x & (warpSize-1);
    int row_width = (((k)>(n-k))?(k):(n-k))+1;
    int strides = (row_width>>warpSize_p2) + ((row_width&row_mask)?1:0);
    int j = lane;
    mt tmp_a;
    for (int s = 0; s < strides; s++){ // warp-stride loop
      if (j < n) tmp_a = a[j];
      if (j <= k) sum += tmp_a*b[k-j];
      if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
      j += warpSize;
      }
    // warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
    if (lane == 0) c[k] = sum*0.5f;}
}
int main(int argc, char *argv[]){
  int N = 10000;
  if (argc > 1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(N*sizeof(mt));
  b  = (mt*)malloc(N*sizeof(mt));
  c  = (mt*)malloc(N*sizeof(mt));
  ic = (mt*)malloc(N*sizeof(mt));

  cudaMalloc(&d_a, N*sizeof(mt));
  cudaMalloc(&d_b, N*sizeof(mt));
  cudaMalloc(&d_c, N*sizeof(mt));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);

  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;
  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);
  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  unsigned long long dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout << "]\n";
  dt = dtime_usec(0);
  chebyprod_imp<<< (N/nWarps)+1, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << ic[k] << " ";
  std::cout << "]\n";
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
}

That concludes this article on CUDA - multiple sums in each vector element. Hopefully the answer above is helpful; thanks for reading.