问题描述
我想尝试一个带有 x * y
窗口的中值过滤器,其中 x
和 y
是奇数和程序的参数。
I'm trying to do a median filter with a window x*y
where x
and y
are odd and parameters of the program.
我的想法是先看看我可以在一个块中执行多少个线程,我有多少共享内存可用,像这样:
My idea is first see how many threads I can execute in a single block and how much shared memory I have avaliable, like this:
void cudaInit(int imgX, int imgY, int kx, int ky, int* cudaVars){
int device;
int deviceCount;
cudaDeviceProp deviceProp;
cudaGetDevice(&device);
cudaGetDeviceProperties(&deviceProp, device);
int kxMed = kx/2;
int kyMed = ky/2;
int n = deviceProp.maxThreadsPerBlock;
while(f(n,kxMed,kyMed)>deviceProp.sharedMemPerBlock){
n = n/2;
}
cudaVars[0] = n;
cudaVars[1] = imgX/cudaVars[0];
cudaVars[2] = imgY/cudaVars[0];
}
}
void mediaFilter_gpuB(uchar4* h_img,int width, int height, int kx, int ky){
assert(h_img!=NULL && width!=0 && height!=0);
int dev=0;
cudaDeviceProp deviceProp;
//DEVICE
uchar4* d_img;
uchar4* d_buf;
int cudaVars[3]={0};
cudaInit(width,height,kx,ky,cudaVars);
checkCudaErrors(cudaMalloc((void**) &(d_img), width*height*sizeof(unsigned char)*4));
checkCudaErrors(cudaMalloc((void**) &(d_buf), width*height*sizeof(unsigned char)*4));
cudaGetDevice(&dev);
cudaGetDeviceProperties(&deviceProp,dev);
checkCudaErrors(cudaMemcpy(d_img, h_img, width*height*sizeof(uchar4), cudaMemcpyHostToDevice));
dim3 dimGrid(cudaVars[1],cudaVars[2],1);
dim3 threads(cudaVars[0],1,1);
mediaFilterB<<<dimGrid,threads,f(cudaVars[0],kx/2,ky/2)>>>(d_buf,d_img,width,height, kx,ky,cudaVars[0]);
checkCudaErrors(cudaMemcpy(h_img, d_buf, width*height*sizeof(uchar4), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(d_img));
checkCudaErrors(cudaFree(d_buf));
}
__device__ void fillSmem(int* sMem, uchar4* buf, int width, int height, int kx, int ky){
int kyMed=ky/2;
int kxMed=kx/2;
int sWidth = 2*kxMed+gridDim.x;
int sHeight =2*kyMed+gridDim.x;
int X = blockIdx.x*gridDim.x+threadIdx.x;
int Y = blockIdx.y*gridDim.y;
int j=0;
while(threadIdx.x+j < sHeight){
for(int i=0;i<sWidth;i++){
sMem[threadIdx.x*gridDim.x+gridDim.x*j+i] = buf[X + i + (threadIdx.x + Y)*width + j*width].x;
}
j++;
}
}
c> mediaFilterB ,我只是将全局内存复制到共享内存,但它需要很多时间,即在图像中大约 5
8000 * 8000
像素。另一方面,没有CUDA的顺序算法需要 23
秒来计算图像的中值滤波器。
For the moment, in the function mediaFilterB
, I only copy global memory to shared memory, but it takes a lot of time, i.e., about 5
seconds in a image of 8000*8000
pixels. On the other hand, the sequential algorithm without CUDA takes 23
seconds to calculate the median filter of the image.
我知道我在做错误的过程中将全局内存复制到共享内存,我的算法效率很低,但我不知道如何纠正它。
I know that I`m doing something wrong in the process to copy global memory to shared memory and that my algorithm is very inefficient, but I don't know how I can correct it.
推荐答案
我提供此问题的答案,以从未回答的列表中删除它。
I'm providing an answer to this question to remove it from the unanswered list.
如何使用共享内存来改进CUDA的中值滤波是由Accelereyes开发的代码,可以从以下帖子下载:
The classical example on how using shared memory to improve median filtering with CUDA is the code developed by Accelereyes and made available for download from the following post:
这个想法是分配一个(BLOCK_WIDTH + 2)x(BLOCK_HEIGHT + 2)
大小的共享内存。在第一步骤,外部元件被归零。这些元素只有在它们对应于真正的图像元素时才会填充全局内存值,否则它们将保持为零。
The idea is to allocate a (BLOCK_WIDTH+2)x(BLOCK_HEIGHT+2)
sized shared memory. At a first step, the outer elements are zeroed. These elements will be filled with global memory values only if they correspond to true image elements, otherwise they will remain zero for padding.
为了方便起见,提供完整的工作代码。
For the sake of convenience, I'm providing below a full working code.
#include <iostream>
#include <fstream>
using namespace std;
#define BLOCK_WIDTH 16
#define BLOCK_HEIGHT 16
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
const int tx_l = threadIdx.x; // --- Local thread x index
const int ty_l = threadIdx.y; // --- Local thread y index
const int tx_g = blockIdx.x * blockDim.x + tx_l; // --- Global thread x index
const int ty_g = blockIdx.y * blockDim.y + ty_l; // --- Global thread y index
__shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];
// --- Fill the shared memory border with zeros
if (tx_l == 0) smem[tx_l] [ty_l+1] = 0; // --- left border
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+1] = 0; // --- right border
if (ty_l == 0) { smem[tx_l+1][ty_l] = 0; // --- upper border
if (tx_l == 0) smem[tx_l] [ty_l] = 0; // --- top-left corner
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l] = 0; // --- top-right corner
} else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2] = 0; // --- bottom border
if (tx_l == 0) smem[tx_l] [ty_l+2] = 0; // --- bottom-left corder
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2] = 0; // --- bottom-right corner
}
// --- Fill shared memory
smem[tx_l+1][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g]; // --- center
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1]; // --- left border
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1]; // --- right border
if ((ty_l == 0)&&(ty_g > 0)) { smem[tx_l+1][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g]; // --- upper border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- top-left corner
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g+1]; // --- top-right corner
} else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) { smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g]; // --- bottom border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+2] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- bottom-left corder
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1]; // --- bottom-right corner
}
__syncthreads();
// --- Pull the 3x3 window in a local array
unsigned short v[9] = { smem[tx_l][ty_l], smem[tx_l+1][ty_l], smem[tx_l+2][ty_l],
smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1], smem[tx_l+2][ty_l+1],
smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2], smem[tx_l+2][ty_l+2] };
// --- Bubble-sort
for (int i = 0; i < 5; i++) {
for (int j = i + 1; j < 9; j++) {
if (v[i] > v[j]) { // swap?
unsigned short tmp = v[i];
v[i] = v[j];
v[j] = tmp;
}
}
}
// --- Pick the middle one
Output_Image[ty_g*Image_Width + tx_g] = v[4];
}
/********/
/* MAIN */
/********/
int main()
{
const int Image_Width = 1580;
const int Image_Height = 1050;
// --- Open data file
ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );
// --- Get file length
is.seekg(0, ios::end);
int dataLength = is.tellg();
is.seekg(0, ios::beg);
// --- Read data from file and close file
unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
is.read((char*)Input_Image_Host,dataLength);
is.close();
// --- CUDA warm up
unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
gpuErrchk(cudaFree(forFirstCudaMalloc));
// --- Allocate host and device memory spaces
unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short)));
unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short)));
// --- Copy data from host to device
gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering
// --- Grid and block sizes
const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);
const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));
// --- Open results file, write results and close the file
ofstream of2; of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw", ios::binary);
of2.write((char*)Output_Image_Host, dataLength);
of2.close();
cout << "\n Press Any Key To Exit..!!";
gpuErrchk(cudaFree(Input_Image));
delete Input_Image_Host;
delete Output_Image_Host;
return 0;
}
这篇关于CUDA中的2D中值滤波:如何有效地将全局内存复制到共享内存的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!