I have implemented a 2D median filter in CUDA and the whole program is shown below.
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <iomanip>
#include <windows.h>
#include <io.h>
#include <stdio.h>
#include <conio.h>
#include <cstdlib>
#include <process.h>
#include <stdlib.h>
#include <malloc.h>
#include <ctime>
using namespace std;
#define MEDIAN_DIMENSION 3 // For a 3 x 3 window. We can use 5 x 5, 7 x 7, 9 x 9, ...
#define MEDIAN_LENGTH 9 // Should be MEDIAN_DIMENSION x MEDIAN_DIMENSION = 3 x 3
#define BLOCK_WIDTH 16 // Should be 8 if the window is larger than 5 x 5, else a "uses too much shared data" error occurs at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
#define BLOCK_HEIGHT 16 // Should be 8 if the window is larger than 5 x 5, else a "uses too much shared data" error occurs at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
__global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){
__shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH];
int iterator;
const int Half_Of_MEDIAN_LENGTH =(MEDIAN_LENGTH/2)+1;
int StartPoint=MEDIAN_DIMENSION/2;
int EndPoint=StartPoint+1;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x; // linear thread index within the block (blockDim.x, not blockDim.y)
if(x>=Image_Width || y>=Image_Height)
return;
//Fill surround with pixel value of Image in Matrix Pettern of MEDIAN_DIMENSION x MEDIAN_DIMENSION
if (x == 0 || x == Image_Width - StartPoint || y == 0
|| y == Image_Height - StartPoint) {
} else {
iterator = 0;
for (int r = x - StartPoint; r < x + (EndPoint); r++) {
for (int c = y - StartPoint; c < y + (EndPoint); c++) {
surround[tid][iterator] =*(Device_ImageData+(c*Image_Width)+r);
iterator++;
}
}
//Sort the surround array to find the median. Use the bubble sort below for a 3 x 3 window.
//You can use the insertion sort commented out below for larger windows.
//// bubble sort //
for ( int i=0; i<Half_Of_MEDIAN_LENGTH; ++i)
{
// Find position of minimum element
int min=i;
for ( int l=i+1; l<MEDIAN_LENGTH; ++l)
if (surround[tid][l] <surround[tid][min] )
min=l;
// Put found minimum element in its place
unsigned short temp= surround[tid][i];
surround[tid][i]=surround[tid][min];
surround[tid][min]=temp;
}// bubble sort end
//////insertion sort start //
/*int t,j,i;
for ( i = 1 ; i< MEDIAN_LENGTH ; i++) {
j = i;
while ( j > 0 && surround[tid][j] < surround[tid][j-1]) {
t= surround[tid][j];
surround[tid][j]= surround[tid][j-1];
surround[tid][j-1] = t;
j--;
}
}*/
////insertion sort end
*(Device_ImageData+(y*Image_Width)+x)= surround[tid][Half_Of_MEDIAN_LENGTH-1]; // gives surround[tid][4] as the median value when a 3 x 3 window is used
__syncthreads();
}
}
int main( int argc, const char** argv )
{
int dataLength;
unsigned short* Host_ImageData = NULL;
ifstream is; // Read File
is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary );
// get length of file:
is.seekg (0, ios::end);
dataLength = is.tellg();
is.seekg (0, ios::beg);
Host_ImageData = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
is.read ((char*)Host_ImageData,dataLength);
is.close();
int Image_Width = 1580;
int Image_Height = 1050;
unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength);
unsigned short *Device_ImageData = NULL;
/////////////////////////////
// As the first cudaMalloc takes more time for memory allocation, I don't want to consider this time in my process.
// So please ignore the code for displaying the first cudaMalloc time.
clock_t begin = clock();
unsigned short *forFirstCudaMalloc = NULL;
cudaMalloc( (void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short) );
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
cout<<"First CudaMelloc time = "<<elapsed_secs<<" Second\n" ;
cudaFree( forFirstCudaMalloc );
////////////////////////////
//Actual Process Starts From Here
clock_t beginOverAll = clock(); //
cudaMalloc( (void**)&Device_ImageData, dataLength * sizeof(unsigned short) );
cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice);// copying Host Data To Device Memory For Filtering
int x = static_cast<int>(ceilf(static_cast<float>(Image_Width) / BLOCK_WIDTH));
int y = static_cast<int>(ceilf(static_cast<float>(Image_Height) / BLOCK_HEIGHT));
const dim3 grid (x, y, 1);
const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);
begin = clock();
MedianFilter_gpu<<<grid,block>>>( Device_ImageData, Image_Width, Image_Height);
cudaDeviceSynchronize();
end = clock();
elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
cout<<"Process time = "<<elapsed_secs<<" Second\n" ;
cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost); // copy the device data back to host memory, to be written to file after filtering is done
clock_t endOverall = clock();
elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC;
cout<<"Complete Time = "<<elapsed_secs<<" Second\n" ;
ofstream of2; //Write Filtered Image Into File
of2.open("D:\\Filtered_Image.raw", ios::binary);
of2.write((char*)Host_ResultData,dataLength);
of2.close();
cout<<"\nEnd of Writing File. Press Any Key To Exit..!!";
cudaFree(Device_ImageData);
delete [] Host_ImageData; // allocated with new[], so delete []
free(Host_ResultData); // allocated with malloc, so free
getch();
return 0;
}
Here is the link for the file I use. I used ImageJ to store the image in "raw" format, and the same for reading the "raw" image. My image pixels are 16-bit, unsigned short. The width of the image is 1580 and the height is 1050.
I strongly believe that the filter can be made more efficient and fast by using proper CUDA optimization.
Indeed, I'm running on a GeForce GT 520M card and the timings are the following:
1) For MEDIAN_DIMENSION of 3 x 3 = 0.027 seconds
2) For MEDIAN_DIMENSION of 5 x 5 = 0.206 seconds
3) For MEDIAN_DIMENSION of 7 x 7 = 1.11 seconds
4) For MEDIAN_DIMENSION of 9 x 9 = 4.931 seconds
As you can see, the time increases very quickly as MEDIAN_DIMENSION grows, and I have applications where I generally use a higher MEDIAN_DIMENSION like 7 x 7 and 9 x 9. Part of that growth is inherent to the sort: the partial selection sort above does about 30 comparisons per pixel for a 3 x 3 window but about 2,460 for 9 x 9, an ~80x increase. I think that, by using CUDA, even for 9 x 9 the time should be less than 1 second.
Since I think that the sorting part is taking most of the time here, can we make the sorting part of the algorithm faster?
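For the 3 x 3 case, for instance, a well-known alternative to a generic sort is a fixed min/max compare-exchange network that computes just the median in 19 exchanges. A minimal sketch of such a device helper (the names cmpSwap and median9 are hypothetical, not part of the program above):
__device__ __forceinline__ void cmpSwap(unsigned short &a, unsigned short &b) {
    // --- After this call, a <= b
    unsigned short lo = min(a, b);
    b = max(a, b);
    a = lo;
}
__device__ unsigned short median9(unsigned short p[9]) {
    // --- Classic 19-exchange median-of-9 network (e.g., N. Devillard's opt_med9)
    cmpSwap(p[1], p[2]); cmpSwap(p[4], p[5]); cmpSwap(p[7], p[8]);
    cmpSwap(p[0], p[1]); cmpSwap(p[3], p[4]); cmpSwap(p[6], p[7]);
    cmpSwap(p[1], p[2]); cmpSwap(p[4], p[5]); cmpSwap(p[7], p[8]);
    cmpSwap(p[0], p[3]); cmpSwap(p[5], p[8]); cmpSwap(p[4], p[7]);
    cmpSwap(p[3], p[6]); cmpSwap(p[1], p[4]); cmpSwap(p[2], p[5]);
    cmpSwap(p[4], p[7]); cmpSwap(p[4], p[2]); cmpSwap(p[6], p[4]);
    cmpSwap(p[4], p[2]);
    return p[4]; // --- p[4] now holds the median of the nine inputs
}
Such a network avoids the data-dependent control flow of the sorting loop; for larger windows like 7 x 7 and 9 x 9, histogram-based selection is the usual alternative to sorting.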
Can we use grid and block more efficiently? Can I use larger BLOCK_WIDTH and BLOCK_HEIGHT (like 32 and 32) and still not hit the maximum __shared__ memory limit, which is 4Kb for my device?
Can __shared__ memory be used more efficiently?
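For reference, the surround array in the kernel above occupies BLOCK_WIDTH * BLOCK_HEIGHT * MEDIAN_LENGTH * sizeof(unsigned short) bytes of shared memory per block: 16 * 16 * 9 * 2 = 4608 bytes for the 3 x 3 window, and it would be 32 * 32 * 9 * 2 = 18432 bytes per block with 32 x 32 blocks.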
Any help will be appreciated.
Thanks in advance.
I'm answering your last question on the use of shared memory.
As already noticed by Eric, your use of shared memory does not really lead to thread collaboration: each thread fills and reads only its own row surround[tid][...], so the array serves as per-thread scratch space rather than as data actually shared between threads.
I'm comparing your solution, for the 3x3 case, with a variant of your kernel not using shared memory at all, as well as with the Accelereyes solution discussed in "2D median filtering in CUDA: how to efficiently copy global memory to shared memory".
Here is the complete code:
#include <iostream>
#include <fstream>
using namespace std;
#define BLOCK_WIDTH 16
#define BLOCK_HEIGHT 16
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
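// --- e.g., iDivUp(1580, 16) = 99, so the grid fully covers the 1580-pixel image width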
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
const int tx_l = threadIdx.x; // --- Local thread x index
const int ty_l = threadIdx.y; // --- Local thread y index
const int tx_g = blockIdx.x * blockDim.x + tx_l; // --- Global thread x index
const int ty_g = blockIdx.y * blockDim.y + ty_l; // --- Global thread y index
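// --- Note: this kernel assumes the grid does not overshoot the image; for image sizes that are not multiples of the block size, tx_g and ty_g would need bounds checks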
__shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];
// --- Fill the shared memory border with zeros
if (tx_l == 0) smem[tx_l] [ty_l+1] = 0; // --- left border
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+1] = 0; // --- right border
if (ty_l == 0) { smem[tx_l+1][ty_l] = 0; // --- upper border
if (tx_l == 0) smem[tx_l] [ty_l] = 0; // --- top-left corner
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l] = 0; // --- top-right corner
} else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2] = 0; // --- bottom border
if (tx_l == 0) smem[tx_l] [ty_l+2] = 0; // --- bottom-left corner
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2] = 0; // --- bottom-right corner
}
// --- Fill shared memory
smem[tx_l+1][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g]; // --- center
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1]; // --- left border
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1]; // --- right border
if ((ty_l == 0)&&(ty_g > 0)) { smem[tx_l+1][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g]; // --- upper border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- top-left corner
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g+1]; // --- top-right corner
} else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) { smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g]; // --- bottom border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g-1]; // --- bottom-left corner
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1]; // --- bottom-right corner
}
__syncthreads();
// --- Pull the 3x3 window in a local array
unsigned short v[9] = { smem[tx_l][ty_l], smem[tx_l+1][ty_l], smem[tx_l+2][ty_l],
smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1], smem[tx_l+2][ty_l+1],
smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2], smem[tx_l+2][ty_l+2] };
// --- Bubble-sort
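// --- Only 5 outer passes are needed: after pass i, v[i] holds the i-th smallest value, and only the median v[4] is read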
for (int i = 0; i < 5; i++) {
for (int j = i + 1; j < 9; j++) {
if (v[i] > v[j]) { // swap?
unsigned short tmp = v[i];
v[i] = v[j];
v[j] = tmp;
}
}
}
// --- Pick the middle one
Output_Image[ty_g*Image_Width + tx_g] = v[4];
}
/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
__global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
__shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];
int iterator;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
// --- Fill shared memory
iterator = 0;
for (int r = x - 1; r <= x + 1; r++) {
for (int c = y - 1; c <= y + 1; c++) {
surround[tid][iterator] = Input_Image[c*Image_Width+r];
iterator++;
}
}
// --- Sort shared memory to find the median using bubble sort
for (int i=0; i<5; ++i) {
// --- Find the position of the minimum element
int minval=i;
for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;
// --- Put found minimum element in its place
unsigned short temp = surround[tid][i];
surround[tid][i]=surround[tid][minval];
surround[tid][minval]=temp;
}
// --- Pick the middle one
Output_Image[(y*Image_Width)+x]=surround[tid][4];
__syncthreads();
}
/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
__global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
unsigned short surround[9];
int iterator;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
// --- Fill array private to the threads
iterator = 0;
for (int r = x - 1; r <= x + 1; r++) {
for (int c = y - 1; c <= y + 1; c++) {
surround[iterator] = Input_Image[c*Image_Width+r];
iterator++;
}
}
// --- Sort the private array to find the median using bubble sort
for (int i=0; i<5; ++i) {
// --- Find the position of the minimum element
int minval=i;
for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;
// --- Put found minimum element in its place
unsigned short temp = surround[i];
surround[i]=surround[minval];
surround[minval]=temp;
}
// --- Pick the middle one
Output_Image[(y*Image_Width)+x]=surround[4];
}
/********/
/* MAIN */
/********/
int main()
{
const int Image_Width = 1580;
const int Image_Height = 1050;
// --- Open data file
ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );
// --- Get file length
is.seekg(0, ios::end);
int dataLength = is.tellg();
is.seekg(0, ios::beg);
// --- Read data from file and close file
unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
is.read((char*)Input_Image_Host,dataLength);
is.close();
// --- CUDA warm up
unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
gpuErrchk(cudaFree(forFirstCudaMalloc));
// --- Allocate host and device memory spaces
unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short)));
unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short)));
// --- Copy data from host to device
gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering
// --- Grid and block sizes
const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);
const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);
/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Original_Kernel_Function, cudaFuncCachePreferShared);
Original_Kernel_Function<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Original kernel function - elapsed time: %3.3f ms \n", time);
/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1);
Original_Kernel_Function_no_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Original kernel function - no shared - elapsed time: %3.3f ms \n", time);
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Optimized kernel function - shared - elapsed time: %3.3f ms \n", time);
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));
// --- Open results file, write results and close the file
ofstream of2; of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw", ios::binary);
of2.write((char*)Output_Image_Host, dataLength);
of2.close();
cout << "\n Press Any Key To Exit..!!";
gpuErrchk(cudaFree(Input_Image));
delete [] Input_Image_Host; // allocated with new[], so delete []
free(Output_Image_Host); // allocated with malloc, so free
return 0;
}
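The program above is self-contained; a typical build line, assuming the source is saved as median_filter.cu (the architecture flag targets the K20c and should be adjusted for other GPUs), would be something like:
nvcc -O3 -arch=sm_35 median_filter.cu -o median_filter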
Here are the timing results on a Kepler K20c:
1580 x 1050
Original_Kernel_Function = 1.588ms
Original_Kernel_Function_no_shared = 1.278ms
Optimized_Kernel_Function_shared = 1.455ms
2048 x 2048
Original_Kernel_Function = 3.94ms
Original_Kernel_Function_no_shared = 3.118ms
Optimized_Kernel_Function_shared = 3.709ms
4096 x 4096
Original_Kernel_Function = 16.003ms
Original_Kernel_Function_no_shared = 13.735ms
Optimized_Kernel_Function_shared = 14.526ms
8192 x 8192
Original_Kernel_Function = 62.278ms
Original_Kernel_Function_no_shared = 47.484ms
Optimized_Kernel_Function_shared = 57.474ms
Here are the timing results on a GT540M, which is more similar to your card:
1580 x 1050
Original_Kernel_Function = 10.332 ms
Original_Kernel_Function_no_shared = 9.294 ms
Optimized_Kernel_Function_shared = 10.301 ms
2048 x 2048
Original_Kernel_Function = 25.256 ms
Original_Kernel_Function_no_shared = 23.567 ms
Optimized_Kernel_Function_shared = 23.876 ms
4096 x 4096
Original_Kernel_Function = 99.791 ms
Original_Kernel_Function_no_shared = 93.919 ms
Optimized_Kernel_Function_shared = 95.464 ms
8192 x 8192
Original_Kernel_Function = 399.259 ms
Original_Kernel_Function_no_shared = 375.634 ms
Optimized_Kernel_Function_shared = 383.121 ms
As can be seen, the version not using shared memory appears to be (slightly) faster in all cases: for such a small 3 x 3 window, the L1 cache apparently serves the overlapping reads well enough that staging data in shared memory does not pay off.