I have implemented a 2D median filter in CUDA and the whole program is shown below.
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <iomanip>
#include <windows.h>
#include <io.h>
#include <stdio.h>
#include <conio.h>
#include <cstdlib>
#include <process.h>
#include <stdlib.h>
#include <malloc.h>
#include <ctime>
using namespace std;
#define MEDIAN_DIMENSION 3 // For a 3 x 3 window. We can use 5 x 5, 7 x 7, 9 x 9, ...
#define MEDIAN_LENGTH 9 // Should be MEDIAN_DIMENSION x MEDIAN_DIMENSION = 3 x 3
#define BLOCK_WIDTH 16 // Should be 8 if the window is larger than 5 x 5, else a "uses too much shared data" error occurs at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
#define BLOCK_HEIGHT 16 // Should be 8 if the window is larger than 5 x 5, else a "uses too much shared data" error occurs at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
__global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){
__shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH];
int iterator;
const int Half_Of_MEDIAN_LENGTH =(MEDIAN_LENGTH/2)+1;
int StartPoint=MEDIAN_DIMENSION/2;
int EndPoint=StartPoint+1;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x; // linear thread index within the block (blockDim.x, not blockDim.y)
if(x>=Image_Width || y>=Image_Height)
return;
//Fill surround with pixel value of Image in Matrix Pettern of MEDIAN_DIMENSION x MEDIAN_DIMENSION
if (x == 0 || x == Image_Width - StartPoint || y == 0
|| y == Image_Height - StartPoint) {
} else {
iterator = 0;
for (int r = x - StartPoint; r < x + (EndPoint); r++) {
for (int c = y - StartPoint; c < y + (EndPoint); c++) {
surround[tid][iterator] =*(Device_ImageData+(c*Image_Width)+r);
iterator++;
}
}
//Sort the surround array to find the median. Use the bubble sort below for a 3 x 3 window.
//You can use the insertion sort commented out below for larger windows.
//// bubble sort //
for ( int i=0; i<Half_Of_MEDIAN_LENGTH; ++i)
{
// Find position of minimum element
int min=i;
for ( int l=i+1; l<MEDIAN_LENGTH; ++l)
if (surround[tid][l] <surround[tid][min] )
min=l;
// Put found minimum element in its place
unsigned short temp= surround[tid][i];
surround[tid][i]=surround[tid][min];
surround[tid][min]=temp;
}// bubble sort end
//////insertion sort start //
/*int t,j,i;
for ( i = 1 ; i< MEDIAN_LENGTH ; i++) {
j = i;
while ( j > 0 && surround[tid][j] < surround[tid][j-1]) {
t= surround[tid][j];
surround[tid][j]= surround[tid][j-1];
surround[tid][j-1] = t;
j--;
}
}*/
////insertion sort end
*(Device_ImageData+(y*Image_Width)+x)= surround[tid][Half_Of_MEDIAN_LENGTH-1]; // gives surround[tid][4] as the median value when a 3 x 3 window is used
__syncthreads();
}
}
int main( int argc, const char** argv )
{
int dataLength;
unsigned short* Host_ImageData = NULL;
ifstream is; // Read File
is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary );
// get length of file:
is.seekg (0, ios::end);
dataLength = is.tellg();
is.seekg (0, ios::beg);
Host_ImageData = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
is.read ((char*)Host_ImageData,dataLength);
is.close();
int Image_Width = 1580;
int Image_Height = 1050;
unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength);
unsigned short *Device_ImageData = NULL;
/////////////////////////////
// As the first cudaMalloc takes more time for memory allocation, I don't want to consider this time in my process.
// So please ignore the code for displaying the first cudaMalloc time.
clock_t begin = clock();
unsigned short *forFirstCudaMalloc = NULL;
cudaMalloc( (void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short) );
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
cout<<"First CudaMelloc time = "<<elapsed_secs<<" Second\n" ;
cudaFree( forFirstCudaMalloc );
////////////////////////////
//Actual Process Starts From Here
clock_t beginOverAll = clock(); //
cudaMalloc( (void**)&Device_ImageData, dataLength * sizeof(unsigned short) );
cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice);// copying Host Data To Device Memory For Filtering
int x = static_cast<int>(ceilf(static_cast<float>(Image_Width) / BLOCK_WIDTH));
int y = static_cast<int>(ceilf(static_cast<float>(Image_Height) / BLOCK_HEIGHT));
const dim3 grid (x, y, 1);
const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);
begin = clock();
MedianFilter_gpu<<<grid,block>>>( Device_ImageData, Image_Width, Image_Height);
cudaDeviceSynchronize();
end = clock();
elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
cout<<"Process time = "<<elapsed_secs<<" Second\n" ;
cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost); // copy the device data back to host memory, to be written to file after filtering is done
clock_t endOverall = clock();
elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC;
cout<<"Complete Time = "<<elapsed_secs<<" Second\n" ;
ofstream of2; //Write Filtered Image Into File
of2.open("D:\\Filtered_Image.raw", ios::binary);
of2.write((char*)Host_ResultData,dataLength);
of2.close();
cout<<"\nEnd of Writing File. Press Any Key To Exit..!!";
cudaFree(Device_ImageData);
delete [] Host_ImageData; // allocated with new[], so delete []
free(Host_ResultData); // allocated with malloc, so free
getch();
return 0;
}
Here is the link for the file I use. I used ImageJ to store the image in "raw" format, and the same for reading the "raw" image. My image pixels are 16-bit, unsigned short. The width of the image is 1580 and the height is 1050.
I strongly believe that the filter can be made more efficient and fast by using proper CUDA optimization.
Indeed, I'm running on a GeForce GT 520M card and the timings are the following:
1) For MEDIAN_DIMENSION of 3 x 3 = 0.027 seconds
2) For MEDIAN_DIMENSION of 5 x 5 = 0.206 seconds
3) For MEDIAN_DIMENSION of 7 x 7 = 1.11 seconds
4) For MEDIAN_DIMENSION of 9 x 9 = 4.931 seconds
As you can see, the time increases very quickly as MEDIAN_DIMENSION grows, and I have applications where I generally use a higher MEDIAN_DIMENSION like 7 x 7 and 9 x 9. Part of that growth is inherent to the sort: the partial selection sort above does about 30 comparisons per pixel for a 3 x 3 window but about 2,460 for 9 x 9, an ~80x increase. I think that, by using CUDA, even for 9 x 9 the time should be less than 1 second.
Since I think that the sorting part is taking most of the time here, can we make the sorting part of the algorithm faster?
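For the 3 x 3 case, for instance, a well-known alternative to a generic sort is a fixed min/max compare-exchange network that computes just the median in 19 exchanges. A minimal sketch of such a device helper (the names cmpSwap and median9 are hypothetical, not part of the program above):
__device__ __forceinline__ void cmpSwap(unsigned short &a, unsigned short &b) {
    // --- After this call, a <= b
    unsigned short lo = min(a, b);
    b = max(a, b);
    a = lo;
}
__device__ unsigned short median9(unsigned short p[9]) {
    // --- Classic 19-exchange median-of-9 network (e.g., N. Devillard's opt_med9)
    cmpSwap(p[1], p[2]); cmpSwap(p[4], p[5]); cmpSwap(p[7], p[8]);
    cmpSwap(p[0], p[1]); cmpSwap(p[3], p[4]); cmpSwap(p[6], p[7]);
    cmpSwap(p[1], p[2]); cmpSwap(p[4], p[5]); cmpSwap(p[7], p[8]);
    cmpSwap(p[0], p[3]); cmpSwap(p[5], p[8]); cmpSwap(p[4], p[7]);
    cmpSwap(p[3], p[6]); cmpSwap(p[1], p[4]); cmpSwap(p[2], p[5]);
    cmpSwap(p[4], p[7]); cmpSwap(p[4], p[2]); cmpSwap(p[6], p[4]);
    cmpSwap(p[4], p[2]);
    return p[4]; // --- p[4] now holds the median of the nine inputs
}
Such a network avoids the data-dependent control flow of the sorting loop; for larger windows like 7 x 7 and 9 x 9, histogram-based selection is the usual alternative to sorting.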
Can we use grid and block more efficiently? Can I use larger BLOCK_WIDTH and BLOCK_HEIGHT (like 32 and 32) and still not hit the maximum __shared__ memory limit, which is 4Kb for my device?
Can __shared__ memory be used more efficiently?
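For reference, the surround array in the kernel above occupies BLOCK_WIDTH * BLOCK_HEIGHT * MEDIAN_LENGTH * sizeof(unsigned short) bytes of shared memory per block: 16 * 16 * 9 * 2 = 4608 bytes for the 3 x 3 window, and it would be 32 * 32 * 9 * 2 = 18432 bytes per block with 32 x 32 blocks.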
Any help will be appreciated.
Thanks in advance.
I'm answering your last question on the use of shared memory.
As already noticed by Eric, your use of shared memory does not really lead to thread collaboration: each thread fills and reads only its own row surround[tid][...], so the array serves as per-thread scratch space rather than as data actually shared between threads.
I'm comparing your solution, for the 3x3 case, with a variant of your kernel not using shared memory at all, as well as with the Accelereyes solution discussed in "2D median filtering in CUDA: how to efficiently copy global memory to shared memory".
Here is the complete code:
#include <iostream>
#include <fstream>
using namespace std;
#define BLOCK_WIDTH 16
#define BLOCK_HEIGHT 16
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
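// --- e.g., iDivUp(1580, 16) = 99, so the grid fully covers the 1580-pixel image width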
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
const int tx_l = threadIdx.x; // --- Local thread x index
const int ty_l = threadIdx.y; // --- Local thread y index
const int tx_g = blockIdx.x * blockDim.x + tx_l; // --- Global thread x index
const int ty_g = blockIdx.y * blockDim.y + ty_l; // --- Global thread y index
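// --- Note: this kernel assumes the grid does not overshoot the image; for image sizes that are not multiples of the block size, tx_g and ty_g would need bounds checks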
__shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];
// --- Fill the shared memory border with zeros
if (tx_l == 0) smem[tx_l] [ty_l+1] = 0; // --- left border
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+1] = 0; // --- right border
if (ty_l == 0) { smem[tx_l+1][ty_l] = 0; // --- upper border
if (tx_l == 0) smem[tx_l] [ty_l] = 0; // --- top-left corner
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l] = 0; // --- top-right corner
} else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2] = 0; // --- bottom border
if (tx_l == 0) smem[tx_l] [ty_l+2] = 0; // --- bottom-left corner
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2] = 0; // --- bottom-right corner
}
// --- Fill shared memory
smem[tx_l+1][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g]; // --- center
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1]; // --- left border
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1]; // --- right border
if ((ty_l == 0)&&(ty_g > 0)) { smem[tx_l+1][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g]; // --- upper border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- top-left corner
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g+1]; // --- top-right corner
} else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) { smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g]; // --- bottom border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g-1]; // --- bottom-left corner
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1]; // --- bottom-right corner
}
__syncthreads();
// --- Pull the 3x3 window in a local array
unsigned short v[9] = { smem[tx_l][ty_l], smem[tx_l+1][ty_l], smem[tx_l+2][ty_l],
smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1], smem[tx_l+2][ty_l+1],
smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2], smem[tx_l+2][ty_l+2] };
// --- Bubble-sort
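// --- Only 5 outer passes are needed: after pass i, v[i] holds the i-th smallest value, and only the median v[4] is read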
for (int i = 0; i < 5; i++) {
for (int j = i + 1; j < 9; j++) {
if (v[i] > v[j]) { // swap?
unsigned short tmp = v[i];
v[i] = v[j];
v[j] = tmp;
}
}
}
// --- Pick the middle one
Output_Image[ty_g*Image_Width + tx_g] = v[4];
}
/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
__global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
__shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];
int iterator;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
// --- Fill shared memory
iterator = 0;
for (int r = x - 1; r <= x + 1; r++) {
for (int c = y - 1; c <= y + 1; c++) {
surround[tid][iterator] = Input_Image[c*Image_Width+r];
iterator++;
}
}
// --- Sort shared memory to find the median using bubble sort
for (int i=0; i<5; ++i) {
// --- Find the position of the minimum element
int minval=i;
for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;
// --- Put found minimum element in its place
unsigned short temp = surround[tid][i];
surround[tid][i]=surround[tid][minval];
surround[tid][minval]=temp;
}
// --- Pick the middle one
Output_Image[(y*Image_Width)+x]=surround[tid][4];
__syncthreads();
}
/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
__global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
unsigned short surround[9];
int iterator;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
// --- Fill array private to the threads
iterator = 0;
for (int r = x - 1; r <= x + 1; r++) {
for (int c = y - 1; c <= y + 1; c++) {
surround[iterator] = Input_Image[c*Image_Width+r];
iterator++;
}
}
// --- Sort the private array to find the median using bubble sort
for (int i=0; i<5; ++i) {
// --- Find the position of the minimum element
int minval=i;
for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;
// --- Put found minimum element in its place
unsigned short temp = surround[i];
surround[i]=surround[minval];
surround[minval]=temp;
}
// --- Pick the middle one
Output_Image[(y*Image_Width)+x]=surround[4];
}
/********/
/* MAIN */
/********/
int main()
{
const int Image_Width = 1580;
const int Image_Height = 1050;
// --- Open data file
ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );
// --- Get file length
is.seekg(0, ios::end);
int dataLength = is.tellg();
is.seekg(0, ios::beg);
// --- Read data from file and close file
unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
is.read((char*)Input_Image_Host,dataLength);
is.close();
// --- CUDA warm up
unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
gpuErrchk(cudaFree(forFirstCudaMalloc));
// --- Allocate host and device memory spaces
unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short)));
unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short)));
// --- Copy data from host to device
gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering
// --- Grid and block sizes
const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);
const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);
/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Original_Kernel_Function, cudaFuncCachePreferShared);
Original_Kernel_Function<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Original kernel function - elapsed time: %3.3f ms \n", time);
/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1);
Original_Kernel_Function_no_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Original kernel function - no shared - elapsed time: %3.3f ms \n", time);
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Optimized kernel function - shared - elapsed time: %3.3f ms \n", time);
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));
// --- Open results file, write results and close the file
ofstream of2; of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw", ios::binary);
of2.write((char*)Output_Image_Host, dataLength);
of2.close();
cout << "\n Press Any Key To Exit..!!";
gpuErrchk(cudaFree(Input_Image));
delete [] Input_Image_Host; // allocated with new[], so delete []
free(Output_Image_Host); // allocated with malloc, so free
return 0;
}
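The program above is self-contained; a typical build line, assuming the source is saved as median_filter.cu (the architecture flag targets the K20c and should be adjusted for other GPUs), would be something like:
nvcc -O3 -arch=sm_35 median_filter.cu -o median_filter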
Here are the timing results on a Kepler K20c:
1580 x 1050
Original_Kernel_Function = 1.588ms
Original_Kernel_Function_no_shared = 1.278ms
Optimized_Kernel_Function_shared = 1.455ms
2048 x 2048
Original_Kernel_Function = 3.94ms
Original_Kernel_Function_no_shared = 3.118ms
Optimized_Kernel_Function_shared = 3.709ms
4096 x 4096
Original_Kernel_Function = 16.003ms
Original_Kernel_Function_no_shared = 13.735ms
Optimized_Kernel_Function_shared = 14.526ms
8192 x 8192
Original_Kernel_Function = 62.278ms
Original_Kernel_Function_no_shared = 47.484ms
Optimized_Kernel_Function_shared = 57.474ms
Here are the timing results on a GT540M, which is more similar to your card:
1580 x 1050
Original_Kernel_Function = 10.332 ms
Original_Kernel_Function_no_shared = 9.294 ms
Optimized_Kernel_Function_shared = 10.301 ms
2048 x 2048
Original_Kernel_Function = 25.256 ms
Original_Kernel_Function_no_shared = 23.567 ms
Optimized_Kernel_Function_shared = 23.876 ms
4096 x 4096
Original_Kernel_Function = 99.791 ms
Original_Kernel_Function_no_shared = 93.919 ms
Optimized_Kernel_Function_shared = 95.464 ms
8192 x 8192
Original_Kernel_Function = 399.259 ms
Original_Kernel_Function_no_shared = 375.634 ms
Optimized_Kernel_Function_shared = 383.121 ms
As can be seen, the version not using shared memory appears to be (slightly) faster in all cases: for such a small 3 x 3 window, the L1 cache apparently serves the overlapping reads well enough that staging data in shared memory does not pay off.