问题描述
我想在CUDA中实现FIR(有限脉冲响应)滤波器。我的方法很简单,看起来有点像这样:
#include< cuda.h>
__global__ void filterData(const float * d_data,
const float * d_numerator,
float * d_filteredData,
const int numeratorLength,
const int filteredDataLength)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
float sum = 0.0f;
if(i< filteredDataLength)
{
for(int j = 0; j< numeratorLength; j ++)
{
//第一(分子长度-1)元素包含过滤器状态
sum + = d_numerator [j] * d_data [i + numeratorLength-j-1];
}
}
d_filteredData [i] = sum;
}
int main(void)
{
//(跳过错误检查以使代码更易读)
int dataLength = 18042;
int filteredDataLength = 16384;
int numeratorLength = 1659;
//指向数据,过滤数据和过滤系数的指针
//(跳过如何读入数组)
float * h_data = new float [dataLength];
float * h_filteredData = new float [filteredDataLength];
float * h_filter = new float [numeratorLength];
//创建设备指针
float * d_data = nullptr;
cudaMalloc((void **)& d_data,dataLength * sizeof(float));
float * d_numerator = nullptr;
cudaMalloc((void **)& d_numerator,numeratorLength * sizeof(float));
float * d_filteredData = nullptr;
cudaMalloc((void **)& d_filteredData,filteredDataLength * sizeof(float));
//将数据复制到设备
cudaMemcpy(d_data,h_data,dataLength * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_numerator,h_numerator,numeratorLength * sizeof(float),cudaMemcpyHostToDevice);
//启动内核
int threadsPerBlock = 256;
int blocksPerGrid =(filteredDataLength + threadsPerBlock - 1)/ threadsPerBlock;
filterData<<<<块blockPerGrid,threadsPerBlock>>>(d_data,d_numerator,d_filteredData,numeratorLength,filteredDataLength);
//将结果复制到主机
cudaMemcpy(h_filteredData,d_filteredData,filteredDataLength * sizeof(float),cudaMemcpyDeviceToHost);
//清理
cudaFree(d_data);
cudaFree(d_numerator);
cudaFree(d_filteredData);
//使用h_filteredData ...执行操作...
//清理一些更多
delete [] h_data;
delete [] h_filteredData;
delete [] h_filter;
}
过滤器工作,但是由于我刚接触CUDA编程, m不知道如何优化它。
我看到的一个小问题是 dataLength
, filteredDataLength
和 numeratorLength
在应用程序中我并不知道我打算使用过滤器。另外,即使 dataLength
是上面代码中 32
的倍数,不能保证在最终应用程序中。
当我将上面的代码与ArrayFire进行比较时,我的代码需要花费大约三倍的时间来执行。
有没有人对如何加快速度有任何想法? p>
编辑:已将所有 filterLength 更改为 numeratorLength
p>
您正在尝试通过直接评估通过CUDA内核的1D卷积来计算滤波器输出。
在滤波器脉冲响应持续时间为长的情况下,可以做一件事来评估滤波后的输入,使用FFT的共轭域。下面我使用CUDA Thrust和cuFFT库报告示例代码。它是在
让我否认使用此代码可能有一些优化,但我宁愿离开因为它是,因此它可以更容易地与其Matlab的对应比较。
#include< stdio.h>
#include< math.h>
#include< cufft.h>
#include< thrust\device_vector.h>
#include< thrust \sequence.h>
#define pi_f 3.14159265358979f //单精度的希腊字符
/ **************** /
/ * SIN OPERATOR * /
/ **************** /
class sin_op {
float fk_,Fs_;
public:
sin_op(float fk,float Fs){fk_ = fk; Fs_ = Fs; }
__host__ __device__ float operator()(float x)const {return sin(2.f * pi_f * x * fk_ / Fs_); }
};
/ ***************** /
/ * SINC操作员* /
/ ******** ********* /
class sinc_op {
float fc_,Fs_;
public:
sinc_op(float fc,float Fs){fc_ = fc; Fs_ = Fs; }
__host__ __device__ float operator()(float x)const
{
if(x == 0)return(2.f * fc_ / Fs_);
else return(2.f * fc_ / Fs _)* sin(2.f * pi_f * fc_ * x / Fs _)/(2.f * pi_f * fc_ * x / Fs_)
}
};
/ ******************** /
/ * HAMMING OPERATOR * /
/ ***** *************** /
class hamming_op {
int L_;
public:
hamming_op(int L){L_ = L; }
__host__ __device__ float operator()(int x)const
{
return 0.54-0.46 * cos(2.f * pi_f * x /(L_-1)) ;
}
};
/ ********************************* /
/ * MULTIPLY CUFFTCOMPLEX NUMBERS * /
/ ********************************* /
struct multiply_cufftComplex {
__device__ cufftComplex operator()(const cufftComplex& a,const cufftComplex& b)const {
cufftComplex r;
r.x = a.x * b.x - a.y * b.y;
r.y = a.x * b.y + a.y * b.x;
return r;
}
};
/ ******** /
/ * MAIN * /
/ ******** /
void main(){
//信号参数:
int M = 256; //信号长度
const int N = 4;
float f [N] = {440,880,1000,2000}; // frequencies
float Fs = 5000 .; // sampling rate
//通过加上sinusoids生成一个信号:
thrust :: device_vector< float> d_x(M,0.f); // pre-allocate'accumulator'
thrust :: device_vector< float> d_n(M); //离散时间网格
thrust :: sequence(d_n.begin(),d_n.end(),0,1);
thrust :: device_vector< float> d_temp(M);
for(int i = 0; i float fk = f [i];
thrust :: transform(d_n.begin(),d_n.end(),d_temp.begin(),sin_op(fk,Fs));
thrust :: transform(d_temp.begin(),d_temp.end(),d_x.begin(),d_x.begin(),thrust :: plus< float>());
}
//过滤参数:
int L = 257; // filter length
float fc = 600.f; // cutoff frequency
//使用窗口方法设计过滤器:
thrust :: device_vector< float> d_hsupp(L);
thrust :: sequence(d_hsupp.begin(),d_hsupp.end(), - (L-1)/ 2,1);
thrust :: device_vector< float> d_hideal(L);
thrust :: transform(d_hsupp.begin(),d_hsupp.end(),d_hideal.begin(),sinc_op(fc,Fs));
thrust :: device_vector< float> d_l(L);
thrust :: sequence(d_l.begin(),d_l.end(),0,1);
thrust :: device_vector< float> d_h(L);
thrust :: transform(d_l.begin(),d_l.end(),d_h.begin(),hamming_op(L));
// h是我们的过滤器
thrust :: transform(d_hideal.begin(),d_hideal.end(),d_h.begin(),d_h.begin(),thrust :: multiplies& ());
// ---选择大于L + M-1的下一个幂2
int Nfft = pow(2,(ceil(log2((float)(L + M- 1))))); //或2 ^ nextpow2(L + M-1)
//零填充信号和脉冲响应:
thrust :: device_vector< float> d_xzp(Nfft,0.f);
thrust :: device_vector< float> d_hzp(Nfft,0.f);
thrust :: copy(d_x.begin(),d_x.end(),d_xzp.begin());
thrust :: copy(d_h.begin(),d_h.end(),d_hzp.begin());
//转换信号和过滤器:
cufftHandle plan;
cufftPlan1d(& plan,Nfft,CUFFT_R2C,1);
thrust :: device_vector< cufftComplex> d_X(Nfft / 2 + 1);
thrust :: device_vector< cufftComplex> d_H(Nfft / 2 + 1);
cufftExecR2C(plan,(cufftReal *)thrust :: raw_pointer_cast(d_xzp.data()),(cufftComplex *)thrust :: raw_pointer_cast(d_X.data()));
cufftExecR2C(plan,(cufftReal *)thrust :: raw_pointer_cast(d_hzp.data()),(cufftComplex *)thrust :: raw_pointer_cast(d_H.data()));
thrust :: device_vector< cufftComplex> d_Y(Nfft / 2 + 1);
thrust :: transform(d_X.begin(),d_X.end(),d_H.begin(),d_Y.begin(),multiply_cufftComplex());
cufftPlan1d(& plan,Nfft,CUFFT_C2R,1);
thrust :: device_vector< float> d_y(Nfft);
cufftExecC2R(plan,(cufftComplex *)thrust :: raw_pointer_cast(d_Y.data()),(cufftReal *)thrust :: raw_pointer_cast(d_y.data()));
getchar();
}
I'm trying to implement a FIR (Finite Impulse Response) filter in CUDA. My approach is quite simple and looks somewhat like this:
#include <cuda.h>
__global__ void filterData(const float *d_data,
const float *d_numerator,
float *d_filteredData,
const int numeratorLength,
const int filteredDataLength)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
float sum = 0.0f;
if (i < filteredDataLength)
{
for (int j = 0; j < numeratorLength; j++)
{
// The first (numeratorLength-1) elements contain the filter state
sum += d_numerator[j] * d_data[i + numeratorLength - j - 1];
}
}
d_filteredData[i] = sum;
}
int main(void)
{
// (Skipping error checks to make code more readable)
int dataLength = 18042;
int filteredDataLength = 16384;
int numeratorLength= 1659;
// Pointers to data, filtered data and filter coefficients
// (Skipping how these are read into the arrays)
float *h_data = new float[dataLength];
float *h_filteredData = new float[filteredDataLength];
float *h_filter = new float[numeratorLength];
// Create device pointers
float *d_data = nullptr;
cudaMalloc((void **)&d_data, dataLength * sizeof(float));
float *d_numerator = nullptr;
cudaMalloc((void **)&d_numerator, numeratorLength * sizeof(float));
float *d_filteredData = nullptr;
cudaMalloc((void **)&d_filteredData, filteredDataLength * sizeof(float));
// Copy data to device
cudaMemcpy(d_data, h_data, dataLength * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_numerator, h_numerator, numeratorLength * sizeof(float), cudaMemcpyHostToDevice);
// Launch the kernel
int threadsPerBlock = 256;
int blocksPerGrid = (filteredDataLength + threadsPerBlock - 1) / threadsPerBlock;
filterData<<<blocksPerGrid,threadsPerBlock>>>(d_data, d_numerator, d_filteredData, numeratorLength, filteredDataLength);
// Copy results to host
cudaMemcpy(h_filteredData, d_filteredData, filteredDataLength * sizeof(float), cudaMemcpyDeviceToHost);
// Clean up
cudaFree(d_data);
cudaFree(d_numerator);
cudaFree(d_filteredData);
// Do stuff with h_filteredData...
// Clean up some more
delete [] h_data;
delete [] h_filteredData;
delete [] h_filter;
}
The filter works, but as I'm new to CUDA programming and I'm not sure how to optimize it.
A slight problem that I see is that dataLength
, filteredDataLength
, and numeratorLength
are not known before hand in the application I intend to use the filter in. Also, even though dataLength
is a multiple of 32
in the above code, it is not guaranteed to be that in the final application.
When I compare my code above to ArrayFire, my code takes about three times longer to execute.
Does anyone have any ideas on how to speed things up?
EDIT: Have changed all filterLength
to numeratorLength
.
You are attempting at calculating the filter output by directly evaluating the 1D convolution through a CUDA kernel.
In the case when the filter impulse response duration is long, one thing you can do to evaluate the filtered input is performing the calculations directly in the conjugate domain using FFTs. Below I'm reporting a sample code using CUDA Thrust and the cuFFT library. It is a direct translation of the Matlab-based example reported at
Low-Pass Filtering by FFT Convolution
Let me disclaim that some optimizations are possible with this code, but I preferred to leave it as it is so that it could be more easily compared to its Matlab's counterpart.
#include <stdio.h>
#include <math.h>
#include <cufft.h>
#include <thrust\device_vector.h>
#include <thrust\sequence.h>
#define pi_f 3.14159265358979f // Greek pi in single precision
/****************/
/* SIN OPERATOR */
/****************/
class sin_op {
float fk_, Fs_;
public:
sin_op(float fk, float Fs) { fk_ = fk; Fs_ = Fs; }
__host__ __device__ float operator()(float x) const { return sin(2.f*pi_f*x*fk_/Fs_); }
};
/*****************/
/* SINC OPERATOR */
/*****************/
class sinc_op {
float fc_, Fs_;
public:
sinc_op(float fc, float Fs) { fc_ = fc; Fs_ = Fs; }
__host__ __device__ float operator()(float x) const
{
if (x==0) return (2.f*fc_/Fs_);
else return (2.f*fc_/Fs_)*sin(2.f*pi_f*fc_*x/Fs_)/(2.f*pi_f*fc_*x/Fs_);
}
};
/********************/
/* HAMMING OPERATOR */
/********************/
class hamming_op {
int L_;
public:
hamming_op(int L) { L_ = L; }
__host__ __device__ float operator()(int x) const
{
return 0.54-0.46*cos(2.f*pi_f*x/(L_-1));
}
};
/*********************************/
/* MULTIPLY CUFFTCOMPLEX NUMBERS */
/*********************************/
struct multiply_cufftComplex {
__device__ cufftComplex operator()(const cufftComplex& a, const cufftComplex& b) const {
cufftComplex r;
r.x = a.x * b.x - a.y * b.y;
r.y = a.x * b.y + a.y * b.x;
return r;
}
};
/********/
/* MAIN */
/********/
void main(){
// Signal parameters:
int M = 256; // signal length
const int N = 4;
float f[N] = { 440, 880, 1000, 2000 }; // frequencies
float Fs = 5000.; // sampling rate
// Generate a signal by adding up sinusoids:
thrust::device_vector<float> d_x(M,0.f); // pre-allocate 'accumulator'
thrust::device_vector<float> d_n(M); // discrete-time grid
thrust::sequence(d_n.begin(), d_n.end(), 0, 1);
thrust::device_vector<float> d_temp(M);
for (int i=0; i<N; i++) {
float fk = f[i];
thrust::transform(d_n.begin(), d_n.end(), d_temp.begin(), sin_op(fk,Fs));
thrust::transform(d_temp.begin(), d_temp.end(), d_x.begin(), d_x.begin(), thrust::plus<float>());
}
// Filter parameters:
int L = 257; // filter length
float fc = 600.f; // cutoff frequency
// Design the filter using the window method:
thrust::device_vector<float> d_hsupp(L);
thrust::sequence(d_hsupp.begin(), d_hsupp.end(), -(L-1)/2, 1);
thrust::device_vector<float> d_hideal(L);
thrust::transform(d_hsupp.begin(), d_hsupp.end(), d_hideal.begin(), sinc_op(fc,Fs));
thrust::device_vector<float> d_l(L);
thrust::sequence(d_l.begin(), d_l.end(), 0, 1);
thrust::device_vector<float> d_h(L);
thrust::transform(d_l.begin(), d_l.end(), d_h.begin(), hamming_op(L));
// h is our filter
thrust::transform(d_hideal.begin(), d_hideal.end(), d_h.begin(), d_h.begin(), thrust::multiplies<float>());
// --- Choose the next power of 2 greater than L+M-1
int Nfft = pow(2,(ceil(log2((float)(L+M-1))))); // or 2^nextpow2(L+M-1)
// Zero pad the signal and impulse response:
thrust::device_vector<float> d_xzp(Nfft,0.f);
thrust::device_vector<float> d_hzp(Nfft,0.f);
thrust::copy(d_x.begin(), d_x.end(), d_xzp.begin());
thrust::copy(d_h.begin(), d_h.end(), d_hzp.begin());
// Transform the signal and the filter:
cufftHandle plan;
cufftPlan1d(&plan, Nfft, CUFFT_R2C, 1);
thrust::device_vector<cufftComplex> d_X(Nfft/2+1);
thrust::device_vector<cufftComplex> d_H(Nfft/2+1);
cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_xzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_X.data()));
cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_hzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_H.data()));
thrust::device_vector<cufftComplex> d_Y(Nfft/2+1);
thrust::transform(d_X.begin(), d_X.end(), d_H.begin(), d_Y.begin(), multiply_cufftComplex());
cufftPlan1d(&plan, Nfft, CUFFT_C2R, 1);
thrust::device_vector<float> d_y(Nfft);
cufftExecC2R(plan, (cufftComplex*)thrust::raw_pointer_cast(d_Y.data()), (cufftReal*)thrust::raw_pointer_cast(d_y.data()));
getchar();
}
这篇关于CUDA中的FIR滤波器(作为1D卷积)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!