问题描述
以下代码分别使用 boost.compute 和 OpenCL C++ 封装实现两个向量相加。结果表明 boost.compute 比 OpenCL C++ 封装慢近 20 倍。我不确定是我误用了 boost.compute,还是它确实就这么慢。
平台:Win7,VS2013,Boost 1.55,Boost.Compute 0.2,ATI Radeon HD 4600
code采用C ++包装:
的#define __CL_ENABLE_EXCEPTIONS
#包括LT&; CL / cl.hpp>
#包括LT&;升压/定时器/ timer.hpp>
#包括LT&;提升/ smart_ptr / scoped_array.hpp>
#包括LT&;&的fstream GT;
#包括LT&;&数字GT;
#包括LT&;&算法GT;
的#include<功能>诠释主(){
静态字符kernelSource code [] =\\
__kernel无效VADD(__全球为int *一,__global为int * B,__global为int * C){\\
为size_t I = get_global_id(0); \\
\\
C [i] = A [I] + B [I]; \\
} \\
使用类型=提高:: scoped_array< INT取代;
为size_t常量BUFFER_SIZE = 1UL<< 13;
A型(新INT [BUFFER_SIZE]);
B型(新INT [BUFFER_SIZE]);
C型(新INT [BUFFER_SIZE]); 性病::丝毫(A.get(),A.get()+ BUFFER_SIZE,0);
性病::变换(A.get(),A.get()+ BUFFER_SIZE,B.get()的std ::绑定(的std ::乘< INT>()的std ::占位符:: _ 1,2) ); 尝试{
的std ::矢量< CL ::平台> platformList;
//选择平台
CL ::平台::得到(安培; platformList);
//选择第一平台
cl_context_properties cprops [] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platformList [0])(),
0
};
CL ::上下文的背景下(CL_DEVICE_TYPE_GPU,cprops);
//查询集附加到上下文设备
的std ::矢量< CL ::元件>设备= context.getInfo< CL_CONTEXT_DEVICES>();
//创建命令队列
CL :: CommandQueue队列(上下文,设备[0],0);
//创建从源代码程序
CL ::项目::源源(
1,
的std :: make_pair(kernelSource code,0)
);
CL ::计划程序(背景下,资源);
//生成程序
program.build(装置);
// A和主机复制内容创建缓冲区
CL ::缓冲区aBuffer = CL ::缓冲区(
的背景下,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
BUFFER_SIZE *的sizeof(INT)
(无效*)及A [0]);
//创建缓冲液B和复制主机内容
CL ::缓冲区bBuffer = CL ::缓冲区(
的背景下,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
BUFFER_SIZE *的sizeof(INT)
(无效*)及B [0]);
//创建一个使用主机PTR C缓冲区
CL ::缓冲区cBuffer = CL ::缓冲区(
的背景下,
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
BUFFER_SIZE *的sizeof(INT)
(无效*)和C [0]);
//创建内核对象
CL ::内核内核(节目VADD);
//设置内核ARGS
kernel.setArg(0,aBuffer);
kernel.setArg(1,bBuffer);
kernel.setArg(2,cBuffer);
//做的工作
void *的输出;
{
提高::计时器:: auto_cpu_timer定时器;
queue.enqueueNDRangeKernel(
核心,
CL :: NullRange,
CL :: NDRange(BUFFER_SIZE)
CL :: NullRange
);
输出=(INT *)queue.enqueueMapBuffer(
cBuffer,
CL_TRUE,//块
CL_MAP_READ,
0,
BUFFER_SIZE *的sizeof(INT)
);
}
的std :: GPU的ofstream(gpu.txt);
的for(int i = 0; I< BUFFER_SIZE;我++){
GPU<< C [1] - ;&下; ;
}
queue.enqueueUnmapMemObject(
cBuffer,
输出);
}
赶上(CL ::错误常量和放大器; ERR){
的std :: CERR<< err.what()&所述;&下; \\ n;
} 返回EXIT_SUCCESS;
}
code使用boost.compute:
的#include<升压/计算/集装箱/ mapped_view.hpp>
#包括LT&;升压/计算/算法/ transform.hpp>
#包括LT&;升压/计算/功能/ operator.hpp>
#包括LT&;&数字GT;
#包括LT&;&算法GT;
的#include<功能>
#包括LT&;升压/定时器/ timer.hpp>
#包括LT&;提升/ smart_ptr / scoped_array.hpp>
#包括LT&;&的fstream GT;
#包括LT&;升压/元组/ tuple_comparison.hpp> 诠释主(){
为size_t常量BUFFER_SIZE = 1UL<< 13;
提高:: scoped_array< INT> (新INT [BUFFER_SIZE]),B(新INT [BUFFER_SIZE]),C(新INT [BUFFER_SIZE]); 性病::丝毫(A.get(),A.get()+ BUFFER_SIZE,0);
性病::变换(A.get(),A.get()+ BUFFER_SIZE,B.get()的std ::绑定(的std ::乘< INT>()的std ::占位符:: _ 1,2) ); 尝试{
如果(::提高::计算系统:: default_device()。类型()!= CL_DEVICE_TYPE_GPU){
的std :: CERR<< 不是GPU \\ N的;
}
其他{
提高::计算:: command_queue队列=的boost ::计算::系统:: DEFAULT_QUEUE();
提高::计算:: mapped_view<&诠释GT; MA(的static_cast< const int的*>(A.get()),BUFFER_SIZE)
MB(的static_cast< const int的*>(B.get()),BUFFER_SIZE);
提高::计算:: mapped_view<&诠释GT; MC(C.get(),BUFFER_SIZE);
{
提高::计时器:: auto_cpu_timer定时器;
提高::计算::变换(
mA.cbegin(),mA.cend(),
mB.cbegin(),
mC.begin(),
提高::计算::加<&诠释GT;()
队列
);
mC.map(CL_MAP_READ,队列);
}
的std :: GPU的ofstream(gpu.txt);
用于(为size_t我!= 0; i = BUFFER_SIZE; ++ I)GPU<< C [1] - ;&下; ;
mC.unmap(队列);
}
}
赶上(升压::计算:: opencl_error常量和放大器; ERR){
的std :: CERR<< err.what()&所述;&下; \\ n;
} 返回EXIT_SUCCESS;
}
Boost.Compute 中 transform() 函数生成的内核代码应当与你在 C++ 封装版本中使用的内核代码几乎相同(尽管 Boost.Compute 会做一些展开)。
你看到计时差异的原因是:在第一个版本中,你只测量了把内核排入队列以及把结果映射回主机所花的时间;而在 Boost.Compute 版本中,你还测量了创建 transform() 内核、编译它、然后执行它所花的时间。如果想要更现实的比较,你应当测量第一个示例的总执行时间,包括建立并编译 OpenCL 程序所需的时间。
这种初始化开销(它是 OpenCL 运行时编译模型所固有的)在 Boost.Compute 中有所缓解:它会在运行时自动缓存已编译的内核(也可以选择离线缓存它们,供程序下次运行时复用)。因此在第一次调用之后,多次调用 transform() 会快得多。
附注:你也可以只使用 Boost.Compute 中的核心封装类(如 device 和 context),连同容器类(如 vector&lt;T&gt;),仍然运行你自己的自定义内核。
The following codes add two vectors using boost.compute and opencl c++ wrapper respectively. The result shows boost.compute is almost 20 times slower than the opencl c++ wrapper. I wonder if I miss use boost.compute or it is indeed slow.Platform: win7, vs2013, boost 1.55, boost.compute 0.2, ATI Radeon HD 4600
Code uses the c++ wrapper:
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <boost/smart_ptr/scoped_array.hpp>
#include <boost/timer/timer.hpp>
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <numeric>
// Vector addition (c = a + b) on the GPU via the OpenCL C++ wrapper API.
// The timer below deliberately measures ONLY kernel enqueue + result map;
// program build/compile time is excluded (this is why this version appears
// much faster than the Boost.Compute equivalent, which compiles its kernel
// inside the timed region on first use).
int main(){
    // OpenCL C source for a trivial element-wise add kernel.
    static char kernelSourceCode[] = "\
__kernel void vadd(__global int * a, __global int * b, __global int * c){\
size_t i = get_global_id(0);\
\
c[i] = a[i] + b[i];\
}\
";
    using type = boost::scoped_array<int>;
    size_t const BUFFER_SIZE = 1UL << 13;
    type A(new int[BUFFER_SIZE]);
    type B(new int[BUFFER_SIZE]);
    type C(new int[BUFFER_SIZE]);
    // A = 0,1,2,...; B = 2*A — so the expected device output is C = 3*A.
    std::iota(A.get(), A.get() + BUFFER_SIZE, 0);
    std::transform(A.get(), A.get() + BUFFER_SIZE, B.get(),
                   std::bind(std::multiplies<int>(), std::placeholders::_1, 2));
    try {
        std::vector<cl::Platform> platformList;
        // Pick platform
        cl::Platform::get(&platformList);
        // Use the first platform found.
        cl_context_properties cprops[] = {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)(platformList[0])(),
            0
        };
        cl::Context context(CL_DEVICE_TYPE_GPU, cprops);
        // Query the set of devices attached to the context.
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
        // In-order command queue on the first GPU device.
        cl::CommandQueue queue(context, devices[0], 0);
        // Length 0 tells clCreateProgramWithSource the string is
        // null-terminated.
        cl::Program::Sources sources(
            1,
            std::make_pair(kernelSourceCode, 0)
        );
        cl::Program program(context, sources);
        // Build program (NOT included in the timed region below).
        program.build(devices);
        // Device buffer for A, initialized by copying the host contents.
        cl::Buffer aBuffer = cl::Buffer(
            context,
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            BUFFER_SIZE * sizeof(int),
            (void *)&A[0]);
        // Device buffer for B, initialized by copying the host contents.
        cl::Buffer bBuffer = cl::Buffer(
            context,
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            BUFFER_SIZE * sizeof(int),
            (void *)&B[0]);
        // Output buffer backed by the host array C (USE_HOST_PTR), so the
        // mapped results below are readable through C directly.
        cl::Buffer cBuffer = cl::Buffer(
            context,
            CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
            BUFFER_SIZE * sizeof(int),
            (void *)&C[0]);
        cl::Kernel kernel(program, "vadd");
        kernel.setArg(0, aBuffer);
        kernel.setArg(1, bBuffer);
        kernel.setArg(2, cBuffer);
        // Timed region: enqueue the kernel and block on mapping the result.
        void *output = nullptr;
        {
            boost::timer::auto_cpu_timer timer;
            queue.enqueueNDRangeKernel(
                kernel,
                cl::NullRange,
                cl::NDRange(BUFFER_SIZE),
                cl::NullRange
            );
            output = queue.enqueueMapBuffer(
                cBuffer,
                CL_TRUE, // blocking map — kernel has finished when this returns
                CL_MAP_READ,
                0,
                BUFFER_SIZE * sizeof(int)
            );
        }
        std::ofstream gpu("gpu.txt");
        // size_t loop index: BUFFER_SIZE is size_t, avoids the
        // signed/unsigned comparison of the original `int i`.
        for (size_t i = 0; i < BUFFER_SIZE; i++) {
            gpu << C[i] << " ";
        }
        queue.enqueueUnmapMemObject(
            cBuffer,
            output);
    }
    catch (cl::Error const &err) {
        std::cerr << err.what() << "\n";
    }
    return EXIT_SUCCESS;
}
Code uses boost.compute:
#include <boost/compute/container/mapped_view.hpp>
#include <boost/compute/algorithm/transform.hpp>
#include <boost/compute/functional/operator.hpp>
#include <numeric>
#include <algorithm>
#include <functional>
#include <boost/timer/timer.hpp>
#include <boost/smart_ptr/scoped_array.hpp>
#include <fstream>
#include <boost/tuple/tuple_comparison.hpp>
int main(){
size_t const BUFFER_SIZE = 1UL << 13;
boost::scoped_array<int> A(new int[BUFFER_SIZE]), B(new int[BUFFER_SIZE]), C(new int[BUFFER_SIZE]);
std::iota(A.get(), A.get() + BUFFER_SIZE, 0);
std::transform(A.get(), A.get() + BUFFER_SIZE, B.get(), std::bind(std::multiplies<int>(), std::placeholders::_1, 2));
try{
if (boost::compute::system::default_device().type() != CL_DEVICE_TYPE_GPU){
std::cerr << "Not GPU\n";
}
else{
boost::compute::command_queue queue = boost::compute::system::default_queue();
boost::compute::mapped_view<int> mA(static_cast<const int*>(A.get()), BUFFER_SIZE),
mB(static_cast<const int*>(B.get()), BUFFER_SIZE);
boost::compute::mapped_view<int> mC(C.get(), BUFFER_SIZE);
{
boost::timer::auto_cpu_timer timer;
boost::compute::transform(
mA.cbegin(), mA.cend(),
mB.cbegin(),
mC.begin(),
boost::compute::plus<int>(),
queue
);
mC.map(CL_MAP_READ, queue);
}
std::ofstream gpu("gpu.txt");
for (size_t i = 0; i != BUFFER_SIZE; ++i) gpu << C[i] << " ";
mC.unmap(queue);
}
}
catch (boost::compute::opencl_error const &err){
std::cerr << err.what() << "\n";
}
return EXIT_SUCCESS;
}
The kernel code generated by the transform()
function in Boost.Compute should be almost identical to the kernel code you use in the C++ wrapper version (though Boost.Compute will do some unrolling).
The reason you see a difference in timings is that in the first version you are only measuring the time it takes to enqueue the kernel and map the results back to the host. In the Boost.Compute version you are also measuring the amount of time it takes to create the transform()
kernel, compile it, and then execute it. If you want a more realistic comparison you should measure the total execution time for the first example including the time it takes to set up and compile the OpenCL program.
This initialization penalty (which is inherent in OpenCL's run-time compilation model) is somewhat mitigated in Boost.Compute by automatically caching compiled kernels during run-time (and also optionally caching them offline for reuse the next time the program is run). Calling transform()
multiple times will be much faster after the first invocation.
P.S. You can also just use the core wrapper classes in Boost.Compute (like device
and context
) along with the container classes (like vector<T>
) and still run your own custom kernels.
这篇关于性能对比(boost.compute 与 OpenCL C++ 封装)的文章就介绍到这里了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!