问题描述
我喜欢 D 的某些特性,但想知道这些特性是否会带来
运行时开销?
为了比较,我分别用 C++ 和 D 实现了一个简单的程序,用于计算许多短向量的标量积。结果令人惊讶:
- D:18.9 s [最终运行时间见下文]
- C ++:3.8 s
C++ 真的快了将近五倍,还是我在 D
程序中犯了错误?
我用 g++ -O3(gcc-snapshot 2011-02-19)编译 C++,用 dmd -O(dmd 2.052)编译 D,运行环境是一台较新的 Linux 桌面机。多次运行的结果可以重现,标准差可以忽略不计。
这里是C ++程序:
#include <iostream>
#include <random>
#include <chrono>
#include <string>
#include <vector>
#include <array>
typedef std::chrono::duration<long, std::ratio<1, 1000>> millisecs;

/// Returns the whole milliseconds elapsed since `time` and resets `time` to
/// the current instant, so consecutive calls measure back-to-back intervals.
///
/// @tparam Clock  clock of the time point (deduced); its now() is sampled.
/// @param  time   start of the interval; overwritten with the sampled instant.
/// @return elapsed time in milliseconds, truncated toward zero.
template <typename Clock>
long time_since(std::chrono::time_point<Clock>& time) {
    // Sample the clock once: the same instant ends this interval and starts
    // the next one, so no time is lost between two separate now() calls.
    const auto now = Clock::now();
    const long elapsed = std::chrono::duration_cast<millisecs>(now - time).count();
    time = now;
    return elapsed;
}
// Number of vectors and length of each vector.
const long N = 20000;
const int size = 10;

// Element type, accumulator type for the grand total, vector and index types.
typedef int value_type;
typedef long long result_type;
typedef std::vector<value_type> vector_t;
typedef typename vector_t::size_type size_type;

/// Scalar (dot) product of x and y, accumulated in value_type.
/// Iterates over x.size() elements; y is read at the same indices.
inline value_type scalar_product(const vector_t& x, const vector_t& y) {
    value_type res = 0;
    size_type siz = x.size();
    for (size_type i = 0; i < siz; ++i)
        res += x[i] * y[i];
    return res;
}
int main(){
auto tm_before = std :: chrono :: system_clock :: now();
// 1.随机分配并填充许多短向量
vector_t * xs = new vector_t [N];
for(int i = 0; i xs [i] = vector_t(size);
}
std :: cerr<< allocation:< time_since(tm_before)<< ms< std :: endl;
std :: mt19937 rnd_engine;
std :: uniform_int_distribution< value_type> runif_gen(-1000,1000);
for(int i = 0; i for(int j = 0; j xs [i] [j ] = runif_gen(rnd_engine);
std :: cerr<< 随机生成:< time_since(tm_before)<< ms< std :: endl;
// 2.计算所有成对标量积:
time_since(tm_before);
result_type avg = 0;
for(int i = 0; i for(int j = 0; j avg + = scalar_product [i],xs [j]);
avg = avg / N * N;
auto time = time_since(tm_before);
std :: cout<< result:< avg<< std :: endl;
std :: cout<< time:<<时间< ms< std :: endl;
}
此处的D版本:
import std.stdio;
import std.datetime;
import std.random;
// Number of vectors and length of each vector.
const long N = 20000;
const int size = 10;
alias int value_type;
alias long result_type;
alias value_type[] vector_t;
// Array lengths are size_t; a fixed `uint` cannot receive x.length on 64-bit
// targets because the implicit narrowing is rejected by the compiler.
alias size_t size_type;

/// Scalar (dot) product of x and y, accumulated in value_type.
/// Iterates over x.length elements; y is read at the same indices.
value_type scalar_product(const ref vector_t x, const ref vector_t y) {
    value_type res = 0;
    size_type siz = x.length;
    for (size_type i = 0; i < siz; ++i)
        res += x[i] * y[i];
    return res;
}
/// Benchmark driver: allocates N arrays of `size` elements, fills them with
/// uniform random integers in [-1000, 1000], sums the scalar products of all
/// N*N ordered pairs and prints their mean together with the elapsed time.
int main() {
    auto tm_before = Clock.currTime();

    // 1. allocate and fill randomly many short vectors
    vector_t[] xs;
    xs.length = N;
    for (int i = 0; i < N; ++i) {
        xs[i].length = size;
    }
    // %s prints the Duration through its toString; "%i" is not a valid D
    // format specifier and makes writefln throw at runtime.
    writefln("allocation: %s ", (Clock.currTime() - tm_before));
    tm_before = Clock.currTime();

    for (int i = 0; i < N; ++i)
        for (int j = 0; j < size; ++j)
            // Closed interval, matching the C++ version's
            // uniform_int_distribution(-1000, 1000); the default is "[)".
            xs[i][j] = uniform!"[]"(-1000, 1000);
    writefln("random: %s ", (Clock.currTime() - tm_before));
    tm_before = Clock.currTime();

    // 2. compute all pairwise scalar products:
    result_type avg = cast(result_type) 0;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            avg += scalar_product(xs[i], xs[j]);
    // Mean over all N*N pairs. `avg / N*N` would parse as `(avg / N) * N`.
    avg = avg / (N * N);
    // Stop the timer before printing so output I/O is not counted.
    auto time = Clock.currTime() - tm_before;
    writefln("result: %d", avg);
    writefln("scalar products: %s ", time);
    return 0;
}
要启用所有优化并关闭所有安全检查,请使用以下 DMD 标志编译 D 程序:
-O -inline -release -noboundscheck
编辑:我已经用 g++、dmd 和 gdc 试过您的程序。dmd 确实落后,但 gdc 的性能非常接近 g++。我使用的命令行是 gdmd -O -release -inline
(gdmd是gdc的一个包装,接受dmd选项)。
查看汇编代码清单,dmd 和 gdc 似乎都没有内联 scalar_product
,但 g++/gdc 确实生成了 MMX 指令,因此它们可能对循环做了自动向量化。
I like some features of D, but would be interested if they come with a
runtime penalty?
To compare, I implemented a simple program that computes scalar products of many short vectors both in C++ and in D. The result is surprising:
- D: 18.9 s [see below for final runtime]
- C++: 3.8 s
Is C++ really almost five times as fast or did I make a mistake in the D
program?
I compiled C++ with g++ -O3 (gcc-snapshot 2011-02-19) and D with dmd -O (dmd 2.052) on a moderately recent Linux desktop. The results are reproducible over several runs, and the standard deviations are negligible.
Here the C++ program:
#include <iostream>
#include <random>
#include <chrono>
#include <string>
#include <vector>
#include <array>
typedef std::chrono::duration<long, std::ratio<1, 1000>> millisecs;
/// Returns the whole milliseconds elapsed since `time` and resets `time` to
/// the current instant, so consecutive calls measure back-to-back intervals.
///
/// @tparam Clock  clock of the time point (deduced); its now() is sampled.
/// @param  time   start of the interval; overwritten with the sampled instant.
/// @return elapsed time in milliseconds, truncated toward zero.
template <typename Clock>
long time_since(std::chrono::time_point<Clock>& time) {
    // Sample the clock once: the same instant ends this interval and starts
    // the next one, so no time is lost between two separate now() calls.
    const auto now = Clock::now();
    const auto elapsed =
        std::chrono::duration_cast<std::chrono::milliseconds>(now - time);
    time = now;
    return static_cast<long>(elapsed.count());
}
// Number of vectors and length of each vector.
constexpr long N = 20000;
constexpr int size = 10;

// Element type, accumulator type for the grand total, vector and index types.
using value_type = int;
using result_type = long long;
using vector_t = std::vector<value_type>;
using size_type = vector_t::size_type;

/// Scalar (dot) product of x and y, accumulated in value_type.
/// Iterates over x.size() elements; y is read at the same indices.
inline value_type scalar_product(const vector_t& x, const vector_t& y) {
    value_type total = 0;
    const size_type count = x.size();
    size_type k = 0;
    while (k != count) {
        total += x[k] * y[k];
        ++k;
    }
    return total;
}
int main() {
auto tm_before = std::chrono::system_clock::now();
// 1. allocate and fill randomly many short vectors
vector_t* xs = new vector_t [N];
for (int i = 0; i < N; ++i) {
xs[i] = vector_t(size);
}
std::cerr << "allocation: " << time_since(tm_before) << " ms" << std::endl;
std::mt19937 rnd_engine;
std::uniform_int_distribution<value_type> runif_gen(-1000, 1000);
for (int i = 0; i < N; ++i)
for (int j = 0; j < size; ++j)
xs[i][j] = runif_gen(rnd_engine);
std::cerr << "random generation: " << time_since(tm_before) << " ms" << std::endl;
// 2. compute all pairwise scalar products:
time_since(tm_before);
result_type avg = 0;
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; ++j)
avg += scalar_product(xs[i], xs[j]);
avg = avg / N*N;
auto time = time_since(tm_before);
std::cout << "result: " << avg << std::endl;
std::cout << "time: " << time << " ms" << std::endl;
}
And here the D version:
import std.stdio;
import std.datetime;
import std.random;
// Number of vectors and length of each vector.
const long N = 20000;
const int size = 10;
alias int value_type;
alias long result_type;
alias value_type[] vector_t;
// Array lengths are size_t; a fixed `uint` cannot receive x.length on 64-bit
// targets because the implicit narrowing is rejected by the compiler.
alias size_t size_type;

/// Scalar (dot) product of x and y, accumulated in value_type.
/// Iterates over x.length elements; y is read at the same indices.
value_type scalar_product(const ref vector_t x, const ref vector_t y) {
    value_type res = 0;
    size_type siz = x.length;
    for (size_type i = 0; i < siz; ++i)
        res += x[i] * y[i];
    return res;
}
/// Benchmark driver: allocates N arrays of `size` elements, fills them with
/// uniform random integers in [-1000, 1000], sums the scalar products of all
/// N*N ordered pairs and prints their mean together with the elapsed time.
int main() {
    auto tm_before = Clock.currTime();

    // 1. allocate and fill randomly many short vectors
    vector_t[] xs;
    xs.length = N;
    for (int i = 0; i < N; ++i) {
        xs[i].length = size;
    }
    // %s prints the Duration through its toString; "%i" is not a valid D
    // format specifier and makes writefln throw at runtime.
    writefln("allocation: %s ", (Clock.currTime() - tm_before));
    tm_before = Clock.currTime();

    for (int i = 0; i < N; ++i)
        for (int j = 0; j < size; ++j)
            // Closed interval, matching the C++ version's
            // uniform_int_distribution(-1000, 1000); the default is "[)".
            xs[i][j] = uniform!"[]"(-1000, 1000);
    writefln("random: %s ", (Clock.currTime() - tm_before));
    tm_before = Clock.currTime();

    // 2. compute all pairwise scalar products:
    result_type avg = cast(result_type) 0;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            avg += scalar_product(xs[i], xs[j]);
    // Mean over all N*N pairs. `avg / N*N` would parse as `(avg / N) * N`.
    avg = avg / (N * N);
    // Stop the timer before printing so output I/O is not counted.
    auto time = Clock.currTime() - tm_before;
    writefln("result: %d", avg);
    writefln("scalar products: %s ", time);
    return 0;
}
解决方案 To enable all optimizations and disable all safety checks, compile your D program with the following DMD flags:
-O -inline -release -noboundscheck
EDIT: I've tried your programs with g++, dmd and gdc. dmd does lag behind, but gdc achieves performance very close to g++. The commandline I used was gdmd -O -release -inline
(gdmd is a wrapper around gdc which accepts dmd options).
Looking at the assembler listing, it looks like neither dmd nor gdc inlined scalar_product
, but g++/gdc did emit MMX instructions, so they might be auto-vectorizing the loop.
这篇关于D与C ++相比有多快?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!