本文介绍了D与C ++相比有多快?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我喜欢 D 的某些特性,但想知道它们是否会带来运行时开销?



为了比较,我分别用 C++ 和 D 实现了一个简单的程序,用来计算许多短向量的标量积。结果令人惊讶:




  • D:18.9 s [最终运行时间见下文]

  • C ++:3.8 s



C++ 真的快了将近五倍吗?还是我在 D 程序中犯了什么错误?



我在一台较新的 Linux 台式机上用 g++ -O3(gcc-snapshot 2011-02-19)编译 C++,用 dmd -O(dmd 2.052)编译 D。多次运行的结果可以复现,标准差可以忽略不计。



这里是C ++程序:

  #include <iostream>
#include <random>
#include <chrono>
#include <string>

#include <vector>
#include <array>

// Millisecond duration with a `long` tick count, used for all timings below.
typedef std::chrono::duration<long, std::ratio<1, 1000>> millisecs;

// Returns the whole milliseconds elapsed since `time` and resets `time` to
// the current instant. The current time is read from the time point's own
// clock, and only once, so consecutive intervals have no gap between them.
template <typename Clock>
long time_since(std::chrono::time_point<Clock>& time) {
  const auto now = Clock::now();
  const long elapsed_ms = std::chrono::duration_cast<millisecs>(now - time).count();
  time = now;
  return elapsed_ms;
}

// Problem size: N vectors, each holding `size` elements.
const long N = 20000;
const int size = 10;

typedef int value_type;
typedef long long result_type;
typedef std::vector<value_type> vector_t;
typedef typename vector_t::size_type size_type;

// Dot product of x and y over the first x.size() elements.
// y must hold at least x.size() elements; no bounds check is performed.
inline value_type scalar_product(const vector_t& x, const vector_t& y) {
  value_type res = 0;
  size_type siz = x.size();
  for (size_type i = 0; i < siz; ++i)
    res += x[i] * y[i];
  return res;
}

// Benchmark driver: builds N random vectors of `size` ints, sums every
// pairwise scalar product and reports the elapsed time of each phase.
int main() {
  auto tm_before = std::chrono::system_clock::now();

  // 1. allocate and fill randomly many short vectors
  // std::vector owns the storage, so it is released automatically.
  std::vector<vector_t> xs(N, vector_t(size));
  std::cerr << "allocation: " << time_since(tm_before) << " ms" << std::endl;

  std::mt19937 rnd_engine;
  std::uniform_int_distribution<value_type> runif_gen(-1000, 1000);
  for (vector_t& vec : xs)
    for (value_type& elem : vec)
      elem = runif_gen(rnd_engine);
  std::cerr << "random generation: " << time_since(tm_before) << " ms" << std::endl;

  // 2. compute all pairwise scalar products:
  time_since(tm_before);  // restart the timer so the log output above is not counted
  result_type avg = 0;
  for (const vector_t& a : xs)
    for (const vector_t& b : xs)
      avg += scalar_product(a, b);
  // Divide by the number of pairs; the parentheses matter because
  // `avg / N*N` would parse as `(avg / N) * N`.
  avg /= static_cast<result_type>(N) * N;
  const auto elapsed_ms = time_since(tm_before);
  std::cout << "result: " << avg << std::endl;
  std::cout << "time: " << elapsed_ms << " ms" << std::endl;
}

此处的D版本:

  import std.stdio;
import std.datetime;
import std.random;

// Problem size: N vectors, each holding `size` elements.
const long N = 20000;
const int size = 10;

alias int value_type;
alias long result_type;
alias value_type[] vector_t;
// Array lengths are size_t; a uint index type does not compile on 64-bit targets.
alias size_t size_type;

/// Dot product of x and y over the first x.length elements.
/// y must hold at least x.length elements.
value_type scalar_product(const ref vector_t x, const ref vector_t y) {
  value_type res = 0;
  size_type siz = x.length;
  for (size_type i = 0; i < siz; ++i)
    res += x[i] * y[i];
  return res;
}

/// Benchmark driver: allocates N vectors of `size` ints, fills them with
/// random values, sums every pairwise scalar product and prints timings.
int main() {
  auto tm_before = Clock.currTime();

  // 1. allocate and fill randomly many short vectors
  vector_t[] xs;
  xs.length = N;
  foreach (ref row; xs)
    row.length = size;
  // %s prints the Duration through its toString; std.format has no %i specifier.
  writefln("allocation: %s ", Clock.currTime() - tm_before);
  tm_before = Clock.currTime();

  foreach (ref row; xs)
    foreach (ref elem; row)
      elem = uniform!"[]"(-1000, 1000);  // closed interval, matching the C++ program's distribution
  writefln("random: %s ", Clock.currTime() - tm_before);
  tm_before = Clock.currTime();

  // 2. compute all pairwise scalar products:
  result_type avg = 0;
  foreach (ref a; xs)
    foreach (ref b; xs)
      avg += scalar_product(a, b);
  // Divide by the number of pairs; `avg / N*N` would parse as `(avg / N) * N`.
  avg /= N * N;
  auto time = Clock.currTime() - tm_before;  // stop the clock before doing any output
  writefln("result: %d", avg);
  writefln("scalar products: %s ", time);

  return 0;
}


解决方案

要启用所有优化并禁用所有安全检查,请使用以下 DMD 标志编译 D 程序:

  -O -inline -release -noboundscheck

编辑:我已经用 g++、dmd 和 gdc 试过您的程序。

dmd 确实落后一些,但 gdc 的性能与 g++ 非常接近。我使用的命令行是 gdmd -O -release -inline(gdmd 是 gdc 的一个包装器,接受 dmd 的选项)。



查看汇编代码清单,dmd 和 gdc 似乎都没有内联 scalar_product,但 g++/gdc 确实生成了 MMX 指令,因此它们可能对循环做了自动向量化。


I like some features of D, but would be interested if they come with a runtime penalty?

To compare, I implemented a simple program that computes scalar products of many short vectors both in C++ and in D. The result is surprising:

  • D: 18.9 s [see below for final runtime]
  • C++: 3.8 s

Is C++ really almost five times as fast or did I make a mistake in the D program?

I compiled C++ with g++ -O3 (gcc-snapshot 2011-02-19) and D with dmd -O (dmd 2.052) on a moderately recent Linux desktop. The results are reproducible over several runs, and the standard deviations are negligible.

Here the C++ program:

#include <iostream>
#include <random>
#include <chrono>
#include <string>

#include <vector>
#include <array>

/// Millisecond duration with a `long` tick count, used for all timings here.
using millisecs = std::chrono::duration<long, std::ratio<1, 1000>>;

/// Returns the whole milliseconds elapsed since `time` and resets `time`.
///
/// The current instant is read from the time point's own clock, so the
/// function works with any clock, not only std::chrono::system_clock. The
/// clock is sampled once and that sample is stored back, so consecutive
/// intervals have no unmeasured gap between them.
///
/// @param time  In/out: start of the interval; set to the sampled "now".
/// @return      Elapsed time in milliseconds, truncated toward zero.
template <typename Clock>
long time_since(std::chrono::time_point<Clock>& time) {
  const auto now = Clock::now();
  const long elapsed_ms = std::chrono::duration_cast<millisecs>(now - time).count();
  time = now;
  return elapsed_ms;
}

// Problem size: N vectors, each holding `size` elements.
const long N = 20000;
const int size = 10;

// Element, accumulator, container and index types used by the benchmark.
using value_type = int;
using result_type = long long;
using vector_t = std::vector<value_type>;
using size_type = vector_t::size_type;

/// Dot product of x and y taken over the first x.size() elements.
/// y must hold at least x.size() elements; no bounds check is performed.
inline value_type scalar_product(const vector_t& x, const vector_t& y) {
  const size_type count = x.size();
  value_type sum = 0;
  for (size_type idx = 0; idx != count; ++idx) {
    sum += x[idx] * y[idx];
  }
  return sum;
}

int main() {
  auto tm_before = std::chrono::system_clock::now();

  // 1. allocate and fill randomly many short vectors
  vector_t* xs = new vector_t [N];
  for (int i = 0; i < N; ++i) {
    xs[i] = vector_t(size);
      }
  std::cerr << "allocation: " << time_since(tm_before) << " ms" << std::endl;

  std::mt19937 rnd_engine;
  std::uniform_int_distribution<value_type> runif_gen(-1000, 1000);
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < size; ++j)
      xs[i][j] = runif_gen(rnd_engine);
  std::cerr << "random generation: " << time_since(tm_before) << " ms" << std::endl;

  // 2. compute all pairwise scalar products:
  time_since(tm_before);
  result_type avg = 0;
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j)
      avg += scalar_product(xs[i], xs[j]);
  avg = avg / N*N;
  auto time = time_since(tm_before);
  std::cout << "result: " << avg << std::endl;
  std::cout << "time: " << time << " ms" << std::endl;
}

And here the D version:

import std.stdio;
import std.datetime;
import std.random;

// Problem size: N vectors, each holding `size` elements.
const long N = 20000;
const int size = 10;

// Element, accumulator, vector and index types used by the benchmark.
alias int value_type;
alias long result_type;
alias value_type[] vector_t;
// Array lengths are size_t; a uint index type does not compile on 64-bit targets.
alias size_t size_type;

/// Dot product of x and y taken over the first x.length elements.
/// y must hold at least x.length elements.
value_type scalar_product(const ref vector_t x, const ref vector_t y) {
  value_type res = 0;
  size_type siz = x.length;
  for (size_type i = 0; i < siz; ++i)
    res += x[i] * y[i];
  return res;
}

/// Benchmark driver: allocates N vectors of `size` ints, fills them with
/// random values, sums every pairwise scalar product and prints timings.
int main() {
  auto tm_before = Clock.currTime();

  // 1. allocate and fill randomly many short vectors
  vector_t[] xs;
  xs.length = N;
  foreach (ref row; xs)
    row.length = size;
  // %s prints the Duration through its toString; std.format has no %i specifier.
  writefln("allocation: %s ", Clock.currTime() - tm_before);
  tm_before = Clock.currTime();

  foreach (ref row; xs)
    foreach (ref elem; row)
      elem = uniform!"[]"(-1000, 1000);  // closed interval, matching the C++ program's distribution
  writefln("random: %s ", Clock.currTime() - tm_before);
  tm_before = Clock.currTime();

  // 2. compute all pairwise scalar products:
  result_type avg = 0;
  foreach (ref a; xs)
    foreach (ref b; xs)
      avg += scalar_product(a, b);
  // Divide by the number of pairs; `avg / N*N` would parse as `(avg / N) * N`.
  avg /= N * N;
  auto time = Clock.currTime() - tm_before;  // stop the clock before doing any output
  writefln("result: %d", avg);
  writefln("scalar products: %s ", time);

  return 0;
}
解决方案

To enable all optimizations and disable all safety checks, compile your D program with the following DMD flags:

-O -inline -release -noboundscheck

EDIT: I've tried your programs with g++, dmd and gdc. dmd does lag behind, but gdc achieves performance very close to g++. The commandline I used was gdmd -O -release -inline (gdmd is a wrapper around gdc which accepts dmd options).

Looking at the assembler listing, it looks like neither dmd nor gdc inlined scalar_product, but g++/gdc did emit MMX instructions, so they might be auto-vectorizing the loop.

这篇关于D与C ++相比有多快?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!

09-15 12:21