D与C ++相比有多快？

本文介绍了D与C ++相比有多快？的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我喜欢D的某些功能，但想知道它们是否会带来
运行时开销？

为了比较，我实现了一个简单的程序在C ++和D中计算许多短向量的标量积。结果是令人惊讶的：

D：18.9 s [最终运行时间见下文]

C ++：3.8 s

C ++真的快了将近五倍吗，还是我在D
程序中犯了错误？

我用 g++ -O3（gcc-snapshot 2011-02-19）编译C ++，用 dmd -O（dmd 2.052）编译D，运行环境是一台较新的linux桌面。多次运行的结果可以复现，标准差可以忽略。

这里是C ++程序：

  #include <iostream>
  #include <random>
  #include <chrono>
  #include <string>

  #include <vector>
  #include <array>

  // Millisecond duration stored in a long; all timings are reported in it.
  typedef std::chrono::duration<long, std::ratio<1, 1000>> millisecs;

  /// Returns the whole milliseconds elapsed since `time` and resets `time`
  /// to the current instant of the time point's own clock.
  template <typename Clock>
  long time_since(std::chrono::time_point<Clock>& time) {
    // Sample the clock once so no time is lost between measuring and resetting.
    const auto now = Clock::now();
    const long elapsed = std::chrono::duration_cast<millisecs>(now - time).count();
    time = now;
    return elapsed;
  }

  const long N = 20000;  // number of vectors
  const int size = 10;   // elements per vector

  typedef int value_type;
  typedef long long result_type;
  typedef std::vector<value_type> vector_t;
  typedef typename vector_t::size_type size_type;

  /// Sum of x[i] * y[i] over all indices of x; y must be at least as long as x.
  inline value_type scalar_product(const vector_t& x, const vector_t& y) {
    value_type res = 0;
    size_type siz = x.size();
    for (size_type i = 0; i < siz; ++i)
      res += x[i] * y[i];
    return res;
  }

  /// Benchmark driver: times allocation, random filling and all N*N pairwise
  /// scalar products, then prints the mean product and the elapsed time.
  int main() {
    auto tm_before = std::chrono::system_clock::now();

    // 1. allocate and fill randomly many short vectors
    // A vector of vectors owns its storage, so nothing is leaked at exit.
    std::vector<vector_t> xs(static_cast<size_type>(N), vector_t(size));
    std::cerr << "allocation: " << time_since(tm_before) << " ms" << std::endl;

    std::mt19937 rnd_engine;
    std::uniform_int_distribution<value_type> runif_gen(-1000, 1000);
    for (auto& vec : xs)
      for (auto& elem : vec)
        elem = runif_gen(rnd_engine);
    std::cerr << "random generation: " << time_since(tm_before) << " ms" << std::endl;

    // 2. compute all pairwise scalar products:
    time_since(tm_before);  // restart the timer so printing above is excluded
    result_type avg = 0;
    for (const auto& lhs : xs)
      for (const auto& rhs : xs)
        avg += scalar_product(lhs, rhs);
    // Parenthesised: `avg / N*N` would parse as `(avg / N) * N`.
    avg = avg / (N * N);
    auto time = time_since(tm_before);
    std::cout << "result: " << avg << std::endl;
    std::cout << "time: " << time << " ms" << std::endl;
  }

此处的D版本：

  import std.stdio;
  import std.datetime;
  import std.random;

  const long N = 20000;  // number of vectors
  const int size = 10;   // elements per vector

  alias int value_type;
  alias long result_type;
  alias value_type[] vector_t;
  // size_t matches the type of `.length` on both 32-bit and 64-bit targets.
  alias size_t size_type;

  /// Sum of x[i] * y[i] over all indices of x; y must be at least as long as x.
  value_type scalar_product(const ref vector_t x, const ref vector_t y) {
    value_type res = 0;
    size_type siz = x.length;
    for (size_type i = 0; i < siz; ++i)
      res += x[i] * y[i];
    return res;
  }

  /// Benchmark driver: times allocation, random filling and all N*N pairwise
  /// scalar products, then prints the mean product and the elapsed time.
  int main() {
    auto tm_before = Clock.currTime();

    // 1. allocate and fill randomly many short vectors
    vector_t[] xs;
    xs.length = cast(size_t) N;
    foreach (ref vec; xs)
      vec.length = size;
    // %s formats the elapsed-time value; %i is not a std.format specifier.
    writefln("allocation: %s ", (Clock.currTime() - tm_before));
    tm_before = Clock.currTime();

    // "[]" makes both bounds inclusive, matching the C++ distribution.
    foreach (ref vec; xs)
      foreach (ref elem; vec)
        elem = uniform!"[]"(-1000, 1000);
    writefln("random: %s ", (Clock.currTime() - tm_before));
    tm_before = Clock.currTime();

    // 2. compute all pairwise scalar products:
    result_type avg = cast(result_type) 0;
    foreach (ref lhs; xs)
      foreach (ref rhs; xs)
        avg += scalar_product(lhs, rhs);
    // Parenthesised: `avg / N*N` would parse as `(avg / N) * N`.
    avg = avg / (N * N);
    // Stop the timer before printing, as the C++ version does.
    auto time = Clock.currTime() - tm_before;
    writefln("result: %d", avg);
    writefln("scalar products: %s ", time);

    return 0;
  }

解决方案

启用所有优化并停用所有安全检查，使用以下DMD标志编译D程序：

  -O -inline -release -noboundscheck 
  编辑：我已经用g ++、dmd和gdc试过您的程序。
 dmd滞后，但gdc的性能非常接近g ++。我使用的命令行是 gdmd -O -release -inline （gdmd是gdc的一个包装，接受dmd选项）。
 
 
 查看汇编代码列表，看起来dmd和gdc都没有内联 scalar_product ，但g ++ / gdc确实生成了MMX指令，因此它们可能对循环做了自动向量化。
 
I like some features of D, but I would like to know whether they come with a
runtime penalty.

To compare, I implemented a simple program that computes scalar products of many short vectors both in C++ and in D. The result is surprising:


D:   18.9 s   [see below for final runtime] 
C++:  3.8 s


Is C++ really almost five times as fast or did I make a mistake in the D
program?

I compiled C++ with g++ -O3 (gcc-snapshot 2011-02-19) and D with dmd -O (dmd 2.052) on a moderately recent Linux desktop. The results are reproducible over several runs, and the standard deviations are negligible.

Here is the C++ program:
#include <iostream>
#include <random>
#include <chrono>
#include <string>

#include <vector>
#include <array>

// Millisecond duration stored in a long; time_since() reports in this unit.
typedef std::chrono::duration<long, std::ratio<1, 1000>> millisecs;

/// Returns the whole milliseconds elapsed since `time` and resets `time` to
/// the current instant, so consecutive calls measure back-to-back intervals.
///
/// @tparam Clock  clock type of the time point; its own now() is used, so the
///                function works for any clock, not only system_clock.
/// @param time    in/out: start of the interval on entry, "now" on return.
/// @return        elapsed time truncated to whole milliseconds.
template <typename Clock>
long time_since(std::chrono::time_point<Clock>& time) {
  // Sample the clock once: the same instant ends this interval and starts
  // the next one, so no time is lost between measuring and resetting.
  const auto now = Clock::now();
  const long elapsed = std::chrono::duration_cast<millisecs>(now - time).count();
  time = now;
  return elapsed;
}

// Number of vectors; main() multiplies every ordered pair of them.
constexpr long N = 20000;
// Number of elements in each vector.
constexpr int size = 10;

// Element type of the vectors and of a single scalar product.
using value_type = int;
// Wider integer type; main() accumulates the sum of all products in it.
using result_type = long long;
using vector_t = std::vector<value_type>;
using size_type = vector_t::size_type;

/// Computes the scalar (dot) product of two integer vectors.
///
/// Walks both vectors in lock step over the length of `x`; `y` must hold at
/// least as many elements as `x`.
///
/// @param x  left operand; its length determines how many terms are summed.
/// @param y  right operand.
/// @return   sum of x[i] * y[i], accumulated in value_type.
inline value_type scalar_product(const vector_t& x, const vector_t& y) {
  value_type acc = 0;
  auto rhs = y.begin();
  for (auto lhs = x.begin(); lhs != x.end(); ++lhs, ++rhs) {
    acc += *lhs * *rhs;
  }
  return acc;
}

int main() {
  auto tm_before = std::chrono::system_clock::now();

  // 1. allocate and fill randomly many short vectors
  vector_t* xs = new vector_t [N];
  for (int i = 0; i < N; ++i) {
    xs[i] = vector_t(size);
      }
  std::cerr << "allocation: " << time_since(tm_before) << " ms" << std::endl;

  std::mt19937 rnd_engine;
  std::uniform_int_distribution<value_type> runif_gen(-1000, 1000);
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < size; ++j)
      xs[i][j] = runif_gen(rnd_engine);
  std::cerr << "random generation: " << time_since(tm_before) << " ms" << std::endl;

  // 2. compute all pairwise scalar products:
  time_since(tm_before);
  result_type avg = 0;
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j)
      avg += scalar_product(xs[i], xs[j]);
  avg = avg / N*N;
  auto time = time_since(tm_before);
  std::cout << "result: " << avg << std::endl;
  std::cout << "time: " << time << " ms" << std::endl;
}
And here is the D version:
import std.stdio;
import std.datetime;
import std.random;

// Number of vectors; main() multiplies every ordered pair of them.
immutable long N = 20000;
// Number of elements in each vector.
immutable int size = 10;

// Element type of the vectors and of a single scalar product.
alias int value_type;
// Wider integer type; main() accumulates the sum of all products in it.
alias long result_type;
alias value_type[] vector_t;
// size_t is the type of `.length`; `uint` would be an implicit narrowing
// that is rejected when compiling for 64-bit targets.
alias size_t size_type;

/// Computes the scalar (dot) product of two integer arrays.
///
/// Params:
///   x = left operand; its length determines how many terms are summed.
///   y = right operand; must hold at least as many elements as x.
/// Returns: the sum of x[i] * y[i], accumulated in value_type.
value_type scalar_product(const ref vector_t x, const ref vector_t y) {
  value_type res = 0;
  foreach (size_type i; 0 .. x.length)
    res += x[i] * y[i];
  return res;
}

/// Benchmark driver.
///
/// Allocates N arrays of `size` elements, fills them with uniformly
/// distributed integers in [-1000, 1000], computes the scalar product of
/// every ordered pair, and prints the mean product and the time each phase
/// took.
int main() {
  auto tm_before = Clock.currTime();

  // 1. allocate and fill randomly many short vectors
  vector_t[] xs;
  xs.length = cast(size_t) N;
  foreach (ref vec; xs)
    vec.length = size;
  // %s formats the elapsed-time value; %i is not a std.format specifier.
  writefln("allocation: %s ", (Clock.currTime() - tm_before));
  tm_before = Clock.currTime();

  // "[]" makes both bounds inclusive, matching the C++ program's
  // uniform_int_distribution(-1000, 1000); the default excludes the upper one.
  foreach (ref vec; xs)
    foreach (ref elem; vec)
      elem = uniform!"[]"(-1000, 1000);
  writefln("random: %s ", (Clock.currTime() - tm_before));
  tm_before = Clock.currTime();

  // 2. compute all pairwise scalar products:
  result_type avg = cast(result_type) 0;
  foreach (ref lhs; xs)
    foreach (ref rhs; xs)
      avg += scalar_product(lhs, rhs);
  // Divide by the number of pairs. The parentheses matter: `avg / N*N`
  // parses as `(avg / N) * N`, which is not the mean.
  avg = avg / (N * N);
  // Stop the timer before printing so output cost is not measured,
  // as in the C++ program.
  auto time = Clock.currTime() - tm_before;
  writefln("result: %d", avg);
  writefln("scalar products: %s ", time);

  return 0;
}
 解决方案 
To enable all optimizations and disable all safety checks, compile your D program with the following DMD flags:
-O -inline -release -noboundscheck
EDIT: I've tried your programs with g++, dmd and gdc. dmd does lag behind, but gdc achieves performance very close to g++. The commandline I used was gdmd -O -release -inline (gdmd is a wrapper around gdc which accepts dmd options).
Looking at the assembler listing, it looks like neither dmd nor gdc inlined scalar_product, but g++/gdc did emit MMX instructions, so they might be auto-vectorizing the loop.
                        这篇关于D与C ++相比有多快？的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持！