我目前正在寻找一个C++库(最好仅用于 header ),以像记法这样的爱因斯坦求和的高阶张量收缩。

Fastor(https://github.com/romeric/Fastor)似乎非常适合,并且由于Eigen(我经常使用)具有张量模块,因此我做了一个小比较,包括一个简单循环实现的基准:

#include <Fastor.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include <ctime>

int main() {
clock_t tstart, tend;
 {
    using namespace Fastor;
    Tensor<double,20,50,10> A;
    Tensor<double,50,10,20,4> B;
    Tensor<double, 4> C;
    Tensor<double, 10, 10, 4> D;

    A.ones();
    B.ones();
    C.zeros();
    D.zeros();
    enum {I,J,K,L,M,N};

    tstart = clock();

    C += einsum<Index<I,J,K>,Index<J,K,I,M>>(A,B);

    tend = clock();
    std::cout << "Fastor, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();

    D += einsum<Index<I,J,K>,Index<J,M,I,N>>(A,B);

    tend = clock();
    std::cout << "Fastor, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;
 }

 {
     using namespace Eigen;

    TensorFixedSize<double, Sizes<20, 50, 10>> A;
    TensorFixedSize<double, Sizes<50,10, 20, 4>> B;
    TensorFixedSize<double, Sizes<4>> C;
    TensorFixedSize<double, Sizes<10,10,4>> D;

    A.setConstant(1);
    B.setConstant(1);
    C.setConstant(0);
    D.setConstant(0);

     array<IndexPair<int>,3> IJK_JKIM = {
         IndexPair<int>(0, 2),
         IndexPair<int>(1, 0),
         IndexPair<int>(2, 1),
     };

     array<IndexPair<int>,2> IJK_JMIN = {
         IndexPair<int>(0, 2),
         IndexPair<int>(1, 0),
     };

    tstart = clock();
    C += A.contract(  B,  IJK_JKIM) ;
    tend = clock();

    std::cout << "Eigen, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();
     D += A.contract(  B,  IJK_JMIN) ;
    tend = clock();

    std::cout << "Eigen, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;


 }

 {
     using namespace Eigen;

     double A [20][50][10]  = {1} ;
     double B [50][10][20][4]  = {1};
     double C [4]  = {};
     double D [10][10][4]  = {};

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int l = 0; l < 4; l++)
                    B[j][k][i][l] = 1.0;


    tstart = clock();

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int l = 0; l < 4; l++)
                    C[l] += A[i][j][k] * B [j][k][i][l];

    tend = clock();
    std::cout << "CTran, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();
    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int m = 0; m < 10; m++)
                    for ( int n = 0; n < 4; n++)
                        D[k][m][n] += A[i][j][k] * B [j][m][i][n];

    tend = clock();

    std::cout << "CTran, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;

 }

return 0;
}

对我来说输出是:
Fastor, C_M = A_IJK * B_JKIM:   206
Fastor, D_KMN = A_IJ * B_JMIN:  2230
Eigen, C_M = A_IJK * B_JKIM:    1286
Eigen, D_KMN = A_IJ * B_JMIN:   898
CTran, C_M = A_IJK * B_JKIM:    2
CTran, D_KMN = A_IJ * B_JMIN:   2

已使用g++ 9.1.0(Arch Linux)进行了编译
g++ test.cpp -O3 -std=c++17  -I Fastor -isystem eigen-eigen-323c052e1731 -o test

因此,似乎Fastor在示例1中比在Eigen上快得多,但在示例2中则慢得多。
但是,这两个库都比幼稚的循环实现慢得多!
这些矛盾的结果有解释吗?先感谢您!

最佳答案

您可以使用Fastor的unusedtimeit函数进行适当的基准测试,而不必担心编译器是否要消除死代码。

我将您的基准编写如下:

constexpr int FIFTY =  50;
constexpr int TWETY =  20;
constexpr int TEN =  10;
constexpr int FOUR =  4;

using real_ = double;

real_ A [TWETY][FIFTY][TEN]  = {1} ;
real_ B [FIFTY][TEN][TWETY][FOUR]  = {1};
real_ C [FOUR]  = {};
real_ D [TEN][TEN][FOUR]  = {};


Fastor::Tensor<real_,TWETY,FIFTY,TEN> fA;
Fastor::Tensor<real_,FIFTY,TEN,TWETY,FOUR> fB;

Eigen::TensorFixedSize<real_, Eigen::Sizes<TWETY, FIFTY, TEN>> eA;
Eigen::TensorFixedSize<real_, Eigen::Sizes<FIFTY,TEN, TWETY, FOUR>> eB;
Eigen::TensorFixedSize<real_, Eigen::Sizes<FOUR>> eC;
Eigen::TensorFixedSize<real_, Eigen::Sizes<TEN,TEN,FOUR>> eD;


void CTran_C() {

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    B[j][k][i][l] = 1.0;

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    C[l] += A[i][j][k] * B[j][k][i][l];

    Fastor::unused(A);
    Fastor::unused(B);
    Fastor::unused(C);
}

void CTran_D() {

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    B[j][k][i][l] = 1.0;

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int m = 0; m < TEN; m++)
                    for ( int n = 0; n < FOUR; n++)
                        D[k][m][n] += A[i][j][k] * B [j][m][i][n];

    Fastor::unused(A);
    Fastor::unused(B);
    Fastor::unused(D);
}


void Fastor_C() {

    using namespace Fastor;
    enum {I,J,K,L,M,N};

    fA.ones();
    fB.ones();

    auto fC = einsum<Index<I,J,K>,Index<J,K,I,M>>(fA,fB);

    unused(fA);
    unused(fB);
    unused(fC);
}

void Fastor_D() {

    using namespace Fastor;
    enum {I,J,K,L,M,N};

    fA.ones();
    fB.ones();

    auto fD = einsum<Index<I,J,K>,Index<J,M,I,N>>(fA,fB);

    unused(fA);
    unused(fB);
    unused(fD);
}


void Eigen_C() {

    using namespace Eigen;

    eA.setConstant(1);
    eB.setConstant(1);

    array<IndexPair<int>,3> IJK_JKIM = {
        IndexPair<int>(0, 2),
        IndexPair<int>(1, 0),
        IndexPair<int>(2, 1),
    };

    eC = eA.contract(  eB,  IJK_JKIM) ;

    Fastor::unused(eA);
    Fastor::unused(eB);
    Fastor::unused(eC);
}


void Eigen_D() {
    using namespace Eigen;

    eA.setConstant(1);
    eB.setConstant(1);

     array<IndexPair<int>,2> IJK_JMIN = {
         IndexPair<int>(0, 2),
         IndexPair<int>(1, 0),
     };

    eD = eA.contract(  eB,  IJK_JMIN) ;

    Fastor::unused(eA);
    Fastor::unused(eB);
    Fastor::unused(eD);
}


int main() {
    using namespace Fastor;

    print("Time for computing tensor C (CTran, Fastor, Eigen):");
    timeit(CTran_C);
    timeit(Fastor_C);
    timeit(Eigen_C);
    print("Time for computing tensor D (CTran, Fastor, Eigen):");
    timeit(CTran_D);
    timeit(Fastor_D);
    timeit(Eigen_D);

    return 0;
}

使用GCC 9 g++-9 -O3 -DNDEBUG -mfma,Eigen-3.3.7和GitHub上Fastor的最新版本进行编译,这就是我得到的
Time for computing tensor C (CTran, Fastor, Eigen):
32305 runs, average elapsed time is 30.9556 µs. No of CPU cycles 113604
42325 runs, average elapsed time is 23.6269 µs. No of CPU cycles 86643
2585 runs, average elapsed time is 387.003 µs. No of CPU cycles 1429229
Time for computing tensor D (CTran, Fastor, Eigen):
8086 runs, average elapsed time is 123.674 µs. No of CPU cycles 455798
21246 runs, average elapsed time is 47.0713 µs. No of CPU cycles 173044
5890 runs, average elapsed time is 169.793 µs. No of CPU cycles 626651

请注意,编译器会自动对您的CTran代码进行矢量化处理,因为它是带有for循环的直接函数,因此生成的代码非常理想。如您所见,在两种情况下,Fastor的性能均优于其他产品。

但是,如果您确实要对堆栈分配的数据进行基准测试,则应减小张量的大小,因为这些尺寸对于堆栈数据来说是非常不现实的。如果我将张量的大小重新定义为

constexpr int FIFTY =  5;
constexpr int TWETY =  2;
constexpr int TEN =  8;
constexpr int FOUR =  4;

这就是我得到的
Time for computing tensor C (CTran, Fastor, Eigen):
5493029 runs, average elapsed time is 182.049 ns. No of CPU cycles 486
5865156 runs, average elapsed time is 170.498 ns. No of CPU cycles 442
301422 runs, average elapsed time is 3.31761 µs. No of CPU cycles 12032
Time for computing tensor D (CTran, Fastor, Eigen):
207912 runs, average elapsed time is 4.80974 µs. No of CPU cycles 17574
2733602 runs, average elapsed time is 365.818 ns. No of CPU cycles 1164
514351 runs, average elapsed time is 1.94421 µs. No of CPU cycles 6977

为了计算张量D,编译器对于该大小严重错误地编译了CTran代码。请注意,微秒µs和纳秒ns之间的差异。

09-26 03:52