I am using MPI to split a matrix and send the pieces to N processes, but I found that MPI_Scatter / Gather is not efficient. I wrote two programs to compare MPI_Send / Recv with MPI_Scatter / Gather.

MPI_Send / Recv:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>

#define MASTER 0

double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);
double* create_vector(uint32_t n);
int print_matrix(double *m, uint32_t nrow, uint32_t ncol);

int main( int argc, char** argv )
{
    double *A, *B, *C, *A_buf, *C_buf;
    double t_start, t_end, buf;
    uint32_t M; //number of rows
    uint32_t N; //number of columns
    uint32_t nrows, size, rank, recv_len;
    MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_SUB_VECTOR;
    MPI_Comm comm;
    MPI_Status status;

    M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
    N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    nrows = M/size;
    //create derived data type
    MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
    MPI_Type_commit(&MPI_MATRIX);
    MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
    MPI_Type_commit(&MPI_VECTOR);
    MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_SUB_VECTOR);
    MPI_Type_commit(&MPI_SUB_VECTOR);

    if(rank == MASTER)
    {
        //A: M*N
        A = create_matrix(M, N);
        C = create_matrix(M, 1);

        if(A == NULL || C == NULL)
        {
            printf( "Allocation of matrix failed.\n" );
            exit(EXIT_FAILURE);
        }
    }

    B = create_vector(N);
    A_buf = create_matrix(nrows, N);
    C_buf = zero_matrix(nrows, 1);

    if(B == NULL || A_buf == NULL || C_buf == NULL)
    {
        printf( "Allocation of matrix failed.\n" );
        exit(EXIT_FAILURE);
    }

    if(rank == MASTER)
    {
        //exclude the time of establishing TCP connections
        for(int i = 1;i < size;i++)
            MPI_Send(&buf, 1, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);

        t_start = MPI_Wtime();
        for(int i = 0;i < nrows*N;i++)
            A_buf[i] = A[i];

        //send submatrix to other processes
        for(int i = 1;i < size;i++)
        {
            MPI_Send(&A[i*nrows*N], 1, MPI_MATRIX, i, 0, MPI_COMM_WORLD);
            MPI_Send(B, 1, MPI_VECTOR, i, 0, MPI_COMM_WORLD);
        }
    }
    else
    {
        //receive to establish connection with MASTER
        MPI_Recv(&buf, 1, MPI_DOUBLE, MASTER, 0, MPI_COMM_WORLD, &status);

        //receive matrix
        MPI_Recv(A_buf, 1, MPI_MATRIX, MASTER, 0, MPI_COMM_WORLD, &status);
        MPI_Recv(B, 1, MPI_VECTOR, MASTER, 0, MPI_COMM_WORLD, &status);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if(rank == MASTER)
    {
        for(int i = 0;i < nrows;i++)
            C[i] = C_buf[i];

        for(int i = 1;i < size;i++)
            MPI_Recv(&C[i*nrows], 1, MPI_SUB_VECTOR, i, 0, MPI_COMM_WORLD, &status);

        t_end = MPI_Wtime();
        printf("%dx%d/%d: %7.4f\n", M, N, size, t_end - t_start);
    }
    else
    {
        MPI_Send(C_buf, 1, MPI_SUB_VECTOR, MASTER, 0, MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Type_free(&MPI_MATRIX);
    MPI_Type_free(&MPI_VECTOR);
    MPI_Type_free(&MPI_SUB_VECTOR);

    if(rank == MASTER)
    {
        free(A);
        free(C);
    }

    free(B);
    free(A_buf);
    free(C_buf);

    MPI_Finalize();

    return EXIT_SUCCESS;
}

double* create_matrix(uint32_t nrow, uint32_t ncol)
{
    double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
    if(matrix == NULL)
    {
        return NULL;
    }

    srand((unsigned)time(NULL));

    for(uint32_t i = 0;i < nrow*ncol;i++)
    {
        matrix[i] = (double)1;
    }

    return matrix;
}


double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
    double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
    if(matrix == NULL)
    {
        return NULL;
    }

    for(uint32_t i = 0;i < nrow*ncol;i++)
    {
        matrix[i] = (double)0;
    }

    return matrix;
}

double* create_vector(uint32_t n)
{
    return create_matrix(n, 1);
}


MPI_Scatter / Gather:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>

#define MASTER 0
double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);

int main( int argc, char** argv )
{
    double t_start, t_end, buf;
    double *A, *B, *C, *A_buf, *C_buf;
    uint32_t M; //number of rows
    uint32_t N; //number of columns
    uint32_t nrows, size, rank;
    uint32_t i_start, i_end;
    MPI_Comm comm;
    MPI_Status status;
    MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_RESULT;

    M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
    N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    nrows = M/size;
    //create derived data type
    MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
    MPI_Type_commit(&MPI_MATRIX);
    MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
    MPI_Type_commit(&MPI_VECTOR);
    MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_RESULT);
    MPI_Type_commit(&MPI_RESULT);

    if(rank == MASTER)
    {
        //A: M*N
        A = zero_matrix(M, N);
        C = create_matrix(M, 1);

        if(A == NULL || C == NULL)
        {
            printf( "Allocation of matrix failed.\n" );
            exit(EXIT_FAILURE);
        }
    }

    B = zero_matrix(N, 1);
    A_buf = create_matrix(nrows, N);
    C_buf = create_matrix(nrows, 1);

    if(B == NULL || A_buf == NULL || C_buf == NULL)
    {
        printf( "Allocation of matrix failed.\n" );
        exit(EXIT_FAILURE);
    }

    //exclude the time of establishing TCP connections
    MPI_Bcast(&buf, 1, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    if(rank == MASTER)
    {
        t_start = MPI_Wtime();
    }

    // scatter A
    MPI_Scatter(A, 1, MPI_MATRIX, A_buf, 1, MPI_MATRIX, 0, MPI_COMM_WORLD);

    // broadcast B
    MPI_Bcast(B, 1, MPI_VECTOR, 0, MPI_COMM_WORLD);

    // gather C
    MPI_Gather(C_buf, 1, MPI_RESULT, C, 1, MPI_RESULT, 0, MPI_COMM_WORLD);

    if(rank == MASTER)
    {
        t_end = MPI_Wtime();
        printf("%d %7.4f\n", size, t_end - t_start);

        free(A);
        free(C);
    }

    MPI_Type_free(&MPI_MATRIX);
    MPI_Type_free(&MPI_VECTOR);
    MPI_Type_free(&MPI_RESULT);

    free(B);
    free(A_buf);
    free(C_buf);

    MPI_Finalize();

    return EXIT_SUCCESS;
}

double* create_matrix(uint32_t nrow, uint32_t ncol)
{
    double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
    if(matrix == NULL)
    {
        return NULL;
    }

    srand((unsigned)time(NULL));

    for(uint32_t i = 0;i < nrow*ncol;i++)
    {
        matrix[i] = (double)rand();
    }

    return matrix;
}


double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
    double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
    if(matrix == NULL)
    {
        return NULL;
    }

    for(uint32_t i = 0;i < nrow*ncol;i++)
    {
        matrix[i] = (double)1;
    }

    return matrix;
}


I use the following script to run both of them:

#!/bin/bash
dims="4096"
ntasks="1 2 4 8"
echo -n "" > log
for dim in $dims;
do
    echo "dim=$dim:"
    for n in $ntasks;
    do
        srun --ntasks=$n --ntasks-per-node=1 --cpu-freq=2900000 ./matrix $dim $dim | tee -a log
    done
done


Transfer time:

program        | ntasks=1 | ntasks=2 | ntasks=4 | ntasks=8 |
------------------------------------------------------------
send/recv      |  0.0684s |  0.0638s |  0.0654s |  0.0638s |
scatter/gather |  0.0367s |  0.0492s |  0.0765s |  0.1283s |


With the data transfer time of scatter/gather growing this quickly, is there still any reason for me to use it instead of a send/recv loop? I know that scatter is a wrapper around send and gather is a wrapper around recv, but what else do they do?

Best answer

To clarify: MPI_Scatter and MPI_Gather both (most likely) use MPI_Send and MPI_Recv under the hood.
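
To illustrate that, here is a minimal, purely conceptual sketch of what a scatter built from a send/recv loop could look like. The function name naive_scatter is my own invention, and the sketch assumes <mpi.h> and <string.h> are included and that sendbuf holds size*count doubles on the root; real implementations add tree-based algorithms, shared-memory shortcuts and other optimizations on top of this.

//conceptual only: what a scatter boils down to as a plain send/recv loop
static int naive_scatter(const double *sendbuf, double *recvbuf,
                         int count, int root, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if(rank == root)
    {
        for(int i = 0;i < size;i++)
        {
            if(i == root) //the root keeps its own block, no message needed
                memcpy(recvbuf, &sendbuf[i*count], count*sizeof(double));
            else
                MPI_Send(&sendbuf[i*count], count, MPI_DOUBLE, i, 0, comm);
        }
    }
    else
    {
        MPI_Recv(recvbuf, count, MPI_DOUBLE, root, 0, comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;
}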

From your code examples it seems that you have misunderstood how MPI works in general:


- You do not need to perform a receive or send to "establish a connection". MPI handles that implicitly.
- In your Scatter / Gather example you first distribute data with MPI_Scatter, then broadcast more data with MPI_Bcast, and then immediately collect data again with MPI_Gather, without doing any computation in between (see the sketch after this list).
- You do not need any explicit synchronization with MPI_Barrier in your examples.
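
To make the second point concrete, the core of your scatter/gather program could be restructured roughly like the sketch below. This is a sketch only: it reuses A, B, C, A_buf, C_buf, nrows and N from your program, assumes M is divisible by size, and leaves out error handling and the final printf on MASTER.

    //no "connection" messages and no explicit barriers are needed
    t_start = MPI_Wtime();

    //distribute one block of nrows rows of A to every rank
    MPI_Scatter(A, nrows*N, MPI_DOUBLE, A_buf, nrows*N, MPI_DOUBLE,
                MASTER, MPI_COMM_WORLD);

    //every rank needs the full vector B
    MPI_Bcast(B, N, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

    //local computation: C_buf = A_buf * B
    for(uint32_t i = 0;i < nrows;i++)
    {
        C_buf[i] = 0.0;
        for(uint32_t j = 0;j < N;j++)
            C_buf[i] += A_buf[i*N + j]*B[j];
    }

    //collect the partial results into C on MASTER
    MPI_Gather(C_buf, nrows, MPI_DOUBLE, C, nrows, MPI_DOUBLE,
               MASTER, MPI_COMM_WORLD);

    t_end = MPI_Wtime();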


Once your program is structured correctly, you will see a large increase in performance. Besides these issues with your code, there is also an issue with MPI itself: unfortunately, the MPI standard does not give any performance guarantees, but leaves it to the actual implementation to do the best it can. Depending on the implementation you use, MPI_Scatter / Gather may be optimized for large messages and/or large numbers of processes - which naturally comes with some overhead.

You could try a different MPI implementation (for open source, see MVAPICH, for example) to see whether the one you are currently using simply does a poor job. However, investigating that only makes sense once your code is correct.

Also, it is better not to use the MPI_ prefix for your own identifiers. It makes your code hard to read, and, if I am not mistaken, the MPI standard reserves that prefix for MPI library functions.
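
For example (the names below are my own suggestions), the derived types from your program could simply be declared and built as:

    //derived datatypes named without the reserved MPI_ prefix
    MPI_Datatype block_type, vector_type, result_type;

    MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &block_type);
    MPI_Type_commit(&block_type);
    MPI_Type_contiguous(N, MPI_DOUBLE, &vector_type);
    MPI_Type_commit(&vector_type);
    MPI_Type_contiguous(nrows, MPI_DOUBLE, &result_type);
    MPI_Type_commit(&result_type);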
