当我在程序中多次调用MPI_Send或MPI_Recv时,可执行文件会在工作节点和根进程(rank 0)上挂起。也就是说,执行到第二个MPI_Send或MPI_Recv时,通信就被阻塞了;与此同时,该程序在机器上的CPU占用率达到100%。
同样的代码在Windows 7(64位)上使用Open MPI 1.6.3(64位)可以成功运行;但在Linux上,即CentOS 6.3 x86_64、同样使用Open MPI 1.6.3(64位),却无法正常运行。我哪里做错了?
代码如下:

#include <cstdio>

#include <mpi.h>

int main(int argc, char** argv) {
MPI::Init();
int rank = MPI::COMM_WORLD.Get_rank();
int size = MPI::COMM_WORLD.Get_size();
char name[256] = { };
int len = 0;
MPI::Get_processor_name(name, len);

printf("Hi I'm %s:%d\n", name, rank);

if (rank == 0)
{
    while (size >= 1)
    {
        int val, stat = 1;
        MPI::Status status;
        MPI::COMM_WORLD.Recv(&val, 1, MPI::INT, 1, 0, status);
        int source = status.Get_source();
        printf("%s:%d received %d from %d\n", name, rank, val, source);

        MPI::COMM_WORLD.Send(&stat, 1, MPI::INT, 1, 2);
        printf("%s:%d sent status %d\n", name, rank, stat);

        size--;
    }
} else
{
    int val = rank + 10;
    int stat = 0;
    printf("%s:%d sending %d...\n", name, rank, val);
    MPI::COMM_WORLD.Send(&val, 1, MPI::INT, 0, 0);
    printf("%s:%d sent %d\n", name, rank, val);

    MPI::Status status;
    MPI::COMM_WORLD.Recv(&stat, 1, MPI::INT, 0, 2, status);
    int source = status.Get_source();
    printf("%s:%d received status %d from %d\n", name, rank, stat, source);
}

size = MPI::COMM_WORLD.Get_size();
if (rank == 0)
{
    while (size >= 1)
    {
        int val, stat = 1;
        MPI::Status status;

        MPI::COMM_WORLD.Recv(&val, 1, MPI::INT, 1, 1, status);
        int source = status.Get_source();
        printf("%s:0 received %d from %d\n", name, val, source);

        size--;
    }

    printf("all workers checked in!\n");
}
else
{
    int val = rank + 10 + 5;
    printf("%s:%d sending %d...\n", name, rank, val);
    MPI::COMM_WORLD.Send(&val, 1, MPI::INT, 0, 1);
    printf("%s:%d sent %d\n", name, rank, val);
}
MPI::Finalize();

return 0;

}
嗨,Hristo,我已经按照你说的修改了源代码,修改后的代码重新发布如下:
    #include <mpi.h>
#include <stdio.h>

/**
 * Master/worker three-step exchange using the MPI C API.
 *
 * Each worker (rank != 0):
 *   1. sends its value to rank 0 with tag 0,
 *   2. waits for a status int from rank 0 with tag 1,
 *   3. sends a second value to rank 0 with tag 2.
 *
 * Rank 0 serves one worker per loop iteration: it takes the first tag-0
 * message from any rank, then carries out steps 2 and 3 with that same rank,
 * so the three messages of one iteration always belong to one worker.
 *
 * @param argc argument count, forwarded to MPI_Init.
 * @param argv argument vector, forwarded to MPI_Init.
 * @return 0 after MPI has been finalized.
 */
int main(int argc, char** argv)
{
    int iNumProcess = 0, iRank = 0, iNameLen = 0;
    char szNodeName[MPI_MAX_PROCESSOR_NAME] = {};
    MPI_Status stMPIStatus;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &iNumProcess);
    MPI_Comm_rank(MPI_COMM_WORLD, &iRank);
    MPI_Get_processor_name(szNodeName, &iNameLen);

    printf("Hi I'm %s:%d\n", szNodeName, iRank);

    if (iRank == 0)
    {
        // Rank 0 does not message itself: one round per remaining process.
        const int iNumWorkers = iNumProcess - 1;
        for (int iServed = 0; iServed < iNumWorkers; ++iServed)
        {
            int iVal = 0, iStat = 1;

            // Step 1: first pending greeting, whoever it comes from.
            MPI_Recv(&iVal, 1, MPI_INT, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &stMPIStatus);
            const int iSource = stMPIStatus.MPI_SOURCE;
            printf("%s:%d received %d\n", szNodeName, iRank, iVal);

            // Step 2: reply to the rank that actually sent the greeting.
            MPI_Send(&iStat, 1, MPI_INT, iSource, 1, MPI_COMM_WORLD);
            printf("%s:%d sent Status %d\n", szNodeName, iRank, iStat);

            // Step 3: the follow-up must come from that same rank.
            MPI_Recv(&iVal, 1, MPI_INT, iSource, 2, MPI_COMM_WORLD, &stMPIStatus);
            printf("%s:%d received %d\n", szNodeName, iRank, iVal);
        }

        printf("all workers checked in!\n");
    }
    else
    {
        int iVal = iRank + 10;
        int iStat = 0;
        printf("%s:%d sending %d...\n", szNodeName, iRank, iVal);
        MPI_Send(&iVal, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        printf("%s:%d sent %d\n", szNodeName, iRank, iVal);

        MPI_Recv(&iStat, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &stMPIStatus);
        // Report the status that was received, not the value sent earlier.
        printf("%s:%d received status %d\n", szNodeName, iRank, iStat);

        iVal = 20;
        printf("%s:%d sending %d...\n", szNodeName, iRank, iVal);
        MPI_Send(&iVal, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);
        printf("%s:%d sent %d\n", szNodeName, iRank, iVal);
    }

    MPI_Finalize();

    return 0;
}

我得到的输出如下。也就是说,在发送/接收之后,根进程无限期地等待,而工作节点则一直占用100%的CPU。输出如下:
Hi I'm N1433:1
N1433:1 sending 11...
Hi I'm N1425:0
N1425:0 received 11
N1425:0 sent Status 1
N1433:1 sent 11
N1433:1 received status 11
N1433:1 sending 20...

这里的N1433和N1425是机器名。请帮忙

最佳答案

主进程(master,即rank 0)的代码有误:它总是只从同一个进程——rank 1——接收消息,也只向它发送消息,因此程序只有在以mpiexec -np 2 ...方式运行时才能正常工作。您真正想做的,应该是在接收时使用MPI_ANY_SOURCE作为源rank,然后在发送操作中把实际的发送方rank(即status.Get_source()的返回值)作为目标。另外,也不应该使用while (size >= 1),因为rank 0不会与自己通信,通信的次数应当比size少一:

// Master branch (excerpt): one receive/reply round per worker, answering
// whichever rank the received message came from.
if (rank == 0)
{
    while (size > 1)
    //     ^^^^^^^^
    {
        int val, stat = 1;
        MPI::Status status;
        MPI::COMM_WORLD.Recv(&val, 1, MPI::INT, MPI_ANY_SOURCE, 0, status);
        // Use wildcard source here ------------^^^^^^^^^^^^^^
        int source = status.Get_source();
        printf("%s:%d received %d from %d\n", name, rank, val, source);

        MPI::COMM_WORLD.Send(&stat, 1, MPI::INT, source, 2);
        // Send back to the same process --------^^^^^^
        printf("%s:%d sent status %d\n", name, rank, stat);

        size--;
    }
} else

在工作进程(worker)中这样做是没有意义的:
// Worker side (excerpt): receives one int with tag 2 from rank 0, then reads
// the source back out of the status object and prints it.
MPI::Status status;
MPI::COMM_WORLD.Recv(&stat, 1, MPI::INT, 0, 2, status);
// Source rank is fixed here ------------^
int source = status.Get_source();
printf("%s:%d received status %d from %d\n", name, rank, stat, source);

您已经在接收操作中把rank 0指定为源,因此它只可能收到来自rank 0的消息。status.Get_source()不可能返回0以外的任何值;唯一的例外是发生通信错误,而在那种情况下MPI::COMM_WORLD.Recv()会抛出异常。
代码中的第二个循环也是如此。
顺便说一下,您使用的是现已被正式弃用的MPI C++绑定。它们在MPI-2.2中被标记为弃用,而最新版本的标准(MPI-3.0)已将其完全删除,因为MPI论坛不再对其提供支持。您应该改用C绑定,或者使用第三方C++接口,例如Boost.MPI。

关于linux - OpenMPI多个MPI_Send和MPI_recv无法正常工作,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/13409949/

10-16 09:11