C / Open MPI: segmentation fault from a call to MPI_Finalize(). The segfault does not always occur, especially with low numbers of processes.
I am writing a simple program to learn how to define an MPI_Datatype and use it together with MPI_Gatherv. I wanted to make sure I could gather variable-length, dynamically allocated arrays of structured data onto one process, and this seems to work fine right up until my call to MPI_Finalize(). I have confirmed, using print statements and the Eclipse PTP debugger (gdb-mi backend), that this is where the problem first manifests itself. My main question is: how can I get rid of the segmentation fault?
The segfault does not occur every time I run the code. For instance, it hasn't happened for 2 or 3 processes, but tends to happen regularly when I run with about 4 or more processes.
Also, when I run this code with valgrind, the segmentation fault does not occur. However, I do get error messages from valgrind, though the output is difficult for me to understand when I use MPI functions, even with a large number of targeted suppressions. I am also concerned that if I use more suppressions, I will silence a useful error message.
I compile the normal code using these flags, so I am using the C99 standard in both cases: -ansi -pedantic -Wall -O2 -march=barcelona -fomit-frame-pointer -std=c99 and the debug build with: -ansi -pedantic -std=c99 -Wall -g
Both builds use mpicc with the GCC 4.4 compiler, and run on a cluster using Red Hat Linux with Open MPI v1.4.5. Please let me know if I have left out any other important bits of information. Here is the code, and thanks in advance:
//#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
//#include <limits.h>
#include "mpi.h"
#define FULL_PROGRAM 1
struct CD{
int int_ID;
double dbl_ID;
};
int main(int argc, char *argv[]) {
int numprocs, myid, ERRORCODE;
#if FULL_PROGRAM
struct CD *myData=NULL; //Each process contributes an array of data, comprised of 'struct CD' elements
struct CD *allData=NULL; //root will dynamically allocate this array to store all the data from rest of the processes
int *p_lens=NULL, *p_disp=NULL; //p_lens stores the number of elements in each process' array, p_disp stores the displacements in bytes
int MPI_CD_size; //stores the size of the MPI_Datatype that is defined to allow communication operations using 'struct CD' elements
int mylen, total_len=0; //mylen should be the length of each process' array
//MAXlen is the maximum allowable array length
//total_len will be the sum of mylen across all processes
// ============ variables related to defining new MPI_Datatype at runtime ====================================================
struct CD sampleCD = {.int_ID=0, .dbl_ID=0.0};
int blocklengths[2]; //this describes how many blocks of identical data types will be in the new MPI_Datatype
MPI_Aint offsets[2]; //this stores the offsets, in bytes, of the blocks from the 'start' of the datatype
MPI_Datatype block_types[2]; //this stores which built-in data types the blocks are comprised of
MPI_Datatype myMPI_CD; //just the name of the new datatype
MPI_Aint myStruct_address, int_ID_address, dbl_ID_address, int_offset, dbl_offset; //useful place holders for filling the arrays above
// ===========================================================================================================================
#endif
// =================== Initializing MPI functionality ============================
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
// ===============================================================================
#if FULL_PROGRAM
// ================== This part actually formally defines the MPI datatype ===============================================
MPI_Get_address(&sampleCD, &myStruct_address); //starting point of struct CD
MPI_Get_address(&sampleCD.int_ID, &int_ID_address); //starting point of first entry in CD
MPI_Get_address(&sampleCD.dbl_ID, &dbl_ID_address); //starting point of second entry in CD
int_offset = int_ID_address - myStruct_address; //offset of the first entry from the start of the struct
dbl_offset = dbl_ID_address - myStruct_address; //offset of the second entry from the start of the struct
blocklengths[0]=1; blocklengths[1]=1; //array telling it how many blocks of identical data types there are, and the number of entries in each block
//This says there are two blocks of identical data-types, and both blocks have only one variable in them
offsets[0]=int_offset; offsets[1]=dbl_offset; //the first block starts at int_offset, the second block starts at dbl_offset (from 'myData_address')
block_types[0]=MPI_INT; block_types[1]=MPI_DOUBLE; //the first block contains MPI_INT, the second contains MPI_DOUBLE
MPI_Type_create_struct(2, blocklengths, offsets, block_types, &myMPI_CD); //this uses the above arrays to define the MPI_Datatype...an MPI-2 function
MPI_Type_commit(&myMPI_CD); //this is the final step to defining/reserving the data type
// ========================================================================================================================
mylen = myid*2; //each process is told how long its array should be...I used to define that randomly but that just makes things messier
p_lens = (int*) calloc((size_t)numprocs, sizeof(int)); //allocate memory for the number of elements (p_lens) and offsets from the start of the recv buffer (p_disp)
p_disp = (int*) calloc((size_t)numprocs, sizeof(int));
myData = (struct CD*) calloc((size_t)mylen, sizeof(struct CD)); //allocate memory for each process' array
//if mylen==0, 'a unique pointer to the heap is returned'
if(!p_lens) { MPI_Abort(MPI_COMM_WORLD, 1); exit(EXIT_FAILURE); }
if(!p_disp) { MPI_Abort(MPI_COMM_WORLD, 1); exit(EXIT_FAILURE); }
if(!myData) { MPI_Abort(MPI_COMM_WORLD, 1); exit(EXIT_FAILURE); }
for(double temp=0.0;temp<1e6;++temp) temp += exp(-10.0);
MPI_Barrier(MPI_COMM_WORLD); //purely for keeping the output organized by giving a time delay
for (int k=0; k<numprocs; ++k) {
if(myid==k) {
//printf("\t ID %d has %d entries: { ", myid, mylen);
for(int i=0; i<mylen; ++i) {
myData[i]= (struct CD) {.int_ID=myid*(i+1), .dbl_ID=myid*(i+1)}; //fills data elements with simple pattern
//printf("%d: (%d,%lg) ", i, myData[i].int_ID, myData[i].dbl_ID);
}
//printf("}\n");
}
}
for(double temp=0.0;temp<1e6;++temp) temp += exp(-10.0);
MPI_Barrier(MPI_COMM_WORLD); //purely for keeping the output organized by giving a time delay
MPI_Gather(&mylen, 1, MPI_INT, p_lens, 1, MPI_INT, 0, MPI_COMM_WORLD); //Each process sends root the length of the vector they'll be sending
#if 1
MPI_Type_size(myMPI_CD, &MPI_CD_size); //gets the size of the MPI_Datatype for p_disp
#else
MPI_CD_size = sizeof(struct CD); //using this doesn't change things too much...
#endif
for(int j=0;j<numprocs;++j) {
total_len += p_lens[j];
if (j==0) { p_disp[j] = 0; }
else { p_disp[j] = p_disp[j-1] + p_lens[j]*MPI_CD_size; }
}
if (myid==0) {
allData = (struct CD*) calloc((size_t)total_len, sizeof(struct CD)); //allocate array
if(!allData) { MPI_Abort(MPI_COMM_WORLD, 1); exit(EXIT_FAILURE); }
}
MPI_Gatherv(myData, mylen, myMPI_CD, allData, p_lens, p_disp, myMPI_CD, 0, MPI_COMM_WORLD); //each array sends root process their array, which is stored in 'allData'
// ============================== OUTPUT CONFIRMING THAT COMMUNICATIONS WERE SUCCESSFUL=========================================
if(myid==0) {
for(int i=0;i<numprocs;++i) {
printf("\n\tElements from %d on MASTER are: { ",i);
for(int k=0;k<p_lens[i];++k) { printf("%d: (%d,%lg) ", k, (allData+p_disp[i]+k)->int_ID, (allData+p_disp[i]+k)->dbl_ID); }
if(p_lens[i]==0) printf("NOTHING ");
printf("}\n");
}
printf("\n"); //each data element should appear as two identical numbers, counting upward by the process ID
}
// ==========================================================================================================
if (p_lens) { free(p_lens); p_lens=NULL; } //adding this in didn't get rid of the MPI_Finalize seg-fault
if (p_disp) { free(p_disp); p_disp=NULL; }
if (myData) { free(myData); myData=NULL; }
if (allData){ free(allData); allData=NULL; } //the if statement ensures that processes not allocating memory for this pointer don't free anything
for(double temp=0.0;temp<1e6;++temp) temp += exp(-10.0);
MPI_Barrier(MPI_COMM_WORLD); //purely for keeping the output organized by giving a time delay
printf("ID %d: I have reached the end...before MPI_Type_free!\n", myid);
// ====================== CLEAN UP ================================================================================
ERRORCODE = MPI_Type_free(&myMPI_CD); //this frees the data type...not always necessary, but a good practice
for(double temp=0.0;temp<1e6;++temp) temp += exp(-10.0);
MPI_Barrier(MPI_COMM_WORLD); //purely for keeping the output organized by giving a time delay
if(ERRORCODE!=MPI_SUCCESS) { printf("ID %d...MPI_Type_free was not successful\n", myid); MPI_Abort(MPI_COMM_WORLD, 911); exit(EXIT_FAILURE); }
else { printf("ID %d...MPI_Type_free was successful, entering MPI_Finalize...\n", myid); }
#endif
ERRORCODE=MPI_Finalize();
for(double temp=0.0;temp<1e7;++temp) temp += exp(-10.0); //NO MPI_Barrier AFTER MPI_Finalize!
if(ERRORCODE!=MPI_SUCCESS) { printf("ID %d...MPI_Finalize was not successful\n", myid); MPI_Abort(MPI_COMM_WORLD, 911); exit(EXIT_FAILURE); }
else { printf("ID %d...MPI_Finalize was successful\n", myid); }
return EXIT_SUCCESS;
}
The outer loop on k is bogus, but is not technically wrong -- it's just useless.
The real issue is that your displacements to MPI_GATHERV are wrong. If you run it through valgrind, you'll see something like this:
==28749== Invalid write of size 2
==28749== at 0x4A086F4: memcpy (mc_replace_strmem.c:838)
==28749== by 0x4C69614: unpack_predefined_data (datatype_unpack.h:41)
==28749== by 0x4C6B336: ompi_generic_simple_unpack (datatype_unpack.c:418)
==28749== by 0x4C7288F: ompi_convertor_unpack (convertor.c:314)
==28749== by 0x8B295C7: mca_pml_ob1_recv_frag_callback_match (pml_ob1_recvfrag.c:216)
==28749== by 0x935723C: mca_btl_sm_component_progress (btl_sm_component.c:426)
==28749== by 0x51D4F79: opal_progress (opal_progress.c:207)
==28749== by 0x8B225CA: opal_condition_wait (condition.h:99)
==28749== by 0x8B22718: ompi_request_wait_completion (request.h:375)
==28749== by 0x8B231E1: mca_pml_ob1_recv (pml_ob1_irecv.c:104)
==28749== by 0x955E7A7: mca_coll_basic_gatherv_intra (coll_basic_gatherv.c:85)
==28749== by 0x9F7CBFA: mca_coll_sync_gatherv (coll_sync_gatherv.c:46)
==28749== Address 0x7b1d630 is not stack'd, malloc'd or (recently) free'd
Indicating that MPI_GATHERV was given bad information somehow.
(There are other valgrind warnings that come from libltdl inside Open MPI, which are unfortunately unavoidable -- it's a bug in libltdl -- and another from PLPA, which is also unfortunately unavoidable because it's doing that intentionally [for reasons that aren't interesting to discuss here].)
Looking at your displacements computation, I see
total_len += p_lens[j];
if (j == 0) {
p_disp[j] = 0;
} else {
p_disp[j] = p_disp[j - 1] + p_lens[j] * MPI_CD_size;
}
But MPI gather displacements are in units of datatypes, not bytes. So it really should be:
p_disp[j] = total_len;
total_len += p_lens[j];
Making this change made the MPI_GATHERV valgrind warning go away for me.