我使用的是CFD代码,它基于Opnempi。当我只使用一个内核运行它时,就没有问题了。但是当我使用更多的颜色运行它时,我会出现这样的错误:
[桌面-7D2F3AN:03839]处理接收信号
[桌面-7D2F3AN:03839]信号:分段故障(11)
[桌面-7D2F3AN:03839]信号代码:(128)
[桌面-7D2F3AN:03839]地址失败:(无)
[桌面-7D2F3AN:03839][0]
/lib/x86 64 linux gnu/libpthread.so.0(+0x12890)[0x7f405efe2890]
[桌面-7D2F3AN:03839][1]
/lib/x86 64 linux gnu/libc.so.6(+0x18ec3c)[0x7f405ed5ec3c]
[桌面-7D2F3AN:03839][2]
/usr/local/lib/libmpi.so.0(泵组+0x196)[0x7f405f462176]
[桌面-7D2F3AN:03839][3]
/usr/local/lib/openmpi/mca pml ob1.so(+0x10851)[0x7f405b780851]
[桌面-7D2F3AN:03839][4]
/usr/local/lib/openmpi/mca pml ob1.so(+0x53ee)[0x7f405b7753ee]
[桌面-7D2F3AN:03839][5]
/usr/local/lib/openmpi/mca colu已优化。so(+0xb2ff)[0x7f4059e2b2ff]
[桌面-7D2F3AN:03839][6]
/usr/local/lib/openmpi/mca colu已优化。so(+0xbaf1)[0x7f4059e2baf1]
[桌面-7D2F3AN:03839][7]
/usr/local/lib/openmpi/mca colu已优化。so(+0x20ab)[0x7f4059e220ab]
[桌面-7D2F3AN:03839][8]
/usr/local/lib/openmpi/mca colu sync.so(+0x1377)[0x7f405a041377]
[桌面-7D2F3AN:03839][9]
/usr/local/lib/lib MPI.so.0(MPI-u Bcast+0x11d)[0x7f405f46974d]
[桌面-7D2F3AN:03839][10]。/cgles(com远程参数+0x4b)
[0x7f4060236972]
[桌面-7D2F3AN:03839][11]。/cgles(主屏幕+0x377)[0x7f4060233377]
[桌面-7D2F3AN:03839][12]
/lib/x86 64 linux gnu/libc.so.6(libc start main+0xe7)
[0x7f405ebf1b97]
[桌面-7D2F3AN:03839][13]。/cgles(开始+0x2a)[0x7f406022254a]
[桌面-7D2F3AN:03839]错误消息结束
mpirun注意到节点上PID为3839的进程列为0
桌面-7D2F3AN在信号11(分段故障)时退出。
我不知道为什么。代码太长。主要部分如下:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>

#include "param.h"
#include "block.h"
#include "q.h"
#include "map.h"
#include "comms.h"
#include "util.h"
#include "dat.h"
#include "timing.h"
#include "sgm.h"
#include "error.h"
#include "init.h"
#include "mom.h"
#include "inst.h"
#include "pcg.h"
#include "bicgstab.h"
#include "ibm.h"
#include "hist.h"
#include "pcg_ext.h"
#include "dibm.h"
/* using BLAS library for better performance */
#ifdef BLAS
#include <essl.h>
#endif

/* local prototypes */
void cmdopts(int *, char ***);
void mybasename(char *, char *);
void glob_output(void);
void glob_tstep(void);
void getstep(char *fname, double *Prtime, int *Pitime);
void putstep(char *fname, double *Prtime, int *Pitime);


/* external prototypes */
void slice_probe(void);
void print_solution(char* , int);
void print_vert(char*, int);
void print_wave(char*, int);
void print_moments(char*, int);

extern void InitializeIBPoint();
/*void glob_snapshot(char*);*/
extern void dibm_interploate_shear_velocity(void);


/* global, used by mgp.c, pcg.c and map.c */
/* global, used by mgp.c, pcg.c and map.c */
int mypid = -1;     /* my process id number */
int npid = 1;       /* total process number */
int halt_flg = 0;       /* halt t-steps, 1=stop, 0=cont */
double wck_t_start, wck_t_end;  /* wall clock timing */
double cpu_t_start, cpu_t_end;  /* cpu clock timing */
int time_limit = 999999999; /* wall clock limit (seconds) */
double Y_pro_mu=0; //Mu in Y-code;
double Y_pro_Laimuda=0;//Laimuda in Y-code
//char Y_y3dfile[256];  //*.y3d filename in Y-code
double Y_adheforcefactor=1.0;//force factor applied to ib points on Y solid, used in Yw.c-F2y()
double Y_shear_force_factor =1.0;
double Y_shear_vel = 0.0;
int Y_init_entity_num; //cell number at initilization of Y
//#Additional Misc by XuDong
int stat_y_save_dn; //  1000         #y solid saving interval
int stat_plane_save_dn;//   1000    #plane saving interval
int stat_glob_fluid_save_dn;//  10000000  #global fluid saving interval
int stat_restart_save_dn; //  1000   #Restart file saving interval
int stat_gtk_draw_dn; //  1          #gtkdraw interval when GTK defined
int stat_gtk_save_dn; //  100        #gtkdraw picture saving interval when GTK defined
int stat_y_typic_save_dn;
double case_parameter_1;//undefined reserved case parameter
double case_parameter_2;
double case_parameter_3;
double case_parameter_4;

/* main: driver routine */
int main(int argc, char **argv)
{
  int bid, ibid;
  /* switches */
  int add_seed_flg = 0, probe_flg = 0, putxd_2x_flg = 0, reset_stats_flg = 0;

  /* Start up any other nodes. The cmd line options relating to the
   * parallel interface are parsed and pruned here. */
  mypid = com_start(&argc, &argv, 1);
  npid  = com_nnodes();
  wck_t_start = WCKseconds();
  cpu_t_start = CPUseconds();

  /* Parse the remaining options [first node only] */
  if (mypid == 0)
    cmdopts(&argc, &argv);

  /* Buffering mode for stdout; needed on T3D */
  fflush(stdout);
  setvbuf(stdout, (char *) 0, _IOLBF, 0); /* line buffering */

  /* Read configuration data on ONE node */
  if (mypid == 0)
  {
    /* get the map data and mesh points */
    map_getfn(fnames.map, fnames.msh);
    map_print(BID_NULL);
    if (map_checkt() || map_checku())
      exit(2);

    /* debugging */
    /* map_putfn("temp.map"); */
    /* msh_putfn("temp.msh"); */

    /* read the local data file and build lookup table */
    dat_read(fnames.dat);
    dat_print();

    /* Modify parameters using info from *.dat file */
    dat_getdouble(&viscm, "viscm");
    dat_getdouble(&fbody_x, "fbody_x");
    dat_getdouble(&fbody_y, "fbody_y");
    dat_getdouble(&fbody_z, "fbody_z");
    dat_getint(&time_limit, "time_limit");

    //#Additional Misc by XuDong
    dat_getdouble(&crk_beta,"crk_beta");// #=0--explicit, ==1 fully implicit ==1/2 crank-nicolson
    dat_getdouble(&ibm_relax,"ibm_relax");// #=0.5---normal for direct forcing

    dat_getint(&stat_y_save_dn,"stat_y_save_dn");//  1000         #y solid saving interval
    dat_getint(&stat_plane_save_dn,"stat_plane_save_dn");//   1000    #plane saving interval
    dat_getint(&stat_glob_fluid_save_dn,"stat_glob_fluid_save_dn");//  10000000  #global fluid saving interval
    dat_getint(&stat_restart_save_dn,"stat_restart_save_dn");//  1000   #Restart file saving interval
    dat_getint(&stat_gtk_draw_dn,"stat_gtk_draw_dn");//  1          #gtkdraw interval when GTK defined
    dat_getint(&stat_gtk_save_dn,"stat_gtk_save_dn");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getint(&stat_y_typic_save_dn,"stat_y_typic_save_dn");//  100        #gtkdraw picture saving interval when GTK defined


    dat_getdouble(&Y_adheforcefactor,"Y_adheforcefactor");//  1          #gtkdraw interval when GTK defined
    dat_getdouble(&Y_shear_force_factor,"Y_shear_force_factor");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getdouble(&Y_shear_vel,"Y_shear_vel");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getint(&Y_init_entity_num,"Y_init_entity_num");//  //cell number at initilization of Y

    dat_getdouble(&case_parameter_1,"case_parameter_1"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_2,"case_parameter_2"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_3,"case_parameter_3"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_4,"case_parameter_4"); //undefined reserved case parameter
    /* set current time step parameters */
    dat_getint(&ntime, "ntime");
    dat_getdouble(&dt, "dt");
    if (!flg_init)
      getstep(fnames.xdi, &rtime_current, &itime);
    itime_first = itime;
    itime_last = itime_first + ntime;
    rtime_first = rtime_current;
    rtime_last = rtime_current + dt * ntime;

    /* print fluid parameters */
    prparam();
  }


    /* Distibute to other nodes */
    com_dist_param();   /* distribute file names */
    com_dist_map();   /* distribute map */
    com_dist_data();    /* distribute table */

我找到了发生错误的地方,它在com dist param()中。
        void com_dist_param(void)
        {
            /* This is all there is to it, curtesy of derived datatypes */
            MPI_Bcast(&fnames, 1, fnames_type, 0, MPI_COMM_WORLD);
            /* params starts with address of fbody_x */
            MPI_Bcast(&fbody_x, 1, params_type, 0, MPI_COMM_WORLD);
        }

当它进入MPI-u Bcast时出错。标题:
typedef struct
  {
    char xdi[MAXSTR];       /* xd in */
    char xdo[MAXSTR];       /* xd out */
    char dat[MAXSTR];       /* data */
    char map[MAXSTR];       /* domain map */
    char msh[MAXSTR];       /* mesh points */
    char y3d[MAXSTR];       /* Y3D input */
    char log[MAXSTR];       /* logging */
    char mom[MAXSTR];       /* stats moments */
  }
Fnames;

它将fnames类型定义为MPI数据类型:
MPI_Datatype
make_fnames_type(void)
{
    MPI_Datatype fnames_type;
    MPI_Datatype type[8];
    int blocklen[8];
    MPI_Aint disp[8];
    int base, i;
    /* define it */
    MPI_Address(&fnames.xdi, disp);
    MPI_Address(&fnames.xdo, disp + 1);
    MPI_Address(&fnames.dat, disp + 2);
    MPI_Address(&fnames.map, disp + 3);
    MPI_Address(&fnames.msh, disp + 4);
    MPI_Address(&fnames.y3d, disp + 5);
    MPI_Address(&fnames.log, disp + 6);
    MPI_Address(&fnames.mom, disp + 7);
    base = disp[0];
    for (i = 0; i < 8; i++) {
    type[i] = MPI_CHAR;
    blocklen[i] = MAXSTR;
    disp[i] -= base;
/*
   printf("disp[i] = %d blocklen[i] = %d\n", disp[i], blocklen[i]);
 */
    }

    /* create it */
    MPI_Type_struct(8, blocklen, disp, type, &fnames_type);
    MPI_Type_commit(&fnames_type);
    return (fnames_type);
}

有人能帮我吗?谢谢!

最佳答案

假设与大多数MPI用户一样,您正在64位计算机上运行代码:
MPI必须能够包含指针,因此大小为64位。在函数make_fnames_type()中,要从结构的各个字段的地址中减去结构的基址,需要将基址存储到一个int变量中,该变量名为base。但是这个int变量是32位的。您可能正在破坏指针值。
结构的基址是64位的数量。您应该只将它存储到另一个64位类型中,例如unsigned long int或uint64。语句base = disp[0];可能是一个变窄转换,并且在任何情况下都不可移植。您应该检查所涉及的sizeof值。
边注:在C++中,与C不同,你不需要预先声明所有函数变量。您可以根据需要声明变量。例如,在定义其值之前,没有理由声明int variablebase10个源代码行。所以你可以把int base = disp[0];都放在一个地方,这样缩小范围的事故就更容易被发现了。更妙的是,你可以直接写:auto base = disp[0];然后就不会发生变窄。
一种可能是这样改变循环:

    long int base = disp[0];
    for (i = 0; i < 8; i++) {
        type[i] = MPI_CHAR;
        blocklen[i] = MAXSTR;
        disp[i] -= base;
        /* changed %d into %ld below */
        printf("disp[i] = %ld  blocklen[i] = %d\n",
            (long int)disp[i], blocklen[i]);
    }

您还可以通过MPI_Type_Size()函数检查MPI自定义类型的内存大小是否符合您的期望。
一般注释:C/C++中的所有MPI函数都返回一个整数错误代码。如果你跳过检查这些错误代码,后果自负。很可能在调用MPI Bcast()之前,某个MPI函数返回了错误代码。

09-08 05:49