我使用的是CFD代码,它基于Opnempi。当我只使用一个内核运行它时,就没有问题了。但是当我使用更多的颜色运行它时,我会出现这样的错误:
[桌面-7D2F3AN:03839]处理接收信号
[桌面-7D2F3AN:03839]信号:分段故障(11)
[桌面-7D2F3AN:03839]信号代码:(128)
[桌面-7D2F3AN:03839]地址失败:(无)
[桌面-7D2F3AN:03839][0]
/lib/x86 64 linux gnu/libpthread.so.0(+0x12890)[0x7f405efe2890]
[桌面-7D2F3AN:03839][1]
/lib/x86 64 linux gnu/libc.so.6(+0x18ec3c)[0x7f405ed5ec3c]
[桌面-7D2F3AN:03839][2]
/usr/local/lib/libmpi.so.0(泵组+0x196)[0x7f405f462176]
[桌面-7D2F3AN:03839][3]
/usr/local/lib/openmpi/mca pml ob1.so(+0x10851)[0x7f405b780851]
[桌面-7D2F3AN:03839][4]
/usr/local/lib/openmpi/mca pml ob1.so(+0x53ee)[0x7f405b7753ee]
[桌面-7D2F3AN:03839][5]
/usr/local/lib/openmpi/mca colu已优化。so(+0xb2ff)[0x7f4059e2b2ff]
[桌面-7D2F3AN:03839][6]
/usr/local/lib/openmpi/mca colu已优化。so(+0xbaf1)[0x7f4059e2baf1]
[桌面-7D2F3AN:03839][7]
/usr/local/lib/openmpi/mca colu已优化。so(+0x20ab)[0x7f4059e220ab]
[桌面-7D2F3AN:03839][8]
/usr/local/lib/openmpi/mca colu sync.so(+0x1377)[0x7f405a041377]
[桌面-7D2F3AN:03839][9]
/usr/local/lib/lib MPI.so.0(MPI-u Bcast+0x11d)[0x7f405f46974d]
[桌面-7D2F3AN:03839][10]。/cgles(com远程参数+0x4b)
[0x7f4060236972]
[桌面-7D2F3AN:03839][11]。/cgles(主屏幕+0x377)[0x7f4060233377]
[桌面-7D2F3AN:03839][12]
/lib/x86 64 linux gnu/libc.so.6(libc start main+0xe7)
[0x7f405ebf1b97]
[桌面-7D2F3AN:03839][13]。/cgles(开始+0x2a)[0x7f406022254a]
[桌面-7D2F3AN:03839]错误消息结束
mpirun注意到节点上PID为3839的进程列为0
桌面-7D2F3AN在信号11(分段故障)时退出。
我不知道为什么。代码太长。主要部分如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include "param.h"
#include "block.h"
#include "q.h"
#include "map.h"
#include "comms.h"
#include "util.h"
#include "dat.h"
#include "timing.h"
#include "sgm.h"
#include "error.h"
#include "init.h"
#include "mom.h"
#include "inst.h"
#include "pcg.h"
#include "bicgstab.h"
#include "ibm.h"
#include "hist.h"
#include "pcg_ext.h"
#include "dibm.h"
/* using BLAS library for better performance */
#ifdef BLAS
#include <essl.h>
#endif
/* local prototypes */
void cmdopts(int *, char ***);
void mybasename(char *, char *);
void glob_output(void);
void glob_tstep(void);
void getstep(char *fname, double *Prtime, int *Pitime);
void putstep(char *fname, double *Prtime, int *Pitime);
/* external prototypes */
void slice_probe(void);
void print_solution(char* , int);
void print_vert(char*, int);
void print_wave(char*, int);
void print_moments(char*, int);
extern void InitializeIBPoint();
/*void glob_snapshot(char*);*/
extern void dibm_interploate_shear_velocity(void);
/* global, used by mgp.c, pcg.c and map.c */
/* global, used by mgp.c, pcg.c and map.c */
int mypid = -1; /* my process id number */
int npid = 1; /* total process number */
int halt_flg = 0; /* halt t-steps, 1=stop, 0=cont */
double wck_t_start, wck_t_end; /* wall clock timing */
double cpu_t_start, cpu_t_end; /* cpu clock timing */
int time_limit = 999999999; /* wall clock limit (seconds) */
double Y_pro_mu=0; //Mu in Y-code;
double Y_pro_Laimuda=0;//Laimuda in Y-code
//char Y_y3dfile[256]; //*.y3d filename in Y-code
double Y_adheforcefactor=1.0;//force factor applied to ib points on Y solid, used in Yw.c-F2y()
double Y_shear_force_factor =1.0;
double Y_shear_vel = 0.0;
int Y_init_entity_num; //cell number at initilization of Y
//#Additional Misc by XuDong
int stat_y_save_dn; // 1000 #y solid saving interval
int stat_plane_save_dn;// 1000 #plane saving interval
int stat_glob_fluid_save_dn;// 10000000 #global fluid saving interval
int stat_restart_save_dn; // 1000 #Restart file saving interval
int stat_gtk_draw_dn; // 1 #gtkdraw interval when GTK defined
int stat_gtk_save_dn; // 100 #gtkdraw picture saving interval when GTK defined
int stat_y_typic_save_dn;
double case_parameter_1;//undefined reserved case parameter
double case_parameter_2;
double case_parameter_3;
double case_parameter_4;
/* main: driver routine */
int main(int argc, char **argv)
{
int bid, ibid;
/* switches */
int add_seed_flg = 0, probe_flg = 0, putxd_2x_flg = 0, reset_stats_flg = 0;
/* Start up any other nodes. The cmd line options relating to the
* parallel interface are parsed and pruned here. */
mypid = com_start(&argc, &argv, 1);
npid = com_nnodes();
wck_t_start = WCKseconds();
cpu_t_start = CPUseconds();
/* Parse the remaining options [first node only] */
if (mypid == 0)
cmdopts(&argc, &argv);
/* Buffering mode for stdout; needed on T3D */
fflush(stdout);
setvbuf(stdout, (char *) 0, _IOLBF, 0); /* line buffering */
/* Read configuration data on ONE node */
if (mypid == 0)
{
/* get the map data and mesh points */
map_getfn(fnames.map, fnames.msh);
map_print(BID_NULL);
if (map_checkt() || map_checku())
exit(2);
/* debugging */
/* map_putfn("temp.map"); */
/* msh_putfn("temp.msh"); */
/* read the local data file and build lookup table */
dat_read(fnames.dat);
dat_print();
/* Modify parameters using info from *.dat file */
dat_getdouble(&viscm, "viscm");
dat_getdouble(&fbody_x, "fbody_x");
dat_getdouble(&fbody_y, "fbody_y");
dat_getdouble(&fbody_z, "fbody_z");
dat_getint(&time_limit, "time_limit");
//#Additional Misc by XuDong
dat_getdouble(&crk_beta,"crk_beta");// #=0--explicit, ==1 fully implicit ==1/2 crank-nicolson
dat_getdouble(&ibm_relax,"ibm_relax");// #=0.5---normal for direct forcing
dat_getint(&stat_y_save_dn,"stat_y_save_dn");// 1000 #y solid saving interval
dat_getint(&stat_plane_save_dn,"stat_plane_save_dn");// 1000 #plane saving interval
dat_getint(&stat_glob_fluid_save_dn,"stat_glob_fluid_save_dn");// 10000000 #global fluid saving interval
dat_getint(&stat_restart_save_dn,"stat_restart_save_dn");// 1000 #Restart file saving interval
dat_getint(&stat_gtk_draw_dn,"stat_gtk_draw_dn");// 1 #gtkdraw interval when GTK defined
dat_getint(&stat_gtk_save_dn,"stat_gtk_save_dn");// 100 #gtkdraw picture saving interval when GTK defined
dat_getint(&stat_y_typic_save_dn,"stat_y_typic_save_dn");// 100 #gtkdraw picture saving interval when GTK defined
dat_getdouble(&Y_adheforcefactor,"Y_adheforcefactor");// 1 #gtkdraw interval when GTK defined
dat_getdouble(&Y_shear_force_factor,"Y_shear_force_factor");// 100 #gtkdraw picture saving interval when GTK defined
dat_getdouble(&Y_shear_vel,"Y_shear_vel");// 100 #gtkdraw picture saving interval when GTK defined
dat_getint(&Y_init_entity_num,"Y_init_entity_num");// //cell number at initilization of Y
dat_getdouble(&case_parameter_1,"case_parameter_1"); //undefined reserved case parameter
dat_getdouble(&case_parameter_2,"case_parameter_2"); //undefined reserved case parameter
dat_getdouble(&case_parameter_3,"case_parameter_3"); //undefined reserved case parameter
dat_getdouble(&case_parameter_4,"case_parameter_4"); //undefined reserved case parameter
/* set current time step parameters */
dat_getint(&ntime, "ntime");
dat_getdouble(&dt, "dt");
if (!flg_init)
getstep(fnames.xdi, &rtime_current, &itime);
itime_first = itime;
itime_last = itime_first + ntime;
rtime_first = rtime_current;
rtime_last = rtime_current + dt * ntime;
/* print fluid parameters */
prparam();
}
/* Distibute to other nodes */
com_dist_param(); /* distribute file names */
com_dist_map(); /* distribute map */
com_dist_data(); /* distribute table */
我找到了发生错误的地方,它在com dist param()中。
void com_dist_param(void)
{
/* This is all there is to it, curtesy of derived datatypes */
MPI_Bcast(&fnames, 1, fnames_type, 0, MPI_COMM_WORLD);
/* params starts with address of fbody_x */
MPI_Bcast(&fbody_x, 1, params_type, 0, MPI_COMM_WORLD);
}
当它进入MPI-u Bcast时出错。标题:
typedef struct
{
char xdi[MAXSTR]; /* xd in */
char xdo[MAXSTR]; /* xd out */
char dat[MAXSTR]; /* data */
char map[MAXSTR]; /* domain map */
char msh[MAXSTR]; /* mesh points */
char y3d[MAXSTR]; /* Y3D input */
char log[MAXSTR]; /* logging */
char mom[MAXSTR]; /* stats moments */
}
Fnames;
它将fnames类型定义为MPI数据类型:
MPI_Datatype
make_fnames_type(void)
{
MPI_Datatype fnames_type;
MPI_Datatype type[8];
int blocklen[8];
MPI_Aint disp[8];
int base, i;
/* define it */
MPI_Address(&fnames.xdi, disp);
MPI_Address(&fnames.xdo, disp + 1);
MPI_Address(&fnames.dat, disp + 2);
MPI_Address(&fnames.map, disp + 3);
MPI_Address(&fnames.msh, disp + 4);
MPI_Address(&fnames.y3d, disp + 5);
MPI_Address(&fnames.log, disp + 6);
MPI_Address(&fnames.mom, disp + 7);
base = disp[0];
for (i = 0; i < 8; i++) {
type[i] = MPI_CHAR;
blocklen[i] = MAXSTR;
disp[i] -= base;
/*
printf("disp[i] = %d blocklen[i] = %d\n", disp[i], blocklen[i]);
*/
}
/* create it */
MPI_Type_struct(8, blocklen, disp, type, &fnames_type);
MPI_Type_commit(&fnames_type);
return (fnames_type);
}
有人能帮我吗?谢谢!
最佳答案
假设与大多数MPI用户一样,您正在64位计算机上运行代码:
MPI必须能够包含指针,因此大小为64位。在函数make_fnames_type()
中,要从结构的各个字段的地址中减去结构的基址,需要将基址存储到一个int
变量中,该变量名为base
。但是这个int变量是32位的。您可能正在破坏指针值。
结构的基址是64位的数量。您应该只将它存储到另一个64位类型中,例如unsigned long int或uint64。语句base = disp[0];
可能是一个变窄转换,并且在任何情况下都不可移植。您应该检查所涉及的sizeof值。
边注:在C++中,与C不同,你不需要预先声明所有函数变量。您可以根据需要声明变量。例如,在定义其值之前,没有理由声明int variablebase
10个源代码行。所以你可以把int base = disp[0];
都放在一个地方,这样缩小范围的事故就更容易被发现了。更妙的是,你可以直接写:auto base = disp[0];
然后就不会发生变窄。
一种可能是这样改变循环:
long int base = disp[0];
for (i = 0; i < 8; i++) {
type[i] = MPI_CHAR;
blocklen[i] = MAXSTR;
disp[i] -= base;
/* changed %d into %ld below */
printf("disp[i] = %ld blocklen[i] = %d\n",
(long int)disp[i], blocklen[i]);
}
您还可以通过
MPI_Type_Size()
函数检查MPI自定义类型的内存大小是否符合您的期望。一般注释:C/C++中的所有MPI函数都返回一个整数错误代码。如果你跳过检查这些错误代码,后果自负。很可能在调用MPI Bcast()之前,某个MPI函数返回了错误代码。