c++ - MPI_Neighbor_alltoallw()中正确的发送和接收顺序是什么？

我正在使用MPI_Neighbor_alltoallw()从相邻进程发送和接收数据。在我的应用程序中，我有幽灵单元格，应通过从相邻单元格复制单元格数据来对其进行更新。这是此过程的示意图:

蓝色单元格包含数据，紫色单元格是重影单元格，应使用相邻单元格数据的副本进行更新。

根据我从MPI standard中学到的知识，我写了我能做的最简单的例子。我还写了一个并行的vtk编写器，以便以后可视化数据。在以下代码中，我定义了用于发送和接收子数组的新MPI数据类型:

#include <mpi.h>

//VTK Library
#include <vtkXMLPStructuredGridWriter.h>
#include <vtkXMLStructuredGridWriter.h>
#include <vtkStructuredGrid.h>
#include <vtkSmartPointer.h>
#include <vtkFloatArray.h>
#include <vtkCellData.h>
#include <vtkProgrammableFilter.h>
#include <vtkInformation.h>
#include <vtkMPIController.h>

// To change the number of processes in each direction change nx, ny
const int nx{2};
const int ny{2};
const int Lx{100/nx};   // grid size without ghost cells
const int Ly{100/ny};
const int lx{Lx+4};     // grid size plus ghost cells
const int ly{Ly+4};

struct Args {
  vtkProgrammableFilter* pf;
  int local_extent[6];
};

// function to operate on the point attribute data
void execute (void* arg) {
  Args* args = reinterpret_cast<Args*>(arg);
  auto info = args->pf->GetOutputInformation(0);
  auto output_tmp = args->pf->GetOutput();  //WARNING this is a vtkDataObject*
  auto input_tmp  = args->pf->GetInput();   //WARNING this is a vtkDataObject*
  vtkStructuredGrid* output = dynamic_cast<vtkStructuredGrid*>(output_tmp);
  vtkStructuredGrid* input  = dynamic_cast<vtkStructuredGrid*>(input_tmp);
  output->ShallowCopy(input);
  output->SetExtent(args->local_extent);
}

void parallel_vtk_writer (double* cells, const char* name, int* coords, int* dim, vtkMPIController* contr) {
  int dims[2] = {lx+1, ly+1};
  int global_extent[6] = {0, dim[1]*lx, 0, dim[0]*ly, 0, 0};
  int local_extent[6] = {coords[1]*lx, coords[1]*lx + lx,
                        coords[0]*ly, coords[0]*ly + ly, 0, 0};

  int nranks = contr->GetNumberOfProcesses();
  int rank   = contr->GetLocalProcessId();

  auto points = vtkSmartPointer<vtkPoints>::New();
  points->Allocate((lx+1)*(ly+1));
  for (int j=0; j<ly+1; ++j)
    for (int i=0; i<lx+1; ++i)
      points->InsertPoint(i + j*(lx+1), i+coords[1]*lx, j+coords[0]*ly, 0);

  auto cell_value = vtkSmartPointer<vtkFloatArray>::New();
  cell_value->SetNumberOfComponents(1);
  cell_value->SetNumberOfTuples(lx*ly);
  cell_value->SetName ("cell value");

  for (int j=0; j<ly; ++j)
    for (int i=0; i<lx; ++i)
      cell_value->SetValue(i + j*lx, cells[i + j*lx]);

  auto pf = vtkSmartPointer<vtkProgrammableFilter>::New();

  Args args;
  args.pf = pf;
  for(int i=0; i<6; ++i) args.local_extent[i] = local_extent[i];

  pf->SetExecuteMethod(execute, &args);

  auto structuredGrid = vtkSmartPointer<vtkStructuredGrid>::New();
  structuredGrid->SetExtent(global_extent);
  pf->SetInputData(structuredGrid);
  structuredGrid->SetPoints(points);
  structuredGrid->GetCellData()->AddArray(cell_value);

  auto parallel_writer = vtkSmartPointer<vtkXMLPStructuredGridWriter>::New();
  parallel_writer->SetInputConnection(pf->GetOutputPort());
  parallel_writer->SetController(contr);
  parallel_writer->SetFileName(name);
  parallel_writer->SetNumberOfPieces(nranks);
  parallel_writer->SetStartPiece(rank);
  parallel_writer->SetEndPiece(rank);
  parallel_writer->SetDataModeToBinary();
  parallel_writer->Update();
  parallel_writer->Write();
}

int main (int argc, char *argv[]) {
  MPI_Init (&argc, &argv);

  //*** Create cartesian topology for grids ***//
  MPI_Comm comm_cart;
  int cartesian_rank;
  int dim[2] = {ny, nx};
  int coords[2];
  int periods[2] = {1, 1};
  MPI_Cart_create (MPI_COMM_WORLD, 2, dim, periods, 0, &comm_cart);
  MPI_Comm_rank (comm_cart, &cartesian_rank);
  MPI_Cart_coords (comm_cart, cartesian_rank, 2, coords);

  //******** Allocate memory and initialize cells ********//
  double* cells = new double[lx*ly];
  for (int j=0; j<ly; ++j)
    for (int i=0; i<lx; ++i)
      cells[i + j*lx] = 0;

  //********* Assign a value to cells *********//
  int cx = coords[1];
  int cy = coords[0];
  int l, m;
  // for loops starts with 2, because we don't initialize the ghost cells
  for (int j=2; j<ly-2; ++j)
    for (int i=2; i<lx-2; ++i) {
      l = i + cx*lx - 4*cx;
      m = j + cy*ly - 4*cy;
      if ((l-(nx*Lx+3)/2.)*(l-(nx*Lx+3)/2.) + (m-(ny*Ly+3)/2.)*(m-(ny*Ly+3)/2.) <= 400)
        cells[i + j*lx] = (i+j)*0.1;
      else
        cells[i + j*lx] = 0.1;
    }

  //********** Define new data types **********//
  const int dims {2};
  int arr_sizes[dims] = {ly, lx};
  int subar_sizes_x[dims] = {Ly, 2};
  int subar_sizes_y[dims] = {2, Lx};
  MPI_Datatype subar_right;
  MPI_Datatype subar_left;
  MPI_Datatype subar_top;
  MPI_Datatype subar_bottom;
  MPI_Datatype ghost_left;
  MPI_Datatype ghost_right;
  MPI_Datatype ghost_bottom;
  MPI_Datatype ghost_top;

  // send subarrays
  int subar_right_start[dims] = {2, Lx};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_x, subar_right_start, MPI_ORDER_C, MPI_DOUBLE, &subar_right);
  MPI_Type_commit (&subar_right);

  int subar_left_start[dims] = {2, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_x, subar_left_start, MPI_ORDER_C, MPI_DOUBLE, &subar_left);
  MPI_Type_commit (&subar_left);

  int subar_top_start[dims] = {Ly, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_y, subar_top_start, MPI_ORDER_C, MPI_DOUBLE, &subar_top);
  MPI_Type_commit (&subar_top);

  int subar_bottom_start[dims] = {2, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_y, subar_bottom_start, MPI_ORDER_C, MPI_DOUBLE, &subar_bottom);
  MPI_Type_commit (&subar_bottom);

  // recv subarrays
  int ghost_left_start[dims] = {2, 0};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_x, ghost_left_start, MPI_ORDER_C, MPI_DOUBLE, &ghost_left);
  MPI_Type_commit (&ghost_left);

  int ghost_right_start[dims] = {2, Lx+2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_x, ghost_right_start, MPI_ORDER_C, MPI_DOUBLE, &ghost_right);
  MPI_Type_commit (&ghost_right);

  int ghost_bottom_start[dims] = {0, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_y, ghost_bottom_start, MPI_ORDER_C, MPI_DOUBLE, &ghost_bottom);
  MPI_Type_commit (&ghost_bottom);

  int ghost_top_start[dims] = {Ly+2, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_y, ghost_top_start, MPI_ORDER_C, MPI_DOUBLE, &ghost_top);
  MPI_Type_commit (&ghost_top);

  //******** SENDING SUBARRAY ********//
  int sendcounts[4] = {1, 1, 1, 1};
  int recvcounts[4] = {1, 1, 1, 1};
  MPI_Aint sdispls[4] = {0, 0, 0, 0};
  MPI_Aint rdispls[4] = {0, 0, 0, 0};
  MPI_Datatype sendtypes[4] = {subar_bottom, subar_top, subar_left, subar_right};
  MPI_Datatype recvtypes[4] = {ghost_top, ghost_bottom, ghost_right, ghost_left};
  MPI_Neighbor_alltoallw (cells, sendcounts, sdispls, sendtypes, cells, recvcounts, rdispls, recvtypes, comm_cart);

  //******** Writing the cells using VTK ********//
  auto contr = vtkSmartPointer<vtkMPIController>::New();
  contr->Initialize(nullptr, nullptr, 1);
  parallel_vtk_writer (cells, "data/grid.pvts", coords, dim, contr);

  //******** Free data types ********//
  MPI_Type_free (&subar_right);
  MPI_Type_free (&subar_left);
  MPI_Type_free (&subar_top);
  MPI_Type_free (&subar_bottom);
  MPI_Type_free (&ghost_left);
  MPI_Type_free (&ghost_right);
  MPI_Type_free (&ghost_bottom);
  MPI_Type_free (&ghost_top);
  delete[] cells;

  MPI_Finalize ();
  return 0;
}

我正在使用以下CmakeList来使我的可执行文件:

cmake_minimum_required(VERSION 2.8)
PROJECT(TEST)
add_executable(TEST test_stackoverflow.cpp)
add_compile_options(-std=c++11)

find_package(VTK REQUIRED)
include(${VTK_USE_FILE})
target_link_libraries(TEST ${VTK_LIBRARIES})

find_package(MPI REQUIRED)
include_directories(${MPI_INCLUDE_PATH})
target_link_libraries(TEST ${MPI_LIBRARIES})

题

当我使用2 x 2进程网格([nx，ny] = [2,2])时，发送和接收正常工作。但是，当我使用4 x 4进程网格时，我看到错误的结果(错误的发送和接收)。 MPI_Neighbor_alltoallw()中sendtype和recvtypes的正确顺序是什么？

欢迎和赞赏任何改进代码的建议。

最佳答案

根据MPI 3.1标准第7.6章第314页

我凭经验发现，在2D笛卡尔通信器的情况下，顺序为:

置顶

底部

还剩下

对

(虽然我很难理解这里的逻辑，但我同时测试了Open MPI和MPICH)。

关于c++ - MPI_Neighbor_alltoallw()中正确的发送和接收顺序是什么？，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/50608184/