Skip to content

Commit 3a020a9

Browse files
committed
mpi: improved debug logging and better MPIError
Signed-off-by: Torbjörn Klatt <[email protected]>
1 parent dc752ce commit 3a020a9

File tree

2 files changed

+30
-27
lines changed

2 files changed

+30
-27
lines changed

include/pfasst/mpi_communicator.hpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,9 @@ namespace pfasst
1919
: public runtime_error
2020
{
2121
public:
22-
explicit MPIError(const string& msg="");
22+
MPIError(const string& msg="");
2323
virtual const char* what() const throw();
24+
static MPIError from_code(const int err_code);
2425
};
2526

2627

src/pfasst/mpi_communicator_impl.hpp

Lines changed: 28 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,14 @@ namespace pfasst
1616
return (string("mpi error: ") + string(runtime_error::what())).c_str();
1717
}
1818

19+
// Build an MPIError that describes the given MPI error code.
//
// Looks up the text for err_code with MPI_Error_string and returns an MPIError
// whose message contains that text followed by the numeric code.
//
// NOTE(review): the return value of MPI_Error_string is not checked; err_len is
// initialised to 0, so the text part is presumably empty when the lookup writes
// nothing -- confirm.
// NOTE(review): the what() implementation visible earlier in this diff already
// prepends its own prefix, so the prefix added here looks redundant in the
// final message -- confirm intent.
MPIError MPIError::from_code(const int err_code)
20+
{
21+
// Buffer sized with the MPI-provided upper bound for error strings.
char err_str[MPI_MAX_ERROR_STRING];
22+
int err_len = 0;
23+
MPI_Error_string(err_code, err_str, &err_len);
24+
// Only the first err_len characters of the buffer are copied into the message.
return MPIError("MPI Error: " + string(err_str, err_len) + " (code=" + to_string(err_code) + ")");
25+
}
26+
1927

2028
MPICommunicator::MPICommunicator()
2129
{}
@@ -75,8 +83,10 @@ namespace pfasst
7583

7684
// Record the convergence flag of the calling process.
//
// Writes a DEBUG message to the Controller logger and stores the flag in the
// converged container at the index returned by this->comm->rank().
//
// This excerpt is a rendered diff: the log statement after the minus marker is
// the old version, the statements after plus markers are its replacement.
void MPIStatus::set_converged(bool converged)
7785
{
78-
CLOG(DEBUG, "Controller") << "set converged to " << boolalpha << converged;
86+
CLOG(DEBUG, "Controller") << "set converged for rank " << this->comm->rank() << " to "
87+
<< "'" << boolalpha << converged << "'";
7988
// NOTE(review): at() presumably bounds-checks the rank index -- confirm the container type.
this->converged.at(this->comm->rank()) = converged;
89+
// Read-back sanity check of the store above; has no effect when assertions are disabled.
assert(this->converged.at(this->comm->rank()) == converged);
8090
}
8191

8292
bool MPIStatus::get_converged(int rank)
@@ -95,18 +105,11 @@ namespace pfasst
95105
if (mpi->size() == 1) { return; }
96106
if (mpi->rank() == mpi->size() - 1) { return; }
97107

98-
int iconverged = converged.at(mpi->rank()) ? 1 : 0;
99-
108+
int iconverged = converged.at(mpi->rank()) ? IStatus::CONVERGED : IStatus::NOT_CONVERGED;
100109
int dest_rank = (mpi->rank() + 1) % mpi->size();
101-
CLOG(DEBUG, "Controller") << "sending status " << iconverged
102-
<< " to " << dest_rank << " of communicator " << mpi->name();
103-
104-
int err = MPI_Send(&iconverged, sizeof(int), MPI_INT,
105-
dest_rank, 1, mpi->comm);
106110

107-
if (err != MPI_SUCCESS) {
108-
throw MPIError();
109-
}
111+
int err = MPI_Send(&iconverged, sizeof(int), MPI_INT, dest_rank, 1, mpi->comm);
112+
if (err != MPI_SUCCESS) { throw MPIError::from_code(err); }
110113
}
111114

112115
void MPIStatus::recv()
@@ -115,34 +118,33 @@ namespace pfasst
115118
if (mpi->size() == 1) { return; }
116119
if (mpi->rank() == 0) { return; }
117120

118-
if (get_converged(mpi->rank()-1)) {
119-
CLOG(DEBUG, "Controller") << "skipping status recv";
121+
if (get_converged(mpi->rank() - 1)) {
122+
CLOG(DEBUG, "Controller") << "skipping status recv as previous is stored as converged";
120123
return;
121124
}
122125

123126
MPI_Status stat;
124127
int iconverged;
125128
int src_rank = (mpi->rank() - 1) % mpi->size();
126-
int err = MPI_Recv(&iconverged, sizeof(iconverged), MPI_INT,
127-
src_rank, 1, mpi->comm, &stat);
128-
129-
if (err != MPI_SUCCESS) {
130-
throw MPIError();
131-
}
132-
133-
converged.at(mpi->rank()-1) = iconverged == 1 ? true : false;
129+
int err = MPI_Recv(&iconverged, sizeof(iconverged), MPI_INT, src_rank, 1, mpi->comm, &stat);
130+
if (err != MPI_SUCCESS) { throw MPIError::from_code(err); }
134131

135-
CLOG(DEBUG, "Controller") << "recieved status " << iconverged
136-
<< " from rank " << src_rank << " of communicator " << mpi->name();
132+
converged.at(mpi->rank() - 1) = (iconverged == IStatus::CONVERGED) ? true : false;
137133
}
138134
} // ::pfasst::mpi
139135
} // ::pfasst
140136

141137

142138
// Stream formatter for MPI_Status, declared through the MAKE_LOGGABLE macro
// (presumably the easylogging++ macro -- confirm).
//
// Writes a textual form of mpi_status to os and returns os.
// In the new version (plus-marked lines) a status whose tag is MPI_ANY_TAG,
// whose source is MPI_ANY_SOURCE and whose error is MPI_SUCCESS is printed as
// an empty status; any other status prints its source, tag and error fields.
// The minus-marked lines are the old version, which always printed the three
// fields.
MAKE_LOGGABLE(MPI_Status, mpi_status, os)
143139
{
144-
os << "MPI_Status(source=" << mpi_status.MPI_SOURCE << ", "
145-
<< "tag=" << mpi_status.MPI_TAG << ", "
146-
<< "error=" << mpi_status.MPI_ERROR << ")";
140+
// All three fields must match for the status to be reported as empty.
if ( mpi_status.MPI_TAG == MPI_ANY_TAG
141+
&& mpi_status.MPI_SOURCE == MPI_ANY_SOURCE
142+
&& mpi_status.MPI_ERROR == MPI_SUCCESS) {
143+
os << "MPI_Status(empty)";
144+
} else {
145+
// The fields are converted with to_string before being streamed.
os << "MPI_Status(source=" << to_string(mpi_status.MPI_SOURCE) << ", "
146+
<< "tag=" << to_string(mpi_status.MPI_TAG) << ", "
147+
<< "error=" << to_string(mpi_status.MPI_ERROR) << ")";
148+
}
147149
return os;
148150
}

0 commit comments

Comments
 (0)