Skip to content

Commit 5be8d61

Browse files
committed
Use error->one instead of error->all
Otherwise we can deadlock when only one MPI rank hits the error: error->all must be called by every rank, whereas error->one may be called by a single rank.
1 parent 1a6c5b5 commit 5be8d61

File tree

6 files changed

+59 -50 lines changed

src/KOKKOS/metatomic_system_kokkos.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,18 @@ MetatomicSystemAdaptorKokkos<DeviceType>::MetatomicSystemAdaptorKokkos(LAMMPS *l
4747
this->strain = torch::eye(3, tensor_options);
4848
}
4949

50+
#include <iostream>
51+
#include "comm.h"
52+
5053
template<class DeviceType>
5154
void MetatomicSystemAdaptorKokkos<DeviceType>::setup_neighbors_remap_kk(metatomic_torch::System& system, NeighListKokkos<DeviceType>* list) {
5255
auto _ = MetatomicTimer("converting kokkos neighbors with ghosts remapping");
5356
auto dtype = system->positions().scalar_type();
5457

5558
auto total_n_atoms = atomKK->nlocal + atomKK->nghost;
5659

60+
std::cout << "rank = " << comm->me << " nlocal = " << atomKK->nlocal << " nghost = " << atomKK->nghost << std::endl;
61+
5762
{
5863
auto _ = MetatomicTimer("identifying ghosts and real atoms");
5964
/*-------------- this will be done on CPU for now ------------------------*/
@@ -360,7 +365,7 @@ metatomic_torch::System MetatomicSystemAdaptorKokkos<DeviceType>::system_from_lm
360365
// While Metatomic models can support mixed PBC settings, we currently
361366
// assume that the system is fully periodic and we throw an error otherwise
362367
if (!domain->xperiodic || !domain->yperiodic || !domain->zperiodic) {
363-
error->all(FLERR, "metatomic/kk currently requires a fully periodic system");
368+
error->one(FLERR, "metatomic/kk currently requires a fully periodic system");
364369
}
365370
auto pbc = torch::tensor(
366371
{domain->xperiodic, domain->yperiodic, domain->zperiodic},
@@ -379,7 +384,7 @@ metatomic_torch::System MetatomicSystemAdaptorKokkos<DeviceType>::system_from_lm
379384
assert(kk_list != nullptr);
380385
this->setup_neighbors_remap_kk(system, kk_list);
381386
} else {
382-
error->all(FLERR, "the kokkos version of metatomic requires remap_pairs to be true");
387+
error->one(FLERR, "the kokkos version of metatomic requires remap_pairs to be true");
383388
}
384389

385390
return system;

src/KOKKOS/pair_metatomic_kokkos.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ void PairMetatomicKokkos<DeviceType>::pick_device(torch::Device& device, const c
9999
auto requested_str = std::string(requested);
100100
std::transform(requested_str.begin(), requested_str.end(), requested_str.begin(), ::tolower);
101101
if (c10::DeviceTypeName(device.type(), /*lower_case=*/true) != requested_str) {
102-
error->all(FLERR,
102+
error->one(FLERR,
103103
"requested device '{}' does not match the device being used by kokkos '{}', "
104104
"use the non-kokkos version of this pair style to use a different "
105105
"device for the model and LAMMPS",

src/ML-METATOMIC/metatomic_system.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,13 @@ MetatomicSystemAdaptor::~MetatomicSystemAdaptor() {}
4545

4646
void MetatomicSystemAdaptor::add_nl_request(double cutoff, metatomic_torch::NeighborListOptions request) {
4747
if (cutoff > options_.interaction_range) {
48-
error->all(FLERR,
48+
error->one(FLERR,
4949
"Invalid metatomic model: one of the requested neighbor lists "
5050
"has a cutoff ({}) larger than the model interaction range ({})",
5151
cutoff, options_.interaction_range
5252
);
5353
} else if (cutoff < 0 || !std::isfinite(cutoff)) {
54-
error->all(FLERR,
54+
error->one(FLERR,
5555
"model requested an invalid cutoff for neighbors list: {} "
5656
"(cutoff in model units is {})",
5757
cutoff, request->cutoff()
@@ -278,7 +278,7 @@ void MetatomicSystemAdaptor::setup_neighbors_remap(metatomic_torch::System& syst
278278
});
279279
} else {
280280
// should be unreachable
281-
error->all(FLERR, "invalid dtype, this is a bug");
281+
error->one(FLERR, "invalid dtype, this is a bug");
282282
}
283283
}
284284
}
@@ -317,7 +317,7 @@ void MetatomicSystemAdaptor::setup_neighbors_remap(metatomic_torch::System& syst
317317
);
318318
} else {
319319
// should be unreachable
320-
error->all(FLERR, "invalid dtype, this is a bug");
320+
error->one(FLERR, "invalid dtype, this is a bug");
321321
}
322322

323323
{
@@ -408,7 +408,7 @@ void MetatomicSystemAdaptor::setup_neighbors_no_remap(metatomic_torch::System& s
408408
});
409409
} else {
410410
// should be unreachable
411-
error->all(FLERR, "invalid dtype, this is a bug");
411+
error->one(FLERR, "invalid dtype, this is a bug");
412412
}
413413
}
414414
}
@@ -446,7 +446,7 @@ void MetatomicSystemAdaptor::setup_neighbors_no_remap(metatomic_torch::System& s
446446
);
447447
} else {
448448
// should be unreachable
449-
error->all(FLERR, "invalid dtype, this is a bug");
449+
error->one(FLERR, "invalid dtype, this is a bug");
450450
}
451451

452452
{
@@ -526,7 +526,7 @@ metatomic_torch::System MetatomicSystemAdaptor::system_from_lmp(
526526
// While metatomic models can support mixed PBC settings, we currently
527527
// assume that the system is fully periodic and we throw an error otherwise
528528
if (!domain->xperiodic || !domain->yperiodic || !domain->zperiodic) {
529-
error->all(FLERR, "pair_style metatomic requires a fully periodic system");
529+
error->one(FLERR, "pair_style metatomic requires a fully periodic system");
530530
}
531531
auto pbc = torch::tensor(
532532
{domain->xperiodic, domain->yperiodic, domain->zperiodic},

src/ML-METATOMIC/metatomic_timer.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include <mpi.h>
12
#include <mutex>
23
#include <iostream>
34

@@ -57,7 +58,10 @@ MetatomicTimer::~MetatomicTimer() {
5758
std::cerr << "\n" << indent << this->name_;
5859
}
5960

60-
std::cerr << " took " << elapsed / 1e6 << "ms" << std::flush;
61+
int rank;
62+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
63+
64+
std::cerr << " took " << elapsed / 1e6 << "ms (rank " << rank << ")" << std::flush;
6165
METATOMIC_TIMER_DEPTH -= 1;
6266
}
6367
}

src/ML-METATOMIC/metatomic_types.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ void PairMetatomicData::load_model(
4646

4747
this->model_path = path;
4848
if (this->model != nullptr) {
49-
lmp->error->all(FLERR, "torch model is already loaded");
49+
lmp->error->one(FLERR, "torch model is already loaded");
5050
}
5151

5252
torch::optional<std::string> extensions = torch::nullopt;
@@ -59,7 +59,7 @@ void PairMetatomicData::load_model(
5959
metatomic_torch::load_atomistic_model(this->model_path, extensions)
6060
);
6161
} catch (const c10::Error& e) {
62-
lmp->error->all(FLERR, "failed to load metatomic model at '{}': {}", path, e.what());
62+
lmp->error->one(FLERR, "failed to load metatomic model at '{}': {}", path, e.what());
6363
}
6464

6565
auto capabilities_ivalue = this->model->run_method("capabilities");

0 commit comments

Comments
 (0)