Description
I have been working with a slightly modified version of the eigenvalues demo script (included here). This code runs successfully (no errors, and correct output as far as I can tell) with up to a 10k x 10k matrix on an 8-node grid with 8 MPI processes per node and 4 CPUs per MPI process. When I scale to a 100k x 100k matrix on a 10-node grid with 5 MPI processes per node and 4 CPUs per MPI process, I get the error below, printed once for each of the 50 MPI processes:
terminate called after throwing an instance of 'std::domain_error'
what(): Input matrix contains Inf or NaN
[r6401:3333756] *** Process received signal ***
[r6401:3333756] Signal: Aborted (6)
[r6401:3333756] Signal code: (-6)
[r6401:3333758] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x43090)[0x7f59276f9090]
[r6401:3333758] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb)[0x7f59276f900b]
[r6401:3333758] [ 2] /lib/x86_64-linux-gnu/libc.so.6(abort+0x12b)[0x7f59276d8859]
[r6401:3333758] [ 3] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa4ee6)[0x7f59279c2ee6]
[r6401:3333758] [ 4] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb6f8c)[0x7f59279d4f8c]
[r6401:3333758] [ 5] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb6ff7)[0x7f59279d4ff7]
[r6401:3333758] [ 6] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb7258)[0x7f59279d5258]
[r6401:3333758] [ 7] /path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/slate-2023.11.05-5ysppah/lib/libslate.so(_ZN5slate5stedcIfEEvRSt6vectorIT_SaIS2_EES5_RNS_6MatrixIS2_EERKSt3mapINS_6OptionENS_11OptionValueESt4lessISA_ESaISt4pairIKSA_SB_EEE+0x4e5)[0x7f5929c00935]
[r6403:3333758] [ 9] fast_eigenvals(+0xbecd)[0x563f5b231ecd]
[r6403:3333758] [10] fast_eigenvals(+0x5876)[0x563f5b22b876]
[r6403:3333758] [11] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f36cef0e083]
[r6403:3333758] [12] fast_eigenvals(+0x59ee)[0x563f5b22b9ee]
[r6403:3333758] *** End of error message ***
At the end, I then see:
--------------------------------------------------------------------------
mpirun noticed that process rank 27 with PID 662523 on node r6406 exited on signal 6 (Aborted).
--------------------------------------------------------------------------
Command exited with non-zero status 134
I trimmed the output because this message just repeats for each rank, but I can provide the full log if that would help.
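As a diagnostic, one could scan the local tiles of A for non-finite entries immediately after random_matrix( A ) and before slate::eig, to check whether the Inf/NaN is already present in the generated input or only appears inside the solver. The sketch below is not part of the run that produced the error above; it assumes the tileIsLocal / Tile::at accessors behave as in SLATE 2023.11.05 and is written for real scalar types only.

// Diagnostic sketch (assumption, not part of the original run): scan the
// local tiles of the lower triangle for Inf/NaN before calling slate::eig.
#include <slate/slate.hh>
#include <cmath>
#include <cstdio>

template <typename T>
void check_finite( slate::SymmetricMatrix<T>& A, int mpi_rank )
{
    int64_t bad = 0;
    // Loop over tile indices; only the lower triangle holds tiles here.
    for (int64_t j = 0; j < A.nt(); ++j) {
        for (int64_t i = j; i < A.mt(); ++i) {
            if (A.tileIsLocal( i, j )) {
                auto Tij = A( i, j );
                for (int64_t jj = 0; jj < Tij.nb(); ++jj)
                    for (int64_t ii = 0; ii < Tij.mb(); ++ii)
                        if (! std::isfinite( (double) Tij.at( ii, jj ) ))
                            ++bad;
            }
        }
    }
    printf( "Rank %d: %lld non-finite entries in local input tiles\n",
            mpi_rank, (long long) bad );
}

Calling check_finite( A, mpi_rank ) right after random_matrix( A ) in hermitian_eig() would report, per rank, whether the generated input already contains non-finite values.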
Steps To Reproduce
- Compile the script below (linking BLAS++, LAPACK++, Intel MKL, and SLATE). I also use the unmodified util.hh file from the SLATE examples tutorial (Tutorial Link).
- Submit the SLURM batch script below, which launches the executable with Open MPI's mpirun.
C++ Code
// Solve Hermitian eigenvalues A = X Lambda X^H
#include <slate/slate.hh>
#include "util.hh"

#include <string>
#include <cassert>

// Include I/O
#include <iostream>
#include <fstream>
#include <unistd.h>

int mpi_size = 0;
int mpi_rank = 0;
//------------------------------------------------------------------------------
template <typename T>
void hermitian_eig(std::string& output_file)
{
    using real_t = blas::real_type<T>;

    print_func( mpi_rank );

    // Print hostname
    char hostname[256];
    gethostname( hostname, 256 );
    printf( "Rank %d of %d on %s\n", mpi_rank, mpi_size, hostname );

    // TODO: failing if n not divisible by nb?
    int64_t n=100000, nb=500, p=10, q=5;
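    // (Here n / nb = 100000 / 500 = 200, so n is an integer multiple of nb
    //  and the divisibility concern in the TODO above does not apply to this run.)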
    // Maybe can remove this
    assert(mpi_size == p*q);

    // Rank 0 prints out the parameters
    if (mpi_rank == 0) {
        printf( "Parameters: n %ld, nb %ld, p %ld, q %ld\n", n, nb, p, q );
    }

    //slate::HermitianMatrix<T> A( slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD );
    slate::SymmetricMatrix<T> A( slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD );
    A.insertLocalTiles();
    random_matrix( A );

    std::vector<real_t> Lambda( n );

    // A = X Lambda X^H, eigenvalues only
    // slate::eig_vals( A, Lambda ); // simplified API
    //slate::Matrix<T> Xempty;
    //slate::heev( slate::Job::NoVec, A, Lambda, Xempty );

    slate::Matrix<T> X( n, n, nb, p, q, MPI_COMM_WORLD );
    X.insertLocalTiles();
    slate::eig( A, Lambda, X ); // simplified API

    // // Write to file as csv
    // if (mpi_rank == 0) {
    //     std::ofstream file(output_file);
    //     for (int i = 0; i < n; i++) {
    //         file << Lambda[i] << "\n";
    //     }
    //     file << std::endl;
    //     file.close();
    // }
}
//------------------------------------------------------------------------------
int main( int argc, char** argv )
{
    int provided = 0;
    int err = MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided );
    assert( err == 0 );
    assert( provided == MPI_THREAD_MULTIPLE );

    err = MPI_Comm_size( MPI_COMM_WORLD, &mpi_size );
    assert( err == 0 );
    // if (mpi_size != 4) {
    //     printf( "Usage: mpirun -np 4 %s # 4 ranks hard coded\n", argv[0] );
    //     return -1;
    // }

    err = MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
    assert( err == 0 );

    // so random_matrix is different on different ranks.
    srand( 100 * mpi_rank );

    std::string output_file = "eigenvals.csv";
    hermitian_eig< float >( output_file );

    err = MPI_Finalize();
    assert( err == 0 );

    // Print finished from rank 0
    if (mpi_rank == 0) {
        printf( "Finished\n" );
    }
}

Corresponding SLURM File
#!/bin/bash
#SBATCH --job-name="Eigendecomposition"
#SBATCH --partition=main
#SBATCH --time=72:00:00
#SBATCH --nodes=10
#SBATCH --tasks-per-node=5
#SBATCH --cpus-per-task=4
#SBATCH --mem-per-cpu=24GB
#SBATCH --output=logs/eigen-%A.o
#SBATCH --error=logs/eigen-%A.e
# Installation
# spack install -j 16 slate%gcc ~cuda ^intel-oneapi-mkl ^openmpi@4+legacylaunchers fabrics=auto schedulers=slurm
# Load software
spack load slate@2023.11.05
spack load blaspp
spack load lapackpp
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/blaspp-2023.11.05-jfb72an/lib:/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/lapackpp-2023.11.05-3ervrb4/lib
export MKL_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Start time
STARTTIME=$(date)
# Run command
/usr/bin/time -p mpirun fast_eigenvals
# End time
ENDTIME=$(date)
echo "Job started at: $STARTTIME"
echo "Job ended at: $ENDTIME"Makefile
# Makefile to compile fast_eigenvals.cpp with slate and lapack (oneapi-mkl)
# Assumes BLAS++ and LAPACK++ are installed in SLATE directory.
# Otherwise would need to add
# -I${blaspp_dir}/include -I${lapackpp_dir}/include to CXXFLAGS,
# -L${blaspp_dir}/lib -L${lapackpp_dir}/lib and corresponding rpaths to LDFLAGS.
#
# Assumes everything built as shared libraries, so transitive dependencies
# are handled implicitly. That is, linking with -lslate implicitly links with
# -lblaspp -llapackpp -lopenblas.
# Substitute your choice of BLAS/LAPACK library for -lopenblas.
-include make.inc
slate_dir ?= /path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/slate-2023.11.05-5ysppah
scalapack_libs ?= -L${MKLROOT}/lib/intel64 -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
# CXXFLAGS add /opt/slate/include to include path.
# LDFLAGS add /opt/slate/lib to lib path.
# Using -rpath avoids need to add ${slate_dir}/lib to $LD_LIBRARY_PATH.
CXX = mpicxx
CXXFLAGS = -fopenmp -Wall -O3 -std=c++17 -MMD -I${slate_dir}/include -I/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/blaspp-2023.11.05-jfb72an/include -I/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/lapackpp-2023.11.05-3ervrb4/include -I${MKLROOT}/include
LDFLAGS = -fopenmp -L${slate_dir}/lib -Wl,-rpath,${slate_dir}/lib -L/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/blaspp-2023.11.05-jfb72an/lib -L/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/lapackpp-2023.11.05-3ervrb4/lib -L${MKLROOT}/lib/intel64 -Wl,-rpath,${MKLROOT}/lib/intel64
LIBS = -lblaspp -llapackpp -lmkl_gnu_thread -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl
# auto-detect OS
# $OSTYPE may not be exported from the shell, so echo it
ostype := $(shell echo $${OSTYPE})
ifneq ($(findstring darwin, $(ostype)),)
# MacOS is darwin
macos = 1
LDD = otool -L
else
LDD = ldd
endif
# ------------------------------------------------------------------------------
# Check if SLATE was compiled with cublas. If so, explicitly link cublas
# since it is called in the SLATE headers.
grep_cuda := ${shell ${LDD} ${slate_dir}/lib/libslate.so | grep cublas}
ifeq (${grep_cuda},)
${info SLATE without CUDA}
CXXFLAGS += -DSLATE_NO_CUDA
else
cuda_nvcc := ${shell which nvcc}
CUDA_ROOT ?= $(cuda_nvcc:/bin/nvcc=)
${info SLATE with CUDA ${CUDA_ROOT}}
ifeq (${cuda_nvcc},)
${error SLATE compiled with CUDA, but cannot find nvcc.}
endif
CXXFLAGS += -I${CUDA_ROOT}/include
LDFLAGS += -L${CUDA_ROOT}/lib -Wl,-rpath,${CUDA_ROOT}/lib
LIBS += -lcublas -lcudart
endif
# ------------------------------------------------------------------------------
# SLATE examples
# Link with -lslate.
# Implicitly links with -lblaspp -llapackpp -lopenblas.
slate_src = fast_eigenvals.cpp
slate_exe = ${basename ${slate_src}}
# ${slate_exe}: %: %.o
# ${CXX} -o $@ $^ \
# ${LDFLAGS} \
# -lslate ${LIBS}
${slate_exe}:
	${CXX} ${CXXFLAGS} -c -o ${slate_exe}.o ${slate_src} ${LDFLAGS} -lslate ${LIBS}
	${CXX} -o ${slate_exe} ${slate_exe}.o ${LDFLAGS} -lslate ${LIBS}
# ------------------------------------------------------------------------------
# Generic rules.
exe = ${slate_exe}
all: ${exe}
.DEFAULT_GOAL := all
.SUFFIXES:
%.o: %.cc
	${CXX} ${CXXFLAGS} -c -o $@ $<

clean:
	-rm -f ${exe} *.o

clean_exe:
	-rm -f ${exe}

distclean: clean clean_run
	-rm -f *.d
-include *.d
# ------------------------------------------------------------------------------
# Debugging
echo:
	@echo "CXX = ${CXX}"
	@echo "CXXFLAGS = ${CXXFLAGS}"
	@echo "LDFLAGS = ${LDFLAGS}"
	@echo "LIBS = ${LIBS}"
	@echo "LDD = ${LDD}"
	@echo
	@echo "macos = ${macos}"
	@echo "grep_cuda = ${grep_cuda}"
	@echo "cuda_nvcc = ${cuda_nvcc}"
	@echo "CUDA_ROOT = ${CUDA_ROOT}"
	@echo
	@echo "slate_dir = ${slate_dir}"
	@echo "scalapack_libs = ${scalapack_libs}"
	@echo
	@echo "slate_src = ${slate_src}"
	@echo "slate_exe = ${slate_exe}"
	@echo
	@echo "scalapack_src = ${scalapack_src}"
	@echo "scalapack_exe = ${scalapack_exe}"
	@echo
	@echo "blaspp_src = ${blaspp_src}"
	@echo "blaspp_exe = ${blaspp_exe}"
	@echo
	@echo "lapackpp_src = ${lapackpp_src}"
	@echo "lapackpp_exe = ${lapackpp_exe}"
	@echo
	@echo "exe = ${exe}"
	@echo
	@echo "txt = ${txt}"

Environment
All software other than SLURM and gcc was installed through Spack:
- SLATE version / commit ID (e.g., git log --oneline -n 1): slate@2023.11.05
- How installed: Spack
- How compiled: makefile (see Makefile above)
- Compiler & version (e.g., mpicxx --version): gcc@9.4.0
- BLAS library (e.g., MKL, ESSL, OpenBLAS) & version: intel-oneapi-mkl@2024.0.0
- MPI library & version: openmpi@4.1.6
- OS: Ubuntu 20.04.6 LTS
- Hardware (CPUs, GPUs, nodes): CPU - Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz