Skip to content

Input matrix contains Inf or NaN on Large Eigendecomposition #180

@rlangefe

Description

@rlangefe

Description
I have been working with a slightly modified version of the eigenvalues demo script (included here). This code runs successfully (no errors, and correct output as far as I can tell) when I run it on up to a 10k x 10k matrix using an 8-node grid with 8 MPI processes per node and 4 CPUs per MPI process. When I scale up to a 100k x 100k matrix on a 10-node grid with 5 MPI processes per node and 4 CPUs per MPI process, I get the error below (printed once for each of the 50 MPI processes):

terminate called after throwing an instance of 'std::domain_error'
  what():  Input matrix contains Inf or NaN
[r6401:3333756] *** Process received signal ***
[r6401:3333756] Signal: Aborted (6)
[r6401:3333756] Signal code:  (-6)
[r6401:3333758] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x43090)[0x7f59276f9090]
[r6401:3333758] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb)[0x7f59276f900b]
[r6401:3333758] [ 2] /lib/x86_64-linux-gnu/libc.so.6(abort+0x12b)[0x7f59276d8859]
[r6401:3333758] [ 3] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa4ee6)[0x7f59279c2ee6]
[r6401:3333758] [ 4] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb6f8c)[0x7f59279d4f8c]
[r6401:3333758] [ 5] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb6ff7)[0x7f59279d4ff7]
[r6401:3333758] [ 6] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb7258)[0x7f59279d5258]
[r6401:3333758] [ 7] /path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/slate-2023.11.05-5ysppah/lib/libslate.so(_ZN5slate5stedcIfEEvRSt6vectorIT_SaIS2_EES5_RNS_6MatrixIS2_EERKSt3mapINS_6OptionENS_11OptionValueESt4lessISA_ESaISt4pairIKSA_SB_EEE+0x4e5)[0x7f5929c00935]
[r6403:3333758] [ 9] fast_eigenvals(+0xbecd)[0x563f5b231ecd]
[r6403:3333758] [10] fast_eigenvals(+0x5876)[0x563f5b22b876]
[r6403:3333758] [11] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f36cef0e083]
[r6403:3333758] [12] fast_eigenvals(+0x59ee)[0x563f5b22b9ee]
[r6403:3333758] *** End of error message ***

At the end, I then see:

--------------------------------------------------------------------------
mpirun noticed that process rank 27 with PID 662523 on node r6406 exited on signal 6 (Aborted).
--------------------------------------------------------------------------
Command exited with non-zero status 134

I trimmed this because this message is just repeated over and over, but if helpful, I can give the whole output.

Steps To Reproduce

  1. Compile the script (linking BLASpp, LAPACKpp, Intel MKL, and SLATE). I also use the unmodified util.hh file from the SLATE examples tutorial (Tutorial Link).
  2. Submit the script as a SLURM batch job, launching it with OpenMPI's mpirun.

C++ Code

// Solve Hermitian eigenvalues A = X Lambda X^H
#include <slate/slate.hh>

#include "util.hh"

#include <string>

// Include I/O
#include <iostream>
#include <fstream>
#include <unistd.h>

// MPI world size and this process's rank; set once in main() after
// MPI_Init_thread and read by hermitian_eig() for grid checks and logging.
int mpi_size = 0;
int mpi_rank = 0;

//------------------------------------------------------------------------------
// Compute all eigenvalues and eigenvectors of a random symmetric matrix,
// A = X Lambda X^T, distributed on a p x q MPI process grid, using SLATE's
// simplified eig() API.
//
// @param output_file  Destination CSV path for the eigenvalues. Currently
//                     unused because the write-out section is commented out.
//
// NOTE(review): T = float here. For n = 100k the single-precision divide &
// conquer tridiagonal solver (stedc) is a plausible source of the reported
// "Input matrix contains Inf or NaN" — worth retrying with T = double to
// confirm whether this is a precision/overflow issue.
template <typename T>
void hermitian_eig(std::string& output_file)
{
    using real_t = blas::real_type<T>;

    print_func( mpi_rank );

    // Print hostname so failing ranks can be mapped back to cluster nodes.
    char hostname[256];
    gethostname( hostname, sizeof(hostname) );
    printf( "Rank %d of %d on %s\n", mpi_rank, mpi_size, hostname );

    // TODO: failing if n not divisible by nb?
    int64_t n=100000, nb=500, p=10, q=5;

    // The p x q process grid must exactly cover the MPI ranks.
    assert(mpi_size == p*q);

    // Rank 0 prints out the parameters.
    // Cast to long long: "%ld" with int64_t is only correct on LP64
    // platforms; "%lld" + cast is portable.
    if (mpi_rank == 0) {
        printf( "Parameters: n %lld, nb %lld, p %lld, q %lld\n",
                (long long) n, (long long) nb, (long long) p, (long long) q );
    }

    // Symmetric (not Hermitian) since T is real; only the lower triangle
    // is stored/referenced.
    //slate::HermitianMatrix<T> A( slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD );
    slate::SymmetricMatrix<T> A( slate::Uplo::Lower, n, nb, p, q, MPI_COMM_WORLD );
    A.insertLocalTiles();
    random_matrix( A );

    std::vector<real_t> Lambda( n );

    // A = X Lambda X^H, eigenvalues only
    // slate::eig_vals( A, Lambda );  // simplified API

    //slate::Matrix<T> Xempty;
    //slate::heev( slate::Job::NoVec, A, Lambda, Xempty );

    // Eigenvalues and eigenvectors: X holds the n x n eigenvector matrix.
    slate::Matrix<T> X( n, n, nb, p, q, MPI_COMM_WORLD );
    X.insertLocalTiles();
    slate::eig( A, Lambda, X );   // simplified API

    // // Write to file as csv
    // if (mpi_rank == 0) {
    //     std::ofstream file(output_file);
    //     for (int i = 0; i < n; i++) {
    //         file << Lambda[i] << "\n";
    //     }
    //     file << std::endl;
    //     file.close();
    // }
}

//------------------------------------------------------------------------------
int main( int argc, char** argv )
{   

    int provided = 0;
    int err = MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided );
    assert( err == 0 );
    assert( provided == MPI_THREAD_MULTIPLE );

    err = MPI_Comm_size( MPI_COMM_WORLD, &mpi_size );
    assert( err == 0 );
    // if (mpi_size != 4) {
    //     printf( "Usage: mpirun -np 4 %s  # 4 ranks hard coded\n", argv[0] );
    //     return -1;
    // }

    err = MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
    assert( err == 0 );

    // so random_matrix is different on different ranks.
    srand( 100 * mpi_rank );

    std::string output_file = "eigenvals.csv";

    hermitian_eig< float >( output_file );

    err = MPI_Finalize();
    assert( err == 0 );

    // Print finished from rank 0
    if (mpi_rank == 0) {
        printf( "Finished\n" );
    }
}

Corresponding SLURM File

#!/bin/bash
# SLURM batch script: 10 nodes x 5 MPI ranks = 50 ranks, matching the
# p=10 x q=5 grid hard-coded in fast_eigenvals; 4 CPUs per rank for threading.
#SBATCH --job-name="Eigendecomposition"
#SBATCH --partition=main
#SBATCH --time=72:00:00
#SBATCH --nodes=10
#SBATCH --tasks-per-node=5
#SBATCH --cpus-per-task=4
#SBATCH --mem-per-cpu=24GB
#SBATCH --output=logs/eigen-%A.o
#SBATCH --error=logs/eigen-%A.e

# Installation
# spack install -j 16 slate%gcc ~cuda ^intel-oneapi-mkl ^openmpi@4+legacylaunchers fabrics=auto schedulers=slurm

# Load software
spack load slate@2023.11.05
spack load blaspp
spack load lapackpp

# Make the BLAS++/LAPACK++ shared libraries visible at runtime.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/blaspp-2023.11.05-jfb72an/lib:/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/lapackpp-2023.11.05-3ervrb4/lib

# NOTE(review): this caps MKL's internal threads, but SLATE itself threads
# via OpenMP — OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK is presumably also
# needed; confirm against SLATE's runtime docs.
export MKL_NUM_THREADS=$SLURM_CPUS_PER_TASK

# Start time
STARTTIME=$(date)

# Run command; mpirun picks up rank count/placement from SLURM.
/usr/bin/time -p mpirun fast_eigenvals

# End time
ENDTIME=$(date)
echo "Job started at: $STARTTIME"
echo "Job ended at: $ENDTIME"

Makefile

# Makefile to compile fast_eigenvals.cpp with slate and lapack (oneapi-mkl)
# Assumes BLAS++ and LAPACK++ are installed in SLATE directory.
# Otherwise would need to add
#   -I${blaspp_dir}/include -I${lapackpp_dir}/include to CXXFLAGS,
#   -L${blaspp_dir}/lib -L${lapackpp_dir}/lib and corresponding rpaths to LDFLAGS.
#
# Assumes everything built as shared libraries, so transitive dependencies
# are handled implicitly. That is, linking with -lslate implicitly links with
# -lblaspp -llapackpp -lopenblas.
# Substitute your choice of BLAS/LAPACK library for -lopenblas.

-include make.inc

slate_dir ?= /path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/slate-2023.11.05-5ysppah
scalapack_libs ?= -L${MKLROOT}/lib/intel64 -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64

# CXXFLAGS add /opt/slate/include to include path.
# LDFLAGS  add /opt/slate/lib     to lib path.
# Using -rpath avoids need to add ${slate_dir}/lib to $LD_LIBRARY_PATH.
CXX      = mpicxx
CXXFLAGS = -fopenmp -Wall -O3 -std=c++17 -MMD -I${slate_dir}/include -I/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/blaspp-2023.11.05-jfb72an/include -I/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/lapackpp-2023.11.05-3ervrb4/include -I${MKLROOT}/include
LDFLAGS  = -fopenmp -L${slate_dir}/lib -Wl,-rpath,${slate_dir}/lib -L/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/blaspp-2023.11.05-jfb72an/lib -L/path/to/spack/install/spack/opt/spack/linux-ubuntu20.04-haswell/gcc-9.4.0/lapackpp-2023.11.05-3ervrb4/lib -L${MKLROOT}/lib/intel64 -Wl,-rpath,${MKLROOT}/lib/intel64
# MKL link line: exactly ONE threading layer may be linked. The original
# listed both -lmkl_gnu_thread and -lmkl_sequential, which mixes threading
# layers and can cause subtle runtime problems. Canonical GNU/OpenMP order:
# interface (-lmkl_intel_lp64), threading (-lmkl_gnu_thread), core.
LIBS     = -lblaspp -llapackpp -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl

# auto-detect OS
# $OSTYPE may not be exported from the shell, so echo it
ostype := $(shell echo $${OSTYPE})
ifneq ($(findstring darwin, $(ostype)),)
    # MacOS is darwin
    macos = 1
    LDD = otool -L
else
    LDD = ldd
endif


# ------------------------------------------------------------------------------
# Check if SLATE was compiled with cublas. If so, explicitly link cublas
# since it is called in the SLATE headers.
grep_cuda := ${shell ${LDD} ${slate_dir}/lib/libslate.so | grep cublas}
ifeq (${grep_cuda},)
    ${info SLATE without CUDA}
    CXXFLAGS += -DSLATE_NO_CUDA
else
    cuda_nvcc := ${shell which nvcc}
    # Fail fast BEFORE deriving CUDA_ROOT: the original checked for a
    # missing nvcc only after computing and printing an (empty) CUDA_ROOT.
    ifeq (${cuda_nvcc},)
        ${error SLATE compiled with CUDA, but cannot find nvcc.}
    endif
    CUDA_ROOT ?= $(cuda_nvcc:/bin/nvcc=)
    ${info SLATE with CUDA ${CUDA_ROOT}}

    # NOTE(review): many CUDA installs put 64-bit libraries under lib64,
    # not lib — verify for the target toolkit.
    CXXFLAGS += -I${CUDA_ROOT}/include
    LDFLAGS  += -L${CUDA_ROOT}/lib -Wl,-rpath,${CUDA_ROOT}/lib
    LIBS     += -lcublas -lcudart
endif


# ------------------------------------------------------------------------------
# SLATE examples
# Link with -lslate.
# Implicitly links with -lblaspp -llapackpp -lopenblas.

slate_src = fast_eigenvals.cpp
slate_exe = ${basename ${slate_src}}

# Depend on the source so edits trigger a rebuild (the original rule had no
# prerequisites and would never rebuild once the binary existed). Compile
# and link are separate steps: -L/-l/LDFLAGS belong only on the link line —
# they are ignored (or rejected) when combined with -c.
${slate_exe}: ${slate_src}
	${CXX} ${CXXFLAGS} -c -o ${slate_exe}.o ${slate_src}
	${CXX} -o ${slate_exe} ${slate_exe}.o ${LDFLAGS} -lslate ${LIBS}


# ------------------------------------------------------------------------------
# Generic rules.

exe = ${slate_exe}

all: ${exe}

.DEFAULT_GOAL := all
.SUFFIXES:

# Mark housekeeping targets phony so same-named files can't shadow them.
.PHONY: all clean clean_exe distclean echo

%.o: %.cc
	${CXX} ${CXXFLAGS} -c -o $@ $<

clean:
	-rm -f ${exe} *.o

clean_exe:
	-rm -f ${exe}

# Fixed: previously depended on 'clean_run', which is not defined anywhere
# in this Makefile, so `make distclean` errored out.
distclean: clean clean_exe
	-rm -f *.d

-include *.d

# ------------------------------------------------------------------------------
# Debugging

# Dump the effective build configuration. Note: the scalapack_src/exe,
# blaspp_src/exe, lapackpp_src/exe, and txt variables are not defined in
# this Makefile (presumably inherited from the upstream SLATE examples
# Makefile) and will print empty.
echo:
	@echo "CXX       = ${CXX}"
	@echo "CXXFLAGS  = ${CXXFLAGS}"
	@echo "LDFLAGS   = ${LDFLAGS}"
	@echo "LIBS      = ${LIBS}"
	@echo "LDD       = ${LDD}"
	@echo
	@echo "macos     = ${macos}"
	@echo "grep_cuda = ${grep_cuda}"
	@echo "cuda_nvcc = ${cuda_nvcc}"
	@echo "CUDA_ROOT = ${CUDA_ROOT}"
	@echo
	@echo "slate_dir = ${slate_dir}"
	@echo "scalapack_libs = ${scalapack_libs}"
	@echo
	@echo "slate_src = ${slate_src}"
	@echo "slate_exe = ${slate_exe}"
	@echo
	@echo "scalapack_src = ${scalapack_src}"
	@echo "scalapack_exe = ${scalapack_exe}"
	@echo
	@echo "blaspp_src = ${blaspp_src}"
	@echo "blaspp_exe = ${blaspp_exe}"
	@echo
	@echo "lapackpp_src = ${lapackpp_src}"
	@echo "lapackpp_exe = ${lapackpp_exe}"
	@echo
	@echo "exe = ${exe}"
	@echo
	@echo "txt = ${txt}"
Environment
All software other than SLURM and gcc was installed through spack:

  • SLATE version / commit ID (e.g., git log --oneline -n 1): slate@2023.11.05
  • How installed:
    • git clone
    • release tar file
    • Spack
    • module
  • How compiled:
    • makefile (include your make.inc)
    • CMake (include your command line options)
  • Compiler & version (e.g., mpicxx --version): gcc@9.4.0
  • BLAS library (e.g., MKL, ESSL, OpenBLAS) & version: intel-oneapi-mkl@2024.0.0
  • MPI library & version (MPICH, Open MPI, Intel MPI, IBM Spectrum, Cray MPI, etc. Sometimes mpicxx -v gives info.): openmpi@4.1.6
  • OS: Ubuntu 20.04.6 LTS
  • Hardware (CPUs, GPUs, nodes): CPU - Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions