diff --git a/pySDC/implementations/problem_classes/generic_spectral.py b/pySDC/implementations/problem_classes/generic_spectral.py index e78c7b0847..0a9d937a7b 100644 --- a/pySDC/implementations/problem_classes/generic_spectral.py +++ b/pySDC/implementations/problem_classes/generic_spectral.py @@ -64,6 +64,7 @@ def __init__( max_cached_factorizations=12, spectral_space=True, real_spectral_coefficients=False, + heterogeneous=False, debug=False, ): """ @@ -81,6 +82,7 @@ def __init__( max_cached_factorizations (int): Number of matrix decompositions to cache before starting eviction spectral_space (bool): If yes, the solution will not be transformed back after solving and evaluating the RHS, and is expected as input in spectral space to these functions real_spectral_coefficients (bool): If yes, allow only real values in spectral space, otherwise, allow complex. + heterogeneous (bool): If yes, perform memory intensive sparse matrix operations on CPU debug (bool): Make additional tests at extra computational cost """ solver_args = {} if solver_args is None else solver_args @@ -100,6 +102,7 @@ def __init__( 'comm', 'spectral_space', 'real_spectral_coefficients', + 'heterogeneous', 'debug', localVars=locals(), ) @@ -126,6 +129,26 @@ def __init__( self.cached_factorizations = {} + if self.heterogeneous: + self.__heterogeneous_setup = False + + def heterogeneous_setup(self): + if self.heterogeneous and not self.__heterogeneous_setup: + CPU_only = ['BC_line_zero_matrix', 'BCs'] + both = ['Pl', 'Pr', 'L', 'M'] + + if self.useGPU: + for key in CPU_only: + setattr(self.spectral, key, getattr(self.spectral, key).get()) + + for key in both: + setattr(self, f'{key}_CPU', getattr(self, key).get()) + else: + for key in both: + setattr(self, f'{key}_CPU', getattr(self, key)) + + self.__heterogeneous_setup = True + def __getattr__(self, name): """ Pass requests on to the helper if they are not directly attributes of this class for convenience. @@ -232,6 +255,8 @@ def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs) sp = self.spectral.sparse_lib + self.heterogeneous_setup() + if self.spectral_space: rhs_hat = rhs.copy() if u0 is not None: @@ -256,8 +281,19 @@ def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs) rhs_hat = self.Pl @ rhs_hat.flatten() if dt not in self.cached_factorizations.keys() or not self.solver_type.lower() == 'cached_direct': - A = self.M + dt * self.L - A = self.Pl @ self.spectral.put_BCs_in_matrix(A) @ self.Pr + if self.heterogeneous: + M = self.M_CPU + L = self.L_CPU + Pl = self.Pl_CPU + Pr = self.Pr_CPU + else: + M = self.M + L = self.L + Pl = self.Pl + Pr = self.Pr + + A = M + dt * L + A = Pl @ self.spectral.put_BCs_in_matrix(A) @ Pr # if A.shape[0] < 200e20: # import matplotlib.pyplot as plt @@ -289,7 +325,21 @@ def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs) if len(self.cached_factorizations) >= self.max_cached_factorizations: self.cached_factorizations.pop(list(self.cached_factorizations.keys())[0]) self.logger.debug(f'Evicted matrix factorization for {dt=:.6f} from cache') - self.cached_factorizations[dt] = self.spectral.linalg.factorized(A) + + if self.heterogeneous: + import scipy.sparse as sp + + cpu_decomp = sp.linalg.splu(A) + if self.useGPU: + from cupyx.scipy.sparse.linalg import SuperLU + + solver = SuperLU(cpu_decomp).solve + else: + solver = cpu_decomp.solve + else: + solver = self.spectral.linalg.factorized(A) + + self.cached_factorizations[dt] = solver self.logger.debug(f'Cached matrix factorization for {dt=:.6f}') self.work_counters['factorizations']() diff --git a/pySDC/tests/test_problems/test_RayleighBenard3D.py b/pySDC/tests/test_problems/test_RayleighBenard3D.py index d2d1e4224b..0963632017 100644 --- a/pySDC/tests/test_problems/test_RayleighBenard3D.py +++ b/pySDC/tests/test_problems/test_RayleighBenard3D.py @@ -191,6 +191,48 @@ def test_Poisson_problem_w(): assert np.allclose(u_exact[i], u[i]), f'Unexpected solution in component {comp}' +@pytest.mark.mpi4py +@pytest.mark.parametrize('solver_type', ['gmres+ilu', 'bicgstab+ilu']) +@pytest.mark.parametrize('N', [4, 16]) +@pytest.mark.parametrize('left_preconditioner', [True, False]) +@pytest.mark.parametrize('Dirichlet_recombination', [True, False]) +def test_solver_convergence(solver_type, N, left_preconditioner, Dirichlet_recombination): + import numpy as np + from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D + + fill_factor = 5 if left_preconditioner or Dirichlet_recombination else 10 + + P = RayleighBenard3D( + nx=N, + ny=N, + nz=N, + solver_type=solver_type, + solver_args={'atol': 1e-10, 'rtol': 0}, + preconditioner_args={'fill_factor': fill_factor, 'drop_tol': 1e-4}, + left_preconditioner=left_preconditioner, + Dirichlet_recombination=Dirichlet_recombination, + ) + P_direct = RayleighBenard3D(nx=N, ny=N, nz=N, solver_type='cached_direct') + + u0 = P.u_exact(0, noise_level=1.0e-3) + + dt = 1.0e-3 + u_direct = P_direct.solve_system(u0.copy(), dt) + u_good_ig = P.solve_system(u0.copy(), dt, u0=u_direct.copy()) + assert P.work_counters[P.solver_type].niter == 0 + assert np.allclose(u_good_ig, u_direct) + + u = P.solve_system(u0.copy(), dt, u0=u0.copy()) + + error = abs(u - u_direct) + assert error <= P.solver_args['atol'] * 1e3, error + + if 'ilu' in solver_type.lower(): + size_LU = P_direct.cached_factorizations[dt].__sizeof__() + size_iLU = P.cached_factorizations[dt].__sizeof__() + assert size_iLU < size_LU, 'iLU does not require less memory than LU!' + + @pytest.mark.mpi4py def test_libraries(): from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D @@ -231,9 +273,30 @@ def test_banded_matrix(preconditioning): ), 'One-sided bandwidth of LU decomposition is larger than that of the full matrix!' +@pytest.mark.cupy +def test_heterogeneous_implementation(): + from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D + + params = {'nx': 2, 'ny': 2, 'nz': 2, 'useGPU': True} + gpu = RayleighBenard3D(**params) + het = RayleighBenard3D(**params, heterogeneous=True) + + xp = gpu.xp + + u0 = gpu.u_exact() + + f = [me.eval_f(u0) for me in [gpu, het]] + assert xp.allclose(*f) + + un = [me.solve_system(u0, 1e-3) for me in [gpu, het]] + assert xp.allclose(*un) + + if __name__ == '__main__': # test_eval_f(2**2, 2**1, 'x', False) # test_libraries() # test_Poisson_problems(4, 'u') # test_Poisson_problem_w() - test_banded_matrix(False) + # test_solver_convergence('bicgstab+ilu', 32, False, True) + # test_banded_matrix(False) + test_heterogeneous_implementation()