Skip to content

Commit 4f6a2d7

Browse files
brownbaerchenThomas
andauthored
Heterogeneous computing in spectral methods (#571)
* Started heterogeneous RBC implementation * Refactor * Implemented more general heterogeneous solves * Fixes --------- Co-authored-by: Thomas <[email protected]>
1 parent a7ea5e0 commit 4f6a2d7

File tree

2 files changed

+117
-4
lines changed

2 files changed

+117
-4
lines changed

pySDC/implementations/problem_classes/generic_spectral.py

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def __init__(
6464
max_cached_factorizations=12,
6565
spectral_space=True,
6666
real_spectral_coefficients=False,
67+
heterogeneous=False,
6768
debug=False,
6869
):
6970
"""
@@ -81,6 +82,7 @@ def __init__(
8182
max_cached_factorizations (int): Number of matrix decompositions to cache before starting eviction
8283
spectral_space (bool): If yes, the solution will not be transformed back after solving and evaluating the RHS, and is expected as input in spectral space to these functions
8384
real_spectral_coefficients (bool): If yes, allow only real values in spectral space, otherwise, allow complex.
85+
heterogeneous (bool): If yes, perform memory intensive sparse matrix operations on CPU
8486
debug (bool): Make additional tests at extra computational cost
8587
"""
8688
solver_args = {} if solver_args is None else solver_args
@@ -100,6 +102,7 @@ def __init__(
100102
'comm',
101103
'spectral_space',
102104
'real_spectral_coefficients',
105+
'heterogeneous',
103106
'debug',
104107
localVars=locals(),
105108
)
@@ -126,6 +129,26 @@ def __init__(
126129

127130
self.cached_factorizations = {}
128131

132+
if self.heterogeneous:
133+
self.__heterogeneous_setup = False
134+
135+
def heterogeneous_setup(self):
136+
if self.heterogeneous and not self.__heterogeneous_setup:
137+
CPU_only = ['BC_line_zero_matrix', 'BCs']
138+
both = ['Pl', 'Pr', 'L', 'M']
139+
140+
if self.useGPU:
141+
for key in CPU_only:
142+
setattr(self.spectral, key, getattr(self.spectral, key).get())
143+
144+
for key in both:
145+
setattr(self, f'{key}_CPU', getattr(self, key).get())
146+
else:
147+
for key in both:
148+
setattr(self, f'{key}_CPU', getattr(self, key))
149+
150+
self.__heterogeneous_setup = True
151+
129152
def __getattr__(self, name):
130153
"""
131154
Pass requests on to the helper if they are not directly attributes of this class for convenience.
@@ -232,6 +255,8 @@ def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs)
232255

233256
sp = self.spectral.sparse_lib
234257

258+
self.heterogeneous_setup()
259+
235260
if self.spectral_space:
236261
rhs_hat = rhs.copy()
237262
if u0 is not None:
@@ -256,8 +281,19 @@ def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs)
256281
rhs_hat = self.Pl @ rhs_hat.flatten()
257282

258283
if dt not in self.cached_factorizations.keys() or not self.solver_type.lower() == 'cached_direct':
259-
A = self.M + dt * self.L
260-
A = self.Pl @ self.spectral.put_BCs_in_matrix(A) @ self.Pr
284+
if self.heterogeneous:
285+
M = self.M_CPU
286+
L = self.L_CPU
287+
Pl = self.Pl_CPU
288+
Pr = self.Pr_CPU
289+
else:
290+
M = self.M
291+
L = self.L
292+
Pl = self.Pl
293+
Pr = self.Pr
294+
295+
A = M + dt * L
296+
A = Pl @ self.spectral.put_BCs_in_matrix(A) @ Pr
261297

262298
# if A.shape[0] < 200e20:
263299
# import matplotlib.pyplot as plt
@@ -289,7 +325,21 @@ def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs)
289325
if len(self.cached_factorizations) >= self.max_cached_factorizations:
290326
self.cached_factorizations.pop(list(self.cached_factorizations.keys())[0])
291327
self.logger.debug(f'Evicted matrix factorization for {dt=:.6f} from cache')
292-
self.cached_factorizations[dt] = self.spectral.linalg.factorized(A)
328+
329+
if self.heterogeneous:
330+
import scipy.sparse as sp
331+
332+
cpu_decomp = sp.linalg.splu(A)
333+
if self.useGPU:
334+
from cupyx.scipy.sparse.linalg import SuperLU
335+
336+
solver = SuperLU(cpu_decomp).solve
337+
else:
338+
solver = cpu_decomp.solve
339+
else:
340+
solver = self.spectral.linalg.factorized(A)
341+
342+
self.cached_factorizations[dt] = solver
293343
self.logger.debug(f'Cached matrix factorization for {dt=:.6f}')
294344
self.work_counters['factorizations']()
295345

pySDC/tests/test_problems/test_RayleighBenard3D.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,48 @@ def test_Poisson_problem_w():
191191
assert np.allclose(u_exact[i], u[i]), f'Unexpected solution in component {comp}'
192192

193193

194+
@pytest.mark.mpi4py
195+
@pytest.mark.parametrize('solver_type', ['gmres+ilu', 'bicgstab+ilu'])
196+
@pytest.mark.parametrize('N', [4, 16])
197+
@pytest.mark.parametrize('left_preconditioner', [True, False])
198+
@pytest.mark.parametrize('Dirichlet_recombination', [True, False])
199+
def test_solver_convergence(solver_type, N, left_preconditioner, Dirichlet_recombination):
200+
import numpy as np
201+
from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
202+
203+
fill_factor = 5 if left_preconditioner or Dirichlet_recombination else 10
204+
205+
P = RayleighBenard3D(
206+
nx=N,
207+
ny=N,
208+
nz=N,
209+
solver_type=solver_type,
210+
solver_args={'atol': 1e-10, 'rtol': 0},
211+
preconditioner_args={'fill_factor': fill_factor, 'drop_tol': 1e-4},
212+
left_preconditioner=left_preconditioner,
213+
Dirichlet_recombination=Dirichlet_recombination,
214+
)
215+
P_direct = RayleighBenard3D(nx=N, ny=N, nz=N, solver_type='cached_direct')
216+
217+
u0 = P.u_exact(0, noise_level=1.0e-3)
218+
219+
dt = 1.0e-3
220+
u_direct = P_direct.solve_system(u0.copy(), dt)
221+
u_good_ig = P.solve_system(u0.copy(), dt, u0=u_direct.copy())
222+
assert P.work_counters[P.solver_type].niter == 0
223+
assert np.allclose(u_good_ig, u_direct)
224+
225+
u = P.solve_system(u0.copy(), dt, u0=u0.copy())
226+
227+
error = abs(u - u_direct)
228+
assert error <= P.solver_args['atol'] * 1e3, error
229+
230+
if 'ilu' in solver_type.lower():
231+
size_LU = P_direct.cached_factorizations[dt].__sizeof__()
232+
size_iLU = P.cached_factorizations[dt].__sizeof__()
233+
assert size_iLU < size_LU, 'iLU does not require less memory than LU!'
234+
235+
194236
@pytest.mark.mpi4py
195237
def test_libraries():
196238
from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
@@ -231,9 +273,30 @@ def test_banded_matrix(preconditioning):
231273
), 'One-sided bandwidth of LU decomposition is larger than that of the full matrix!'
232274

233275

276+
@pytest.mark.cupy
277+
def test_heterogeneous_implementation():
278+
from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
279+
280+
params = {'nx': 2, 'ny': 2, 'nz': 2, 'useGPU': True}
281+
gpu = RayleighBenard3D(**params)
282+
het = RayleighBenard3D(**params, heterogeneous=True)
283+
284+
xp = gpu.xp
285+
286+
u0 = gpu.u_exact()
287+
288+
f = [me.eval_f(u0) for me in [gpu, het]]
289+
assert xp.allclose(*f)
290+
291+
un = [me.solve_system(u0, 1e-3) for me in [gpu, het]]
292+
assert xp.allclose(*un)
293+
294+
234295
if __name__ == '__main__':
235296
# test_eval_f(2**2, 2**1, 'x', False)
236297
# test_libraries()
237298
# test_Poisson_problems(4, 'u')
238299
# test_Poisson_problem_w()
239-
test_banded_matrix(False)
300+
# test_solver_convergence('bicgstab+ilu', 32, False, True)
301+
# test_banded_matrix(False)
302+
test_heterogeneous_implementation()

0 commit comments

Comments
 (0)