1+ #= ========================================================================================
2+ # Magnetization Simulation
3+ #
4+ # This file implements the core magnetization simulation functionality for BlochSimulators.jl
5+ #
6+ # ## High-Level Workflow
7+ #
8+ # 1. User calls `simulate_magnetization(sequence, parameters)`
9+ # 2. The function automatically detects the computational resource based on input types:
10+ # - Single voxel → CPU1 (single-threaded)
11+ # - CPU array → CPUThreads (multi-threaded, default)
12+ # - CuArray → CUDALibs (GPU)
13+ # - DArray → CPUProcesses (distributed across workers)
14+ # 3. Memory is allocated for the output magnetization array
15+ # 4. The simulation is dispatched to the appropriate parallel execution strategy
16+ # 5. Results are returned as an array: (output_size(sequence)..., num_voxels)
17+ #
18+ # ## Architecture Overview
19+ #
20+ # The simulation uses multiple dispatch on `AbstractResource` types to select the execution
21+ # strategy:
22+ # - `CPU1()`: Serial execution, one voxel at a time
23+ # - `CPUThreads()`: Parallel execution using Julia threads (`Threads.@threads`)
24+ # - `CPUProcesses()`: Distributed execution across workers using `DArray`
25+ # - `CUDALibs()`: GPU execution with CUDA kernels (WARPSIZE threads per voxel)
26+ #
27+ # Each strategy calls the sequence-specific `simulate_magnetization!` method that
28+ # numerically integrates the Bloch equations for the given sequence and tissue properties
29+ # using either the isochromat or the EPG formalism.
30+ #
31+ # ## For Users
32+ #
33+ # Most users should call the convenience function without specifying a resource:
34+ # `magnetization = simulate_magnetization(sequence, parameters)`
35+ # Or, in case an NVIDIA GPU is available, one would typically do:
36+ # `magnetization = simulate_magnetization(gpu(f32(sequence,)) gpu(f32(parameters)))`
37+ #
38+ # The function will automatically select the best computational resource based on your input.
39+ #
40+ # ## For Developers
41+ #
42+ # When implementing a new sequence type (subtype of `BlochSimulator`), you must define:
43+ # 1. `simulate_magnetization!(output, sequence, state, tissue_properties)` - Core simulation
44+ # 2. `initialize_states(resource, sequence)` - Initialize state vectors
45+ # 3. `output_size(sequence)` - Dimensions of output per voxel
46+ # 4. `output_eltype(sequence)` - Element type of output (typically ComplexF32 or ComplexF64)
47+ #
48+ # See the Developer Guide in README.md for more details.
49+ #= ========================================================================================#
50+
51+ #= ========================================================================================
52+ # PUBLIC API
53+ =========================================================================================#
54+
155"""
256 simulate_magnetization(resource, sequence, parameters)
357
@@ -47,6 +101,42 @@ function simulate_magnetization(
47101 return magnetization
48102end
49103
104+ #= ========================================================================================
105+ # CONVENIENCE METHODS - Automatic Resource Selection
106+ =========================================================================================#
107+
108+ """
109+ simulate_magnetization(sequence, parameters)
110+
111+ Convenience function to simulate magnetization without specifying the computational
112+ resource. The function automatically selects the appropriate resource based on the type of
113+ the `sequence` and `parameters`. The fallback case is to use multi-threaded CPU
114+ computations.
115+
116+ # Automatic Resource Selection Rules
117+ - Single `AbstractTissueProperties` → `CPU1()` (single-threaded)
118+ - `StructArray` on CPU → `CPUThreads()` (multi-threaded, default)
119+ - `CuArray` → `CUDALibs()` (GPU)
120+ - `DArray` → `CPUProcesses()` (distributed across workers)
121+ """
122+ function simulate_magnetization(sequence::BlochSimulator, parameters::StructArray)
123+
124+ sequence_on_gpu = _all_arrays_are_cuarrays(sequence)
125+ parameters_on_gpu = _all_arrays_are_cuarrays(parameters)
126+
127+ if xor(sequence_on_gpu, parameters_on_gpu)
128+ throw(ArgumentError("Both sequence and parameters must be on the GPU or not on the GPU"))
129+ end
130+
131+ if sequence_on_gpu && parameters_on_gpu
132+ return simulate_magnetization(CUDALibs(), sequence, parameters)
133+ elseif parameters isa DArray
134+ return simulate_magnetization(CPUProcesses(), sequence, parameters)
135+ else
136+ return simulate_magnetization(CPUThreads(), sequence, parameters)
137+ end
138+ end
139+
50140"""
51141If no `resource` is provided, the simulation is performed on the CPU in a multi-threaded
52142fashion by default.
@@ -80,31 +170,13 @@ function simulate_magnetization(sequence, tissue_properties::AbstractTissuePrope
80170 simulate_magnetization(CPU1(), sequence, parameters)
81171end
82172
83- """
84- function simulate_magnetization(sequence, parameters)
85-
86- Convenience function to simulate magnetization without specifying the computational
87- resource. The function automatically selects the appropriate resource based on the type of
88- the `sequence` and `parameters`. The fallback case is to use multi-threaded CPU
89- computations.
90- """
91- function simulate_magnetization (sequence:: BlochSimulator , parameters:: StructArray )
92-
93- sequence_on_gpu = _all_arrays_are_cuarrays (sequence)
94- parameters_on_gpu = _all_arrays_are_cuarrays (parameters)
95-
96- if xor (sequence_on_gpu, parameters_on_gpu)
97- throw (ArgumentError (" Both sequence and parameters must be on the GPU or not on the GPU" ))
98- end
99-
100- if sequence_on_gpu && parameters_on_gpu
101- return simulate_magnetization (CUDALibs (), sequence, parameters)
102- elseif parameters isa DArray
103- return simulate_magnetization (CPUProcesses (), sequence, parameters)
104- else
105- return simulate_magnetization (CPUThreads (), sequence, parameters)
106- end
107- end
173+ #= ========================================================================================
174+ # RESOURCE-SPECIFIC IMPLEMENTATIONS
175+ #
176+ # These methods implement the parallel execution strategy for each computational resource.
177+ # They are called by the public API after memory allocation and dispatch based on the
178+ # `AbstractResource` type.
179+ =========================================================================================#
108180
109181"""
110182 simulate_magnetization!(magnetization, resource, sequence, parameters)
@@ -113,55 +185,62 @@ Simulate the magnetization response for all combinations of tissue properties co
113185`parameters` and stores the results in the pre-allocated `magnetization` array. The actual
114186implementation depends on the computational resource specified in `resource`.
115187
116- This function is called by `simulate_magnetization` and is not intended considered part of
117- te public API.
188+ This function is called by `simulate_magnetization` and is not considered part of the
189+ public API.
118190"""
119191function simulate_magnetization!(magnetization, ::CPU1, sequence, parameters)
120- vd = length (size (magnetization))
192+ # Serial execution: Loop over voxels one at a time
193+ vd = length(size(magnetization)) # Get voxel dimension (last dimension)
121194 for voxel ∈ eachindex(parameters)
195+ # Initialize state vector for this voxel
122196 state = initialize_states(CPU1(), sequence)
197+ # Simulate and store result in the voxel's slice of the output array
123198 simulate_magnetization!(selectdim(magnetization, vd, voxel), sequence, state, parameters[voxel])
124199 end
125200 return nothing
126201end
127202
128203function simulate_magnetization!(magnetization, ::CPUThreads, sequence, parameters)
129- vd = length (size (magnetization))
204+ # Parallel execution using Julia threads: Each thread processes voxels independently
205+ vd = length(size(magnetization)) # Get voxel dimension (last dimension)
130206 Threads.@threads for voxel ∈ eachindex(parameters)
207+ # Each thread initializes its own state vector (thread-safe)
131208 state = initialize_states(CPUThreads(), sequence)
209+ # Simulate and store result in the voxel's slice of the output array
132210 simulate_magnetization!(selectdim(magnetization, vd, voxel), sequence, state, parameters[voxel])
133211 end
134212 return nothing
135213end
136214
137215function simulate_magnetization!(magnetization, ::CPUProcesses, sequence, parameters::DArray)
138- # Spawn tasks on each worker
216+ # Distributed execution across workers: Each worker processes its local partition
217+ # The `:lp` selector accesses the local partition of the DArray on each worker
139218 @sync [@spawnat p simulate_magnetization!(magnetization[:lp], CPU1(), sequence, parameters[:lp]) for p in workers()]
140219 return nothing
141220end
142221
143222function simulate_magnetization!(magnetization, ::CUDALibs, sequence, parameters)
144-
145- # Each voxel gets assigned `WARPSIZE` threads. Since `THREADS_PER_BLOCK `is fixed,
146- # the number of required blocks can then be calculated
223+ # GPU execution: Launch CUDA kernel with multiple threads per voxel
224+ # Each voxel gets assigned `WARPSIZE` threads for coalesced memory access
225+ # Since `THREADS_PER_BLOCK` is fixed, the number of required blocks is calculated
147226 nr_voxels = length(parameters)
148227 nr_blocks = cld(nr_voxels * WARPSIZE, THREADS_PER_BLOCK)
149228
150- # define kernel
229+ # Define kernel function (executed on GPU)
151230 magnetization_kernel!(magnetization, sequence, parameters) = begin
152231
153- # get voxel index
232+ # Calculate voxel index from block and thread indices
154233 voxel = cld((blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x, WARPSIZE)
155234
156- # # initialize state that gets updated during time integration
235+ # Initialize state that gets updated during time integration
157236 states = initialize_states(CUDALibs(), sequence)
158237
159- # do nothing if voxel index is out of bounds
238+ # Do nothing if voxel index is out of bounds (extra threads in last block)
160239 if voxel > length(parameters)
161240 return nothing
162241 end
163242
164- # run simulation for voxel
243+ # Run simulation for this voxel
165244 simulate_magnetization!(
166245 view(magnetization, :, voxel),
167246 sequence,
@@ -170,21 +249,28 @@ function simulate_magnetization!(magnetization, ::CUDALibs, sequence, parameters
170249 )
171250 end
172251
252+ # Launch kernel and synchronize
173253 CUDA.@sync begin
174254 @cuda blocks = nr_blocks threads = THREADS_PER_BLOCK magnetization_kernel!(magnetization, sequence, parameters)
175255 end
176256 return nothing
177257end
178258
259+ #= ========================================================================================
260+ # INTERNAL UTILITIES
261+ #
262+ # Helper functions for memory allocation and array management.
263+ =========================================================================================#
264+
179265"""
180266 _allocate_magnetization_array(resource, sequence, parameters)
181267
182268Allocate an array to store the output of the Bloch simulations (per voxel, echo times only)
183269to be performed with the `sequence`. For each `BlochSimulator`, methods should have been
184270added to `output_eltype` and `output_size` for this function to work properly.
185271
186- This function is called by `simulate_magnetization` and is not intended considered part of
187- te public API.
272+ This function is called by `simulate_magnetization` and is not considered part of the
273+ public API.
188274
189275# Returns
190276- `magnetization_array`: An array allocated on the specified `resource`, formatted to store
@@ -206,8 +292,8 @@ Allocate an array on the specified `resource` with the given element type `_elty
206292`resource` is `CUDALibs()`, the array is allocated on the GPU. For `CPUProcesses()`, the
207293array is distributed in the "voxel"-dimension over multiple CPU workers.
208294
209- This function is called by `_allocate_magnetization_array` and is not intended considered
210- part of te public API.
295+ This function is called by `_allocate_magnetization_array` and is not considered part of
296+ the public API.
211297"""
212298function _allocate_array_on_resource(::Union{CPU1,CPUThreads}, _eltype, _size)
213299 return zeros(_eltype, _size...)
0 commit comments