Skip to content

Commit baa1c3e

Browse files
authored
[maint] scaling benchmark (#27)
* Add performance scaling benchmark * Scale the benchmark * I think we want to test with the default 32 frames * Lower the max time per test --------- Co-authored-by: Charles Guan <3221512+charlesincharge@users.noreply.github.com>
1 parent ee9c1fa commit baa1c3e

File tree

1 file changed

+338
-0
lines changed

1 file changed

+338
-0
lines changed
Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
"""Performance scaling tests for mach beamformer.
2+
3+
These tests measure how mach performance scales with different dataset dimensions:
4+
- Number of voxels (via grid resolution)
5+
- Number of receive elements
6+
- Ensemble size (number of frames)
7+
8+
Usage:
9+
# Run scaling tests without benchmarking
10+
pytest tests/compare/test_performance_scaling.py --benchmark-disable
11+
12+
# Benchmark scaling performance
13+
pytest tests/compare/test_performance_scaling.py --benchmark-histogram --benchmark-sort=mean
14+
"""
15+
16+
import numpy as np
17+
import pytest
18+
from einops import rearrange
19+
20+
# Import PyMUST for data conversion
21+
pytest.importorskip("pymust")
22+
import pymust
23+
24+
try:
25+
import cupy as cp
26+
27+
HAS_CUPY = True
28+
except ImportError:
29+
HAS_CUPY = False
30+
cp = None
31+
32+
from mach import geometry, wavefront
33+
from mach.kernel import nb_beamform
34+
35+
36+
@pytest.fixture(scope="session")
37+
def pymust_iq_data(pymust_data, pymust_params):
38+
"""Extract RF/IQ data from PyMUST data file."""
39+
mat_data = pymust_data
40+
rf_data = mat_data["RF"].astype(float)
41+
iq_data = pymust.rf2iq(rf_data, pymust_params)
42+
return np.ascontiguousarray(iq_data, dtype=np.complex64)
43+
44+
45+
@pytest.fixture(scope="session")
46+
def base_scaling_data(pymust_iq_data, pymust_element_positions, pymust_params):
47+
"""Base data for scaling tests - single frame, baseline resolution."""
48+
49+
# Use single frame for baseline
50+
# single_frame_data = pymust_iq_data[:, :, :1].copy()
51+
52+
# Base grid with 1e-4 resolution (roughly 100 μm spacing)
53+
n_x = 251 # Same as original PyMUST tests
54+
n_z = 251
55+
56+
x_range = np.linspace(-1.25e-2, 1.25e-2, num=n_x, endpoint=True)
57+
y_range = np.array([0.0])
58+
z_range = np.linspace(1e-2, 3.5e-2, num=n_z, endpoint=True)
59+
60+
x_grid, z_grid = np.meshgrid(x_range, z_range, indexing="ij")
61+
y_grid = np.zeros_like(x_grid)
62+
63+
grid_points = np.stack([x_grid.flat, y_grid.flat, z_grid.flat], axis=-1)
64+
65+
# Compute transmit arrivals for plane wave
66+
sound_speed_m_s = pymust_params["c"]
67+
direction = np.asarray(geometry.ultrasound_angles_to_cartesian(0, 0))
68+
69+
transmit_arrivals_s = (
70+
wavefront.plane(
71+
origin_m=np.array([0, 0, 0]),
72+
points_m=grid_points,
73+
direction=direction,
74+
)
75+
/ sound_speed_m_s
76+
)
77+
78+
# Reorder IQ data for mach format
79+
iq_data_reordered = np.ascontiguousarray(
80+
rearrange(pymust_iq_data, "n_samples n_elements n_frames -> n_elements n_samples n_frames"),
81+
dtype=np.complex64,
82+
)
83+
84+
return {
85+
"iq_data": iq_data_reordered,
86+
"element_positions": pymust_element_positions,
87+
"scan_coords_m": grid_points.astype(np.float32),
88+
"transmit_arrivals_s": transmit_arrivals_s.flatten().astype(np.float32),
89+
"params": pymust_params,
90+
"grid_shape": (n_x, len(y_range), n_z),
91+
"x_range": x_range,
92+
"y_range": y_range,
93+
"z_range": z_range,
94+
}
95+
96+
97+
@pytest.mark.benchmark(
98+
group="scaling_voxels",
99+
min_time=0.1,
100+
max_time=1.0,
101+
min_rounds=3,
102+
warmup=True,
103+
warmup_iterations=1,
104+
)
105+
@pytest.mark.skipif(not HAS_CUPY, reason="CuPy not available")
106+
@pytest.mark.parametrize(
107+
"grid_resolution",
108+
[
109+
pytest.param(1e-4, id="res_1e-4"),
110+
pytest.param(5e-5, id="res_5e-5"),
111+
pytest.param(2e-5, id="res_2e-5"),
112+
pytest.param(1e-5, id="res_1e-5"),
113+
],
114+
)
115+
def test_scaling_voxels(benchmark, base_scaling_data, grid_resolution):
116+
"""Test performance scaling with number of voxels (grid resolution)."""
117+
118+
data = base_scaling_data
119+
params = data["params"]
120+
121+
# Calculate grid points for desired resolution
122+
x_extent = data["x_range"][-1] - data["x_range"][0]
123+
z_extent = data["z_range"][-1] - data["z_range"][0]
124+
125+
n_x = int(x_extent / grid_resolution) + 1
126+
n_z = int(z_extent / grid_resolution) + 1
127+
128+
# No grid size limit - let it scale to test performance properly
129+
130+
# Create new grid
131+
x_range = np.linspace(data["x_range"][0], data["x_range"][-1], num=n_x, endpoint=True)
132+
z_range = np.linspace(data["z_range"][0], data["z_range"][-1], num=n_z, endpoint=True)
133+
134+
x_grid, z_grid = np.meshgrid(x_range, z_range, indexing="ij")
135+
y_grid = np.zeros_like(x_grid)
136+
137+
grid_points = np.stack([x_grid.flat, y_grid.flat, z_grid.flat], axis=-1)
138+
139+
# Compute transmit arrivals for new grid
140+
sound_speed_m_s = float(params["c"])
141+
direction = np.asarray(geometry.ultrasound_angles_to_cartesian(0, 0))
142+
143+
transmit_arrivals_s = (
144+
wavefront.plane(
145+
origin_m=np.array([0, 0, 0]),
146+
points_m=grid_points,
147+
direction=direction,
148+
)
149+
/ sound_speed_m_s
150+
)
151+
152+
# Transfer to GPU
153+
iq_data_gpu = cp.asarray(data["iq_data"])
154+
element_positions_gpu = cp.asarray(data["element_positions"])
155+
scan_coords_gpu = cp.asarray(grid_points, dtype=cp.float32)
156+
transmit_arrivals_gpu = cp.asarray(transmit_arrivals_s.flatten(), dtype=cp.float32)
157+
158+
n_scan = scan_coords_gpu.shape[0]
159+
n_frames = iq_data_gpu.shape[2]
160+
out = cp.empty((n_scan, n_frames), dtype=cp.complex64)
161+
162+
def mach_voxel_scaling():
163+
"""mach GPU function for voxel scaling benchmark."""
164+
out[:] = 0.0
165+
result = nb_beamform(
166+
channel_data=iq_data_gpu,
167+
rx_coords_m=element_positions_gpu,
168+
scan_coords_m=scan_coords_gpu,
169+
tx_wave_arrivals_s=transmit_arrivals_gpu,
170+
out=out,
171+
f_number=float(params["fnumber"]),
172+
rx_start_s=float(params["t0"]),
173+
sampling_freq_hz=float(params["fs"]),
174+
sound_speed_m_s=sound_speed_m_s,
175+
modulation_freq_hz=float(params["fc"]),
176+
tukey_alpha=0.0,
177+
)
178+
return result
179+
180+
# Benchmark the function
181+
benchmark(mach_voxel_scaling)
182+
183+
# Verify basic properties (use the output array, not benchmark return value)
184+
expected_shape = (n_scan, n_frames)
185+
assert out.shape == expected_shape
186+
assert np.isfinite(cp.asnumpy(out)).all()
187+
188+
189+
@pytest.mark.benchmark(
190+
group="scaling_elements",
191+
min_time=0.1,
192+
max_time=1.0,
193+
min_rounds=3,
194+
warmup=True,
195+
warmup_iterations=1,
196+
)
197+
@pytest.mark.skipif(not HAS_CUPY, reason="CuPy not available")
198+
@pytest.mark.parametrize(
199+
"element_multiplier",
200+
[
201+
pytest.param(1, id="1x_elements"),
202+
pytest.param(2, id="2x_elements"),
203+
pytest.param(4, id="4x_elements"),
204+
pytest.param(8, id="8x_elements"),
205+
pytest.param(16, id="16x_elements"),
206+
pytest.param(32, id="32x_elements"),
207+
pytest.param(64, id="64x_elements"),
208+
],
209+
)
210+
def test_scaling_receive_elements(benchmark, base_scaling_data, element_multiplier):
211+
"""Test performance scaling with number of receive elements."""
212+
213+
data = base_scaling_data
214+
params = data["params"]
215+
216+
# Duplicate the element data and positions
217+
original_iq = data["iq_data"]
218+
original_positions = data["element_positions"]
219+
220+
# Tile the IQ data along the element axis
221+
scaled_iq = np.tile(original_iq, (element_multiplier, 1, 1))
222+
223+
# Create new element positions by duplicating and slightly offsetting
224+
n_original_elements = original_positions.shape[0]
225+
scaled_positions = np.zeros((n_original_elements * element_multiplier, 3), dtype=np.float32)
226+
227+
for i in range(element_multiplier):
228+
start_idx = i * n_original_elements
229+
end_idx = (i + 1) * n_original_elements
230+
scaled_positions[start_idx:end_idx] = original_positions.copy()
231+
# Keep identical positions - no offset needed
232+
233+
# Transfer to GPU
234+
iq_data_gpu = cp.asarray(scaled_iq)
235+
element_positions_gpu = cp.asarray(scaled_positions)
236+
scan_coords_gpu = cp.asarray(data["scan_coords_m"])
237+
transmit_arrivals_gpu = cp.asarray(data["transmit_arrivals_s"])
238+
239+
n_scan = scan_coords_gpu.shape[0]
240+
n_frames = iq_data_gpu.shape[2]
241+
out = cp.empty((n_scan, n_frames), dtype=cp.complex64)
242+
243+
def mach_element_scaling():
244+
"""mach GPU function for element scaling benchmark."""
245+
out[:] = 0.0
246+
result = nb_beamform(
247+
channel_data=iq_data_gpu,
248+
rx_coords_m=element_positions_gpu,
249+
scan_coords_m=scan_coords_gpu,
250+
tx_wave_arrivals_s=transmit_arrivals_gpu,
251+
out=out,
252+
f_number=float(params["fnumber"]),
253+
rx_start_s=float(params["t0"]),
254+
sampling_freq_hz=float(params["fs"]),
255+
sound_speed_m_s=float(params["c"]),
256+
modulation_freq_hz=float(params["fc"]),
257+
tukey_alpha=0.0,
258+
)
259+
return result
260+
261+
# Benchmark the function
262+
benchmark(mach_element_scaling)
263+
264+
# Verify basic properties (use the output array, not benchmark return value)
265+
expected_shape = (n_scan, n_frames)
266+
assert out.shape == expected_shape
267+
assert np.isfinite(cp.asnumpy(out)).all()
268+
269+
270+
@pytest.mark.benchmark(
271+
group="scaling_frames",
272+
min_time=0.1,
273+
max_time=1.0,
274+
min_rounds=3,
275+
warmup=True,
276+
warmup_iterations=1,
277+
)
278+
@pytest.mark.skipif(not HAS_CUPY, reason="CuPy not available")
279+
@pytest.mark.parametrize(
280+
"frame_multiplier",
281+
[
282+
pytest.param(1 / 32, id="1/32x_frames (1 frame)"),
283+
pytest.param(1 / 8, id="1/8x_frames (4 frames)"),
284+
pytest.param(1, id="1x_frames"),
285+
pytest.param(4, id="4x_frames"),
286+
pytest.param(16, id="16x_frames"),
287+
pytest.param(64, id="64x_frames"),
288+
],
289+
)
290+
def test_scaling_ensemble_size(benchmark, base_scaling_data, frame_multiplier):
291+
"""Test performance scaling with ensemble size (number of frames)."""
292+
293+
data = base_scaling_data
294+
params = data["params"]
295+
296+
# Duplicate the frame data
297+
original_iq = data["iq_data"]
298+
if frame_multiplier < 1:
299+
n_frames = round(original_iq.shape[2] * frame_multiplier)
300+
scaled_iq = original_iq[:, :, :n_frames]
301+
else:
302+
scaled_iq = np.tile(original_iq, (1, 1, frame_multiplier))
303+
304+
# Transfer to GPU
305+
iq_data_gpu = cp.asarray(scaled_iq)
306+
element_positions_gpu = cp.asarray(data["element_positions"])
307+
scan_coords_gpu = cp.asarray(data["scan_coords_m"])
308+
transmit_arrivals_gpu = cp.asarray(data["transmit_arrivals_s"])
309+
310+
n_scan = scan_coords_gpu.shape[0]
311+
n_frames = iq_data_gpu.shape[2]
312+
out = cp.empty((n_scan, n_frames), dtype=cp.complex64)
313+
314+
def mach_frame_scaling():
315+
"""mach GPU function for frame scaling benchmark."""
316+
out[:] = 0.0
317+
result = nb_beamform(
318+
channel_data=iq_data_gpu,
319+
rx_coords_m=element_positions_gpu,
320+
scan_coords_m=scan_coords_gpu,
321+
tx_wave_arrivals_s=transmit_arrivals_gpu,
322+
out=out,
323+
f_number=float(params["fnumber"]),
324+
rx_start_s=float(params["t0"]),
325+
sampling_freq_hz=float(params["fs"]),
326+
sound_speed_m_s=float(params["c"]),
327+
modulation_freq_hz=float(params["fc"]),
328+
tukey_alpha=0.0,
329+
)
330+
return result
331+
332+
# Benchmark the function
333+
benchmark(mach_frame_scaling)
334+
335+
# Verify basic properties (use the output array, not benchmark return value)
336+
expected_shape = (n_scan, n_frames)
337+
assert out.shape == expected_shape
338+
assert np.isfinite(cp.asnumpy(out)).all()

0 commit comments

Comments
 (0)