Skip to content

Commit c98e4bb

Browse files
authored
SIMD implementations for particle to grid evaluation loops (#233)
* Prepare for benchmark of grid loop
* Add benchmark
* Start implementing NEON variant of dense grid loop
* Implement NEON kernel
* Improve remainder processing
* Small optimizations
* Fix limits of evaluation
* Add benchmark data
* Update bounds
* Reduce code duplication
* Reformulate kernel
* Improve performance
* Fix use of Neon features
* Fix errors with Canyon benchmarks
* Add AVX implementation
* Use FMA
* Auto dispatch for vectorization
* Formatting
* Use vectorization in surface reconstruction
* Add CLI flags to enable SIMD
* Show vectorization type in timings
* Refactor, enable vectorization on 32bit as well
* Refactoring, add some safety checks
* Log detected SIMD features
* Move code for SIMD kernels
* Implement test for NEON cubic spline kernel
* Fixes on x86
* Rename CLI arg for SIMD
* Add AVX cubic kernel test
* Py: Add SIMD arguments
* Add serde-serialize feature, move code to benchmark
* Update documentation
* Update to main
* Fix imports of Scalar
* Reduce code duplication
* Reduce code duplication dense/sparse
* Build & test on more targets
* Ignore kernel tests based on cfg
* Fix cache keys
* Add warning for SIMD + non-f32
* Specify minimum Rust version
1 parent 6b711a8 commit c98e4bb

26 files changed

+1534
-460
lines changed

.github/workflows/build.yml

Lines changed: 64 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,19 @@ jobs:
2828
- uses: actions/checkout@v4
2929
with:
3030
lfs: true
31+
- name: Update rust toolchain
32+
run: |
33+
rustup update stable
34+
rustup default stable
35+
rustup --version
36+
- name: Cache rust dependencies
37+
uses: actions/cache@v4
38+
with:
39+
path: |
40+
~/.cargo/registry
41+
~/.cargo/git
42+
target
43+
key: ${{ runner.os }}-dev-cargo-${{ hashFiles('**/Cargo.lock') }}
3144
- name: Check (default members)
3245
run: cargo check
3346
- name: Check with examples
@@ -40,13 +53,30 @@ jobs:
4053
run: cargo test
4154

4255
build_release:
43-
name: Build & test release mode
44-
runs-on: ubuntu-latest
56+
name: Build & test release mode (${{ matrix.runner }})
57+
runs-on: ${{ matrix.runner }}
4558
needs: build_workspace
59+
strategy:
60+
fail-fast: false
61+
matrix:
62+
runner: [ubuntu-latest, ubuntu-24.04-arm, macos-14, macos-13, windows-latest]
4663
steps:
4764
- uses: actions/checkout@v4
4865
with:
4966
lfs: true
67+
- name: Update rust toolchain
68+
run: |
69+
rustup update stable
70+
rustup default stable
71+
rustup --version
72+
- name: Cache rust dependencies
73+
uses: actions/cache@v4
74+
with:
75+
path: |
76+
~/.cargo/registry
77+
~/.cargo/git
78+
target
79+
key: ${{ matrix.runner }}-release-cargo-${{ hashFiles('**/Cargo.lock') }}
5080
- name: Build (release)
5181
run: cargo build --release
5282
- name: Test (release)
@@ -59,6 +89,19 @@ jobs:
5989
- uses: actions/checkout@v4
6090
with:
6191
lfs: true
92+
- name: Update rust toolchain
93+
run: |
94+
rustup update stable
95+
rustup default stable
96+
rustup --version
97+
- name: Cache rust dependencies
98+
uses: actions/cache@v4
99+
with:
100+
path: |
101+
~/.cargo/registry
102+
~/.cargo/git
103+
target
104+
key: ${{ runner.os }}-lib-all-cargo-${{ hashFiles('**/Cargo.lock') }}
62105
- name: Build
63106
run: |
64107
cargo build --manifest-path="splashsurf_lib/Cargo.toml" --all-targets --all-features
@@ -76,6 +119,19 @@ jobs:
76119
- uses: actions/checkout@v4
77120
with:
78121
lfs: true
122+
- name: Update rust toolchain
123+
run: |
124+
rustup update stable
125+
rustup default stable
126+
rustup --version
127+
- name: Cache rust dependencies
128+
uses: actions/cache@v4
129+
with:
130+
path: |
131+
~/.cargo/registry
132+
~/.cargo/git
133+
target
134+
key: ${{ runner.os }}-lib-no-default-cargo-${{ hashFiles('**/Cargo.lock') }}
79135
- name: Build
80136
run: |
81137
cargo build --manifest-path="splashsurf_lib/Cargo.toml" --all-targets --no-default-features
@@ -87,9 +143,14 @@ jobs:
87143
name: Publish to crates.io
88144
runs-on: ubuntu-latest
89145
if: ${{ startsWith(github.ref, 'refs/tags/v') || (github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main') }}
90-
needs: [check_format, build_workspace, build_lib_all_features, build_lib_no_default_features]
146+
needs: [check_format, build_workspace, build_lib_all_features, build_lib_no_default_features, build_release]
91147
steps:
92148
- uses: actions/checkout@v4
149+
- name: Update rust toolchain
150+
run: |
151+
rustup update stable
152+
rustup default stable
153+
rustup --version
93154
- name: Publish splashsurf_lib
94155
run: |
95156
cargo publish --package splashsurf_lib

Cargo.lock

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

data/density_grid_loop_subdomain_33.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

pysplashsurf/src/pipeline.rs

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,8 @@ use crate::utils::{IndexT, pyerr_unsupported_scalar};
4848
/// Upper corner [x,y,z] of the AABB of particles to consider in the reconstruction.
4949
/// multi_threading
5050
/// Flag to enable multi-threading for the reconstruction and post-processing steps.
51+
/// simd
52+
/// Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture.
5153
/// subdomain_grid
5254
/// Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading.
5355
/// subdomain_grid_auto_disable
@@ -108,7 +110,7 @@ use crate::utils::{IndexT, pyerr_unsupported_scalar};
108110
#[pyo3(name = "reconstruction_pipeline")]
109111
#[pyo3(signature = (particles, *, attributes_to_interpolate = None,
110112
particle_radius, rest_density = 1000.0, smoothing_length, cube_size, iso_surface_threshold = 0.6,
111-
aabb_min = None, aabb_max = None, multi_threading = true,
113+
aabb_min = None, aabb_max = None, multi_threading = true, simd = true,
112114
subdomain_grid = true, subdomain_grid_auto_disable = true, subdomain_num_cubes_per_dim = 64,
113115
check_mesh_closed = false, check_mesh_manifold = false, check_mesh_orientation = false, check_mesh_debug = false,
114116
mesh_cleanup = false, mesh_cleanup_snap_dist = None, decimate_barnacles = false, keep_vertices = false, compute_normals = false, sph_normals = false,
@@ -128,6 +130,7 @@ pub fn reconstruction_pipeline<'py>(
128130
aabb_min: Option<[f64; 3]>,
129131
aabb_max: Option<[f64; 3]>,
130132
multi_threading: bool,
133+
simd: bool,
131134
subdomain_grid: bool,
132135
subdomain_grid_auto_disable: bool,
133136
subdomain_num_cubes_per_dim: u32,
@@ -192,6 +195,7 @@ pub fn reconstruction_pipeline<'py>(
192195
iso_surface_threshold,
193196
particle_aabb,
194197
enable_multi_threading: multi_threading,
198+
enable_simd: simd,
195199
spatial_decomposition,
196200
global_neighborhood_list: false,
197201
};

pysplashsurf/src/reconstruction.rs

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,8 @@ impl PySurfaceReconstruction {
122122
/// Upper corner of the AABB of particles to consider in the reconstruction.
123123
/// multi_threading
124124
/// Flag to enable multi-threading for the reconstruction and post-processing steps.
125+
/// simd
126+
/// Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture.
125127
/// subdomain_grid
126128
/// Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading.
127129
/// subdomain_grid_auto_disable
@@ -134,7 +136,7 @@ impl PySurfaceReconstruction {
134136
#[pyo3(signature = (particles, *,
135137
particle_radius, rest_density = 1000.0, smoothing_length, cube_size, iso_surface_threshold = 0.6,
136138
aabb_min = None, aabb_max = None,
137-
multi_threading = true, global_neighborhood_list = false,
139+
multi_threading = true, simd = true, global_neighborhood_list = false,
138140
subdomain_grid = true, subdomain_grid_auto_disable = true, subdomain_num_cubes_per_dim = 64
139141
))]
140142
pub fn reconstruct_surface<'py>(
@@ -147,6 +149,7 @@ pub fn reconstruct_surface<'py>(
147149
aabb_min: Option<[f64; 3]>,
148150
aabb_max: Option<[f64; 3]>,
149151
multi_threading: bool,
152+
simd: bool,
150153
global_neighborhood_list: bool,
151154
subdomain_grid: bool,
152155
subdomain_grid_auto_disable: bool,
@@ -175,6 +178,7 @@ pub fn reconstruct_surface<'py>(
175178
iso_surface_threshold,
176179
particle_aabb,
177180
enable_multi_threading: multi_threading,
181+
enable_simd: simd,
178182
spatial_decomposition,
179183
global_neighborhood_list,
180184
};

splashsurf/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ keywords = ["sph", "particle", "surface", "reconstruction", "marching-cubes"]
66
categories = ["command-line-utilities", "graphics", "science", "simulation", "visualization"]
77
readme = "README.md"
88
edition = "2024"
9+
rust-version = "1.88"
910

1011
authors.workspace = true
1112
license.workspace = true

splashsurf/src/reconstruct.rs

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,17 @@ pub(crate) struct ReconstructSubcommandArgs {
124124
/// Set the number of threads for the worker thread pool
125125
#[arg(help_heading = ARGS_ADV, long, short = 'n')]
126126
pub num_threads: Option<usize>,
127+
/// Enable vectorization of some computations using SIMD instructions (requires CPU with AVX2 or NEON support).
128+
/// Note that vectorization is currently only available in single precision (f32) mode.
129+
#[arg(
130+
help_heading = ARGS_ADV,
131+
long,
132+
default_value = "on",
133+
value_name = "off|on",
134+
ignore_case = true,
135+
require_equals = true
136+
)]
137+
pub simd: Switch,
127138

128139
/// Enable automatic spatial decomposition using a regular grid-based approach (for efficient multithreading) if the domain is large enough
129140
#[arg(
@@ -637,6 +648,7 @@ pub(crate) mod arguments {
637648
iso_surface_threshold: args.surface_threshold,
638649
particle_aabb,
639650
enable_multi_threading: args.parallelize_over_particles.into_bool(),
651+
enable_simd: args.simd.into_bool(),
640652
spatial_decomposition,
641653
global_neighborhood_list: args.mesh_smoothing_weights.into_bool(),
642654
};

splashsurf_lib/Cargo.toml

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ keywords = ["sph", "particle", "surface", "reconstruction", "marching-cubes"]
66
categories = ["graphics", "science", "simulation", "visualization", "rendering"]
77
readme = "README.md"
88
edition = "2024"
9+
rust-version = "1.88"
910

1011
documentation = "https://docs.rs/splashsurf_lib"
1112
authors.workspace = true
@@ -17,9 +18,15 @@ repository.workspace = true
1718
# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features --no-deps --open
1819
# Build with all features to properly document everything
1920
all-features = true
20-
# Build only for a single target as this crate does not have any platform specific behavior
21-
default-target = "x86_64-unknown-linux-gnu"
22-
targets = []
21+
targets = [
22+
"x86_64-unknown-linux-gnu",
23+
"i686-unknown-linux-gnu",
24+
"aarch64-unknown-linux-gnu",
25+
"aarch64-apple-darwin",
26+
"x86_64-apple-darwin",
27+
"x86_64-pc-windows-msvc",
28+
"i686-pc-windows-msvc"
29+
]
2330

2431
# Ignore the tests (especially the test mesh files) for publishing
2532
exclude = ["tests/*", "benches/*"]
@@ -29,6 +36,7 @@ default = []
2936
vtk_extras = ["vtkio"]
3037
profiling = []
3138
io = ["vtk_extras", "vtkio", "ply-rs", "nom", "serde_json", "flate2"]
39+
serde-serialize = ["serde", "serde_derive", "serde_json", "nalgebra/serde-serialize"]
3240

3341
[dependencies]
3442
log = "0.4"
@@ -59,6 +67,10 @@ flate2 = { version = "1.0", optional = true }
5967
nom = { version = "8.0", optional = true }
6068
serde_json = { version = "1.0", optional = true }
6169

70+
# Serialization
71+
serde = { version = "1.0", optional = true }
72+
serde_derive = { version = "1.0", optional = true }
73+
6274
[dev-dependencies]
6375
criterion = "0.7"
6476
ultraviolet = "0.10"
@@ -83,4 +95,4 @@ required-features = ["profiling", "io"]
8395
name = "splashsurf_lib_benches"
8496
path = "benches/splashsurf_lib_benches.rs"
8597
harness = false
86-
required-features = ["io"]
98+
required-features = ["io", "serde-serialize"]

splashsurf_lib/benches/benches/bench_full.rs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,7 @@ pub fn surface_reconstruction_dam_break(c: &mut Criterion) {
102102
iso_surface_threshold: 0.6,
103103
particle_aabb: None,
104104
enable_multi_threading: true,
105+
enable_simd: true,
105106
spatial_decomposition: SpatialDecomposition::None,
106107
global_neighborhood_list: false,
107108
};
@@ -161,6 +162,7 @@ pub fn surface_reconstruction_double_dam_break(c: &mut Criterion) {
161162
iso_surface_threshold: 0.6,
162163
particle_aabb: None,
163164
enable_multi_threading: true,
165+
enable_simd: true,
164166
spatial_decomposition: SpatialDecomposition::None,
165167
global_neighborhood_list: false,
166168
};
@@ -220,6 +222,7 @@ pub fn surface_reconstruction_double_dam_break_inplace(c: &mut Criterion) {
220222
iso_surface_threshold: 0.6,
221223
particle_aabb: None,
222224
enable_multi_threading: true,
225+
enable_simd: true,
223226
spatial_decomposition: SpatialDecomposition::None,
224227
global_neighborhood_list: false,
225228
};

0 commit comments

Comments
 (0)