
Commit da328da

Use CPU-GPU agnostic kernels in GPU VBD integrator
1 parent 5b92480 commit da328da

11 files changed: +221 -386 lines changed


bindings/pypbat/gpu/vbd/Vbd.cpp

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 #include "Vbd.h"
 
 #include <pbat/gpu/Aliases.h>
-#include <pbat/gpu/vbd/InitializationStrategy.h>
+#include <pbat/sim/vbd/Enums.h>
 #include <pbat/gpu/vbd/Vbd.h>
 #include <pbat/profiling/Profiling.h>
 #include <pybind11/eigen.h>
@@ -19,7 +19,7 @@ void Bind([[maybe_unused]] pybind11::module& m)
 #ifdef PBAT_USE_CUDA
 
 using namespace pbat;
-using pbat::gpu::vbd::EInitializationStrategy;
+using pbat::sim::vbd::EInitializationStrategy;
 using pbat::gpu::vbd::Vbd;
 
 pyb::enum_<EInitializationStrategy>(m, "InitializationStrategy")

source/pbat/gpu/vbd/CMakeLists.txt

Lines changed: 2 additions & 3 deletions
@@ -2,21 +2,20 @@ target_sources(PhysicsBasedAnimationToolkit_PhysicsBasedAnimationToolkit
     PUBLIC
     FILE_SET api
     FILES
-    "InitializationStrategy.h"
     "Vbd.h"
 )
 target_sources(PhysicsBasedAnimationToolkit_PhysicsBasedAnimationToolkit
     PRIVATE
     FILE_SET implementation
     FILES
     "VbdImpl.cuh"
-    "VbdImplKernels.cuh"
+    "Kernels.cuh"
 )
 target_sources(PhysicsBasedAnimationToolkit_PhysicsBasedAnimationToolkit
     PRIVATE
     "Vbd.cu"
     "VbdImpl.cu"
-    "VbdImplKernels.cu"
+    "Kernels.cu"
 )
 
 add_subdirectory(tests)

source/pbat/gpu/vbd/InitializationStrategy.h

Lines changed: 0 additions & 14 deletions
This file was deleted.

source/pbat/gpu/vbd/Kernels.cu

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+// clang-format off
+#include "pbat/gpu/DisableWarnings.h"
+// clang-format on
+
+#include "Kernels.cuh"
+#include "pbat/HostDevice.h"
+#include "pbat/physics/StableNeoHookeanEnergy.h"
+#include "pbat/sim/vbd/Kernels.h"
+
+namespace pbat {
+namespace gpu {
+namespace vbd {
+namespace kernels {
+
+__global__ void MinimizeBackwardEuler(BackwardEulerMinimization BDF)
+{
+    // Get thread info
+    extern __shared__ GpuScalar shared[];
+    auto tid = threadIdx.x;
+    auto bid = blockIdx.x;
+    auto nThreadsPerBlock = blockDim.x;
+    using namespace pbat::math::linalg::mini;
+
+    // Get vertex-tet adjacency information
+    GpuIndex i = BDF.partition[bid]; // Vertex index
+    GpuIndex GVTbegin = BDF.GVTp[i];
+    GpuIndex nAdjacentElements = BDF.GVTp[i + 1] - GVTbegin;
+    // 1. Compute vertex-element elastic energy derivatives w.r.t. i and store them in shared memory
+    auto Hgt = FromFlatBuffer<3, 4>(shared, tid);
+    Hgt.SetZero();
+    auto Ht = Hgt.Slice<3, 3>(0, 0);
+    auto gt = Hgt.Col(3);
+    for (auto elocal = tid; elocal < nAdjacentElements; elocal += nThreadsPerBlock)
+    {
+        GpuIndex e = BDF.GVTn[GVTbegin + elocal];
+        GpuIndex ilocal = BDF.GVTilocal[GVTbegin + elocal];
+        auto Te = FromBuffers<4, 1>(BDF.T, e);
+        auto GPe = FromFlatBuffer<4, 3>(BDF.GP, e);
+        auto xe = FromBuffers(BDF.x, Te.Transpose());
+        auto lamee = FromFlatBuffer<2, 1>(BDF.lame, e);
+        auto wg = BDF.wg[e];
+        SMatrix<GpuScalar, 3, 3> Fe = xe * GPe;
+        pbat::physics::StableNeoHookeanEnergy<3> Psi{};
+        SVector<GpuScalar, 9> gF;
+        SMatrix<GpuScalar, 9, 9> HF;
+        Psi.gradAndHessian(Fe, lamee(0), lamee(1), gF, HF);
+        using pbat::sim::vbd::kernels::AccumulateElasticGradient;
+        using pbat::sim::vbd::kernels::AccumulateElasticHessian;
+        AccumulateElasticHessian(ilocal, wg, GPe, HF, Ht);
+        AccumulateElasticGradient(ilocal, wg, GPe, gF, gt);
+    }
+    __syncthreads();
+
+    // Remaining execution is synchronous, i.e. only 1 thread is required
+    if (tid > 0)
+        return;
+
+    // 2. Compute total vertex hessian and gradient
+    SMatrix<GpuScalar, 3, 3> Hi = Zeros<GpuScalar, 3, 3>{};
+    SVector<GpuScalar, 3> gi = Zeros<GpuScalar, 3, 1>{};
+    auto const nActiveThreads = min(nAdjacentElements, nThreadsPerBlock);
+    for (auto j = 0; j < nActiveThreads; ++j)
+    {
+        auto Hgei = FromFlatBuffer<3, 4>(shared, j);
+        Hi += Hgei.Slice<3, 3>(0, 0);
+        gi += Hgei.Col(3);
+    }
+    GpuScalar mi = BDF.m[i];
+    SVector<GpuScalar, 3> xti = FromBuffers<3, 1>(BDF.xt, i);
+    SVector<GpuScalar, 3> xitilde = FromBuffers<3, 1>(BDF.xtilde, i);
+    SVector<GpuScalar, 3> xi = FromBuffers<3, 1>(BDF.x, i);
+    using pbat::sim::vbd::kernels::AddDamping;
+    using pbat::sim::vbd::kernels::AddInertiaDerivatives;
+    AddDamping(BDF.dt, xti, xi, BDF.kD, gi, Hi);
+    AddInertiaDerivatives(BDF.dt2, mi, xitilde, xi, gi, Hi);
+    // 4. Integrate positions
+    using pbat::sim::vbd::kernels::IntegratePositions;
+    IntegratePositions(gi, Hi, xi, BDF.detHZero);
+    ToBuffers(xi, BDF.x, i);
+};
+
+} // namespace kernels
+} // namespace vbd
+} // namespace gpu
+} // namespace pbat
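
The launch site of MinimizeBackwardEuler lives in VbdImpl.cu and is only partially visible in the hunks further down this page. Purely for orientation, a launch that is consistent with the kernel reading BDF.partition[blockIdx.x] and with the shared-memory sizing computed in VbdImpl.cu (mGpuThreadBlockSize * bdf.ExpectedSharedMemoryPerThreadInBytes()) could look like the sketch below; the wrapper function, its parameters, and the plain CUDA runtime launch syntax (rather than the cuda-api-wrappers stream the integrator actually uses) are illustrative assumptions, not code from this commit.

#include <cuda_runtime.h>

#include "Kernels.cuh"

namespace pbat {
namespace gpu {
namespace vbd {
namespace kernels {

// Hypothetical launch helper, not part of this commit: one thread block per vertex
// of the current partition, with enough dynamic shared memory for one 3x4 [H | g]
// accumulator per thread.
inline void LaunchMinimizeBackwardEuler(
    BackwardEulerMinimization const& BDF,
    GpuIndex nVerticesInPartition,
    unsigned int nThreadsPerBlock,
    cudaStream_t stream)
{
    // Mirrors the kDynamicSharedMemoryCapacity computation in VbdImpl::Step
    auto const sharedMemoryBytes = nThreadsPerBlock * BDF.ExpectedSharedMemoryPerThreadInBytes();
    MinimizeBackwardEuler<<<
        static_cast<unsigned int>(nVerticesInPartition),
        nThreadsPerBlock,
        sharedMemoryBytes,
        stream>>>(BDF);
}

} // namespace kernels
} // namespace vbd
} // namespace gpu
} // namespace pbat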

source/pbat/gpu/vbd/Kernels.cuh

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+#ifndef PBAT_GPU_VBD_KERNELS_CUH
+#define PBAT_GPU_VBD_KERNELS_CUH
+
+#include "pbat/HostDevice.h"
+#include "pbat/gpu/Aliases.h"
+#include "pbat/math/linalg/mini/Mini.h"
+
+#include <array>
+#include <cstddef>
+#include <limits>
+
+namespace pbat {
+namespace gpu {
+namespace vbd {
+namespace kernels {
+
+struct BackwardEulerMinimization
+{
+    GpuScalar dt;                     ///< Time step
+    GpuScalar dt2;                    ///< Squared time step
+    GpuScalar* m;                     ///< Lumped mass matrix
+    std::array<GpuScalar*, 3> xtilde; ///< Inertial target
+    std::array<GpuScalar*, 3> xt;     ///< Previous vertex positions
+    std::array<GpuScalar*, 3> x;      ///< Vertex positions
+
+    std::array<GpuIndex*, 4> T; ///< 4x|#elements| array of tetrahedra
+    GpuScalar* wg;              ///< |#elements| array of quadrature weights
+    GpuScalar* GP;              ///< 4x3x|#elements| array of shape function gradients
+    GpuScalar* lame;            ///< 2x|#elements| of 1st and 2nd Lame coefficients
+    GpuScalar detHZero;         ///< Numerical zero for hessian determinant check
+    // GpuScalar const* kD;     ///< |#elements| array of damping coefficients
+
+    GpuIndex* GVTp;      ///< Vertex-tetrahedron adjacency list's prefix sum
+    GpuIndex* GVTn;      ///< Vertex-tetrahedron adjacency list's neighbour list
+    GpuIndex* GVTilocal; ///< Vertex-tetrahedron adjacency list's ilocal property
+
+    GpuScalar kD;                             ///< Rayleigh damping coefficient
+    GpuScalar kC;                             ///< Collision penalty
+    GpuIndex nMaxCollidingTrianglesPerVertex; ///< Memory capacity for storing vertex triangle
+                                              ///< collision constraints
+    GpuIndex* FC; ///< |#vertices|x|nMaxCollidingTrianglesPerVertex| array of colliding triangles
+    GpuIndex* nCollidingTriangles; ///< |#vertices| array of the number of colliding triangles
+                                   ///< for each vertex.
+    std::array<GpuIndex*, 4> F;    ///< 3x|#collision triangles| array of triangles
+
+    GpuIndex*
+        partition; ///< List of vertex indices that can be processed independently, i.e. in parallel
+
+    auto constexpr ExpectedSharedMemoryPerThreadInBytes() const { return 12 * sizeof(GpuScalar); }
+};
+
+__global__ void MinimizeBackwardEuler(BackwardEulerMinimization BDF);
+
+} // namespace kernels
+} // namespace vbd
+} // namespace gpu
+} // namespace pbat
+
+#endif // PBAT_GPU_VBD_KERNELS_CUH
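
On ExpectedSharedMemoryPerThreadInBytes(): each thread of MinimizeBackwardEuler accumulates its per-element contributions as a single 3x4 block [H | g] in dynamic shared memory, i.e. a 3x3 hessian block next to a 3x1 gradient, which Kernels.cu addresses via FromFlatBuffer<3, 4>(shared, tid); that is where the 12 * sizeof(GpuScalar) figure comes from. A minimal sketch of the per-block allocation this implies follows; the helper and constant names are hypothetical, and the actual launch configuration is set in VbdImpl.cu.

#include <cstddef>

#include "pbat/gpu/Aliases.h"

namespace pbat {
namespace gpu {
namespace vbd {
namespace kernels {

// Hypothetical helper: 12 scalars per thread, i.e. the columns of one 3x4 block
// [H | g] holding a 3x3 hessian block H and a 3x1 gradient g for one vertex.
constexpr std::size_t SharedMemoryBytesPerBlock(std::size_t nThreadsPerBlock)
{
    constexpr std::size_t kScalarsPerThread = 3 * 4;
    return nThreadsPerBlock * kScalarsPerThread * sizeof(GpuScalar);
}

} // namespace kernels
} // namespace vbd
} // namespace gpu
} // namespace pbat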

source/pbat/gpu/vbd/Vbd.h

Lines changed: 5 additions & 2 deletions
@@ -3,7 +3,7 @@
 
 #include "PhysicsBasedAnimationToolkitExport.h"
 #include "pbat/gpu/Aliases.h"
-#include "pbat/gpu/vbd/InitializationStrategy.h"
+#include "pbat/sim/vbd/Enums.h"
 
 namespace pbat {
 namespace gpu {
@@ -14,7 +14,10 @@ class VbdImpl;
 class Vbd
 {
   public:
-    PBAT_API Vbd(Eigen::Ref<GpuMatrixX const> const& X,
+    using EInitializationStrategy = pbat::sim::vbd::EInitializationStrategy;
+
+    PBAT_API
+    Vbd(Eigen::Ref<GpuMatrixX const> const& X,
         Eigen::Ref<GpuIndexMatrixX const> const& V,
         Eigen::Ref<GpuIndexMatrixX const> const& F,
         Eigen::Ref<GpuIndexMatrixX const> const& T);

source/pbat/gpu/vbd/VbdImpl.cu

Lines changed: 63 additions & 28 deletions
@@ -3,8 +3,10 @@
 // clang-format on
 
 #include "VbdImpl.cuh"
-#include "VbdImplKernels.cuh"
+#include "Kernels.cuh"
 #include "pbat/gpu/common/Cuda.cuh"
+#include "pbat/math/linalg/mini/Mini.h"
+#include "pbat/sim/vbd/Kernels.h"
 
 #include <cuda/api.hpp>
 // #include <thrust/async/copy.h>
@@ -25,7 +27,7 @@ VbdImpl::VbdImpl(
     F(Fin),
     T(Tin),
     mPositionsAtT(Xin.cols()),
-    mKineticEnergyMinimalPositions(Xin.cols()),
+    mInertialTargetPositions(Xin.cols()),
     mChebyshevPositionsM2(Xin.cols()),
     mChebyshevPositionsM1(Xin.cols()),
     mVelocitiesAtT(Xin.cols()),
@@ -74,7 +76,7 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
     bdf.dt = sdt;
     bdf.dt2 = sdt2;
     bdf.m = mMass.Raw();
-    bdf.xtilde = mKineticEnergyMinimalPositions.Raw();
+    bdf.xtilde = mInertialTargetPositions.Raw();
     bdf.xt = mPositionsAtT.Raw();
     bdf.x = X.x.Raw();
     bdf.T = T.inds.Raw();
@@ -98,6 +100,7 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
     mStream.device().make_current();
     for (auto s = 0; s < substeps; ++s)
     {
+        using namespace pbat::math::linalg::mini;
         // Store previous positions
         for (auto d = 0; d < X.x.Dimensions(); ++d)
         {
@@ -113,40 +116,56 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
             thrust::device.on(mStream.handle()),
             thrust::make_counting_iterator<GpuIndex>(0),
             thrust::make_counting_iterator<GpuIndex>(nVertices),
-            kernels::FKineticEnergyMinimum{
-                sdt,
-                sdt2,
-                X.x.Raw(),
-                mVelocities.Raw(),
-                mExternalAcceleration.Raw(),
-                mKineticEnergyMinimalPositions.Raw()});
+            [xt = mPositionsAtT.Raw(),
+             vt = mVelocities.Raw(),
+             aext = mExternalAcceleration.Raw(),
+             xtilde = mInertialTargetPositions.Raw(),
+             dt = sdt,
+             dt2 = sdt2] PBAT_DEVICE(auto i) {
+                using pbat::sim::vbd::kernels::InertialTarget;
+                auto y = InertialTarget(
+                    FromBuffers<3, 1>(xt, i),
+                    FromBuffers<3, 1>(vt, i),
+                    FromBuffers<3, 1>(aext, i),
+                    dt,
+                    dt2);
+                ToBuffers(y, xtilde, i);
+            });
         // Initialize block coordinate descent's, i.e. BCD's, solution
         e = thrust::async::for_each(
             thrust::device.on(mStream.handle()),
             thrust::make_counting_iterator<GpuIndex>(0),
             thrust::make_counting_iterator<GpuIndex>(nVertices),
-            kernels::FAdaptiveInitialization{
-                sdt,
-                sdt2,
-                mPositionsAtT.Raw(),
-                mVelocitiesAtT.Raw(),
-                mVelocities.Raw(),
-                mExternalAcceleration.Raw(),
-                X.x.Raw(),
-                mInitializationStrategy});
+            [xt = mPositionsAtT.Raw(),
+             vtm1 = mVelocitiesAtT.Raw(),
+             vt = mVelocities.Raw(),
+             aext = mExternalAcceleration.Raw(),
+             x = X.x.Raw(),
+             dt = sdt,
+             dt2 = sdt2,
+             strategy = mInitializationStrategy] PBAT_DEVICE(auto i) {
+                using pbat::sim::vbd::kernels::InitialPositionsForSolve;
+                auto x0 = InitialPositionsForSolve(
+                    FromBuffers<3, 1>(xt, i),
+                    FromBuffers<3, 1>(vtm1, i),
+                    FromBuffers<3, 1>(vt, i),
+                    FromBuffers<3, 1>(aext, i),
+                    dt,
+                    dt2,
+                    strategy);
+                ToBuffers(x0, x, i);
+            });
         // Initialize Chebyshev semi-iterative method
-        kernels::FChebyshev fChebyshev{
-            rho,
-            mChebyshevPositionsM2.Raw(),
-            mChebyshevPositionsM1.Raw(),
-            X.x.Raw()};
+        GpuScalar rho2 = rho * rho;
+        GpuScalar omega{};
         auto kDynamicSharedMemoryCapacity = static_cast<cuda::memory::shared::size_t>(
             mGpuThreadBlockSize * bdf.ExpectedSharedMemoryPerThreadInBytes());
         // Minimize Backward Euler, i.e. BDF1, objective
         for (auto k = 0; k < iterations; ++k)
         {
+            using pbat::sim::vbd::kernels::ChebyshevOmega;
             if (bUseChebyshevAcceleration)
-                fChebyshev.SetIteration(k);
+                omega = ChebyshevOmega(k, rho2, omega);
 
             for (auto& partition : mPartitions)
             {
@@ -171,7 +190,17 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
                     thrust::device.on(mStream.handle()),
                     thrust::make_counting_iterator<GpuIndex>(0),
                     thrust::make_counting_iterator<GpuIndex>(nVertices),
-                    fChebyshev);
+                    [k = k,
+                     omega = omega,
+                     xkm2 = mChebyshevPositionsM2.Raw(),
+                     xkm1 = mChebyshevPositionsM1.Raw(),
+                     xk = X.x.Raw()] PBAT_DEVICE(auto i) {
+                        using pbat::sim::vbd::kernels::ChebyshevUpdate;
+                        auto xkm2i = FromBuffers<3, 1>(xkm2, i);
+                        auto xkm1i = FromBuffers<3, 1>(xkm1, i);
+                        auto xki = FromBuffers<3, 1>(xk, i);
+                        ChebyshevUpdate(k, omega, xkm2i, xkm1i, xki);
+                    });
             }
         }
         // Update velocities
@@ -187,7 +216,13 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
             thrust::device.on(mStream.handle()),
             thrust::make_counting_iterator<GpuIndex>(0),
             thrust::make_counting_iterator<GpuIndex>(nVertices),
-            kernels::FUpdateVelocity{sdt, mPositionsAtT.Raw(), X.x.Raw(), mVelocities.Raw()});
+            [xt = mPositionsAtT.Raw(), x = X.x.Raw(), v = mVelocities.Raw(), dt = dt] PBAT_DEVICE(
+                auto i) {
+                using pbat::sim::vbd::kernels::IntegrateVelocity;
+                auto vtp1 =
+                    IntegrateVelocity(FromBuffers<3, 1>(xt, i), FromBuffers<3, 1>(x, i), dt);
+                ToBuffers(vtp1, v, i);
+            });
     }
     mStream.synchronize();
 }
@@ -395,7 +430,7 @@ std::vector<common::Buffer<GpuIndex>> const& VbdImpl::GetPartitions() const
 #include <span>
 #include <vector>
 
-TEST_CASE("[gpu][xpbd] Xpbd")
+TEST_CASE("[gpu][vbd] Vbd")
 {
     using pbat::GpuIndex;
     using pbat::GpuIndexMatrixX;
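
A note on the Chebyshev changes above: the FChebyshev functor is replaced by ChebyshevOmega and ChebyshevUpdate from pbat/sim/vbd/Kernels.h, whose definitions are not shown on this page. For orientation only, the standard Chebyshev semi-iterative acceleration that such helpers typically implement, with $\rho$ the estimated spectral radius (rho2 = $\rho^2$ above), is

$$\omega^{(1)} = 1, \qquad \omega^{(2)} = \frac{2}{2 - \rho^2}, \qquad \omega^{(k)} = \frac{4}{4 - \rho^2\,\omega^{(k-1)}} \quad (k \geq 3),$$
$$x^{(k)} \leftarrow \omega^{(k)}\left(\hat{x}^{(k)} - x^{(k-2)}\right) + x^{(k-2)},$$

where $\hat{x}^{(k)}$ is the iterate produced by the k-th sweep over the vertex partitions and $x^{(k-2)}$ is the accelerated iterate held in mChebyshevPositionsM2; the exact form used by pbat should be read off pbat/sim/vbd/Kernels.h rather than from this sketch.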
