
Commit da328da

Use CPU-GPU agnostic kernels in GPU VBD integrator
1 parent 5b92480 commit da328da

11 files changed: +221 -386 lines changed


bindings/pypbat/gpu/vbd/Vbd.cpp

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 #include "Vbd.h"
 
 #include <pbat/gpu/Aliases.h>
-#include <pbat/gpu/vbd/InitializationStrategy.h>
+#include <pbat/sim/vbd/Enums.h>
 #include <pbat/gpu/vbd/Vbd.h>
 #include <pbat/profiling/Profiling.h>
 #include <pybind11/eigen.h>
@@ -19,7 +19,7 @@ void Bind([[maybe_unused]] pybind11::module& m)
 #ifdef PBAT_USE_CUDA
 
 using namespace pbat;
-using pbat::gpu::vbd::EInitializationStrategy;
+using pbat::sim::vbd::EInitializationStrategy;
 using pbat::gpu::vbd::Vbd;
 
 pyb::enum_<EInitializationStrategy>(m, "InitializationStrategy")

source/pbat/gpu/vbd/CMakeLists.txt

Lines changed: 2 additions & 3 deletions
@@ -2,21 +2,20 @@ target_sources(PhysicsBasedAnimationToolkit_PhysicsBasedAnimationToolkit
     PUBLIC
     FILE_SET api
     FILES
-    "InitializationStrategy.h"
     "Vbd.h"
 )
 target_sources(PhysicsBasedAnimationToolkit_PhysicsBasedAnimationToolkit
     PRIVATE
     FILE_SET implementation
     FILES
     "VbdImpl.cuh"
-    "VbdImplKernels.cuh"
+    "Kernels.cuh"
 )
 target_sources(PhysicsBasedAnimationToolkit_PhysicsBasedAnimationToolkit
     PRIVATE
     "Vbd.cu"
     "VbdImpl.cu"
-    "VbdImplKernels.cu"
+    "Kernels.cu"
 )
 
 add_subdirectory(tests)

source/pbat/gpu/vbd/InitializationStrategy.h

Lines changed: 0 additions & 14 deletions
This file was deleted.

source/pbat/gpu/vbd/Kernels.cu

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+// clang-format off
+#include "pbat/gpu/DisableWarnings.h"
+// clang-format on
+
+#include "Kernels.cuh"
+#include "pbat/HostDevice.h"
+#include "pbat/physics/StableNeoHookeanEnergy.h"
+#include "pbat/sim/vbd/Kernels.h"
+
+namespace pbat {
+namespace gpu {
+namespace vbd {
+namespace kernels {
+
+__global__ void MinimizeBackwardEuler(BackwardEulerMinimization BDF)
+{
+    // Get thread info
+    extern __shared__ GpuScalar shared[];
+    auto tid = threadIdx.x;
+    auto bid = blockIdx.x;
+    auto nThreadsPerBlock = blockDim.x;
+    using namespace pbat::math::linalg::mini;
+
+    // Get vertex-tet adjacency information
+    GpuIndex i = BDF.partition[bid]; // Vertex index
+    GpuIndex GVTbegin = BDF.GVTp[i];
+    GpuIndex nAdjacentElements = BDF.GVTp[i + 1] - GVTbegin;
+    // 1. Compute vertex-element elastic energy derivatives w.r.t. i and store them in shared memory
+    auto Hgt = FromFlatBuffer<3, 4>(shared, tid);
+    Hgt.SetZero();
+    auto Ht = Hgt.Slice<3, 3>(0, 0);
+    auto gt = Hgt.Col(3);
+    for (auto elocal = tid; elocal < nAdjacentElements; elocal += nThreadsPerBlock)
+    {
+        GpuIndex e = BDF.GVTn[GVTbegin + elocal];
+        GpuIndex ilocal = BDF.GVTilocal[GVTbegin + elocal];
+        auto Te = FromBuffers<4, 1>(BDF.T, e);
+        auto GPe = FromFlatBuffer<4, 3>(BDF.GP, e);
+        auto xe = FromBuffers(BDF.x, Te.Transpose());
+        auto lamee = FromFlatBuffer<2, 1>(BDF.lame, e);
+        auto wg = BDF.wg[e];
+        SMatrix<GpuScalar, 3, 3> Fe = xe * GPe;
+        pbat::physics::StableNeoHookeanEnergy<3> Psi{};
+        SVector<GpuScalar, 9> gF;
+        SMatrix<GpuScalar, 9, 9> HF;
+        Psi.gradAndHessian(Fe, lamee(0), lamee(1), gF, HF);
+        using pbat::sim::vbd::kernels::AccumulateElasticGradient;
+        using pbat::sim::vbd::kernels::AccumulateElasticHessian;
+        AccumulateElasticHessian(ilocal, wg, GPe, HF, Ht);
+        AccumulateElasticGradient(ilocal, wg, GPe, gF, gt);
+    }
+    __syncthreads();
+
+    // Remaining execution is synchronous, i.e. only 1 thread is required
+    if (tid > 0)
+        return;
+
+    // 2. Compute total vertex hessian and gradient
+    SMatrix<GpuScalar, 3, 3> Hi = Zeros<GpuScalar, 3, 3>{};
+    SVector<GpuScalar, 3> gi = Zeros<GpuScalar, 3, 1>{};
+    auto const nActiveThreads = min(nAdjacentElements, nThreadsPerBlock);
+    for (auto j = 0; j < nActiveThreads; ++j)
+    {
+        auto Hgei = FromFlatBuffer<3, 4>(shared, j);
+        Hi += Hgei.Slice<3, 3>(0, 0);
+        gi += Hgei.Col(3);
+    }
+    GpuScalar mi = BDF.m[i];
+    SVector<GpuScalar, 3> xti = FromBuffers<3, 1>(BDF.xt, i);
+    SVector<GpuScalar, 3> xitilde = FromBuffers<3, 1>(BDF.xtilde, i);
+    SVector<GpuScalar, 3> xi = FromBuffers<3, 1>(BDF.x, i);
+    using pbat::sim::vbd::kernels::AddDamping;
+    using pbat::sim::vbd::kernels::AddInertiaDerivatives;
+    AddDamping(BDF.dt, xti, xi, BDF.kD, gi, Hi);
+    AddInertiaDerivatives(BDF.dt2, mi, xitilde, xi, gi, Hi);
+    // 4. Integrate positions
+    using pbat::sim::vbd::kernels::IntegratePositions;
+    IntegratePositions(gi, Hi, xi, BDF.detHZero);
+    ToBuffers(xi, BDF.x, i);
+};
+
+} // namespace kernels
+} // namespace vbd
+} // namespace gpu
+} // namespace pbat
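
The launch site of MinimizeBackwardEuler lives in VbdImpl.cu and is only partially visible in the hunks further down this page. Purely for orientation, a launch that is consistent with the kernel reading BDF.partition[blockIdx.x] and with the shared-memory sizing computed in VbdImpl.cu (mGpuThreadBlockSize * bdf.ExpectedSharedMemoryPerThreadInBytes()) could look like the sketch below; the wrapper function, its parameters, and the plain CUDA runtime launch syntax (rather than the cuda-api-wrappers stream the integrator actually uses) are illustrative assumptions, not code from this commit.

#include <cuda_runtime.h>

#include "Kernels.cuh"

namespace pbat {
namespace gpu {
namespace vbd {
namespace kernels {

// Hypothetical launch helper, not part of this commit: one thread block per vertex
// of the current partition, with enough dynamic shared memory for one 3x4 [H | g]
// accumulator per thread.
inline void LaunchMinimizeBackwardEuler(
    BackwardEulerMinimization const& BDF,
    GpuIndex nVerticesInPartition,
    unsigned int nThreadsPerBlock,
    cudaStream_t stream)
{
    // Mirrors the kDynamicSharedMemoryCapacity computation in VbdImpl::Step
    auto const sharedMemoryBytes = nThreadsPerBlock * BDF.ExpectedSharedMemoryPerThreadInBytes();
    MinimizeBackwardEuler<<<
        static_cast<unsigned int>(nVerticesInPartition),
        nThreadsPerBlock,
        sharedMemoryBytes,
        stream>>>(BDF);
}

} // namespace kernels
} // namespace vbd
} // namespace gpu
} // namespace pbat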

source/pbat/gpu/vbd/Kernels.cuh

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+#ifndef PBAT_GPU_VBD_KERNELS_CUH
+#define PBAT_GPU_VBD_KERNELS_CUH
+
+#include "pbat/HostDevice.h"
+#include "pbat/gpu/Aliases.h"
+#include "pbat/math/linalg/mini/Mini.h"
+
+#include <array>
+#include <cstddef>
+#include <limits>
+
+namespace pbat {
+namespace gpu {
+namespace vbd {
+namespace kernels {
+
+struct BackwardEulerMinimization
+{
+    GpuScalar dt;                     ///< Time step
+    GpuScalar dt2;                    ///< Squared time step
+    GpuScalar* m;                     ///< Lumped mass matrix
+    std::array<GpuScalar*, 3> xtilde; ///< Inertial target
+    std::array<GpuScalar*, 3> xt;     ///< Previous vertex positions
+    std::array<GpuScalar*, 3> x;      ///< Vertex positions
+
+    std::array<GpuIndex*, 4> T; ///< 4x|#elements| array of tetrahedra
+    GpuScalar* wg;              ///< |#elements| array of quadrature weights
+    GpuScalar* GP;              ///< 4x3x|#elements| array of shape function gradients
+    GpuScalar* lame;            ///< 2x|#elements| of 1st and 2nd Lame coefficients
+    GpuScalar detHZero;         ///< Numerical zero for hessian determinant check
+    // GpuScalar const* kD;     ///< |#elements| array of damping coefficients
+
+    GpuIndex* GVTp;      ///< Vertex-tetrahedron adjacency list's prefix sum
+    GpuIndex* GVTn;      ///< Vertex-tetrahedron adjacency list's neighbour list
+    GpuIndex* GVTilocal; ///< Vertex-tetrahedron adjacency list's ilocal property
+
+    GpuScalar kD;                             ///< Rayleigh damping coefficient
+    GpuScalar kC;                             ///< Collision penalty
+    GpuIndex nMaxCollidingTrianglesPerVertex; ///< Memory capacity for storing vertex triangle
+                                              ///< collision constraints
+    GpuIndex* FC; ///< |#vertices|x|nMaxCollidingTrianglesPerVertex| array of colliding triangles
+    GpuIndex* nCollidingTriangles; ///< |#vertices| array of the number of colliding triangles
+                                   ///< for each vertex.
+    std::array<GpuIndex*, 4> F;    ///< 3x|#collision triangles| array of triangles
+
+    GpuIndex*
+        partition; ///< List of vertex indices that can be processed independently, i.e. in parallel
+
+    auto constexpr ExpectedSharedMemoryPerThreadInBytes() const { return 12 * sizeof(GpuScalar); }
+};
+
+__global__ void MinimizeBackwardEuler(BackwardEulerMinimization BDF);
+
+} // namespace kernels
+} // namespace vbd
+} // namespace gpu
+} // namespace pbat
+
+#endif // PBAT_GPU_VBD_KERNELS_CUH
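
On ExpectedSharedMemoryPerThreadInBytes(): each thread of MinimizeBackwardEuler accumulates its per-element contributions as a single 3x4 block [H | g] in dynamic shared memory, i.e. a 3x3 hessian block next to a 3x1 gradient, which Kernels.cu addresses via FromFlatBuffer<3, 4>(shared, tid); that is where the 12 * sizeof(GpuScalar) figure comes from. A minimal sketch of the per-block allocation this implies follows; the helper and constant names are hypothetical, and the actual launch configuration is set in VbdImpl.cu.

#include <cstddef>

#include "pbat/gpu/Aliases.h"

namespace pbat {
namespace gpu {
namespace vbd {
namespace kernels {

// Hypothetical helper: 12 scalars per thread, i.e. the columns of one 3x4 block
// [H | g] holding a 3x3 hessian block H and a 3x1 gradient g for one vertex.
constexpr std::size_t SharedMemoryBytesPerBlock(std::size_t nThreadsPerBlock)
{
    constexpr std::size_t kScalarsPerThread = 3 * 4;
    return nThreadsPerBlock * kScalarsPerThread * sizeof(GpuScalar);
}

} // namespace kernels
} // namespace vbd
} // namespace gpu
} // namespace pbat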

source/pbat/gpu/vbd/Vbd.h

Lines changed: 5 additions & 2 deletions
@@ -3,7 +3,7 @@
 
 #include "PhysicsBasedAnimationToolkitExport.h"
 #include "pbat/gpu/Aliases.h"
-#include "pbat/gpu/vbd/InitializationStrategy.h"
+#include "pbat/sim/vbd/Enums.h"
 
 namespace pbat {
 namespace gpu {
@@ -14,7 +14,10 @@ class VbdImpl;
 class Vbd
 {
   public:
-    PBAT_API Vbd(Eigen::Ref<GpuMatrixX const> const& X,
+    using EInitializationStrategy = pbat::sim::vbd::EInitializationStrategy;
+
+    PBAT_API
+    Vbd(Eigen::Ref<GpuMatrixX const> const& X,
         Eigen::Ref<GpuIndexMatrixX const> const& V,
         Eigen::Ref<GpuIndexMatrixX const> const& F,
         Eigen::Ref<GpuIndexMatrixX const> const& T);

source/pbat/gpu/vbd/VbdImpl.cu

Lines changed: 63 additions & 28 deletions
@@ -3,8 +3,10 @@
 // clang-format on
 
 #include "VbdImpl.cuh"
-#include "VbdImplKernels.cuh"
+#include "Kernels.cuh"
 #include "pbat/gpu/common/Cuda.cuh"
+#include "pbat/math/linalg/mini/Mini.h"
+#include "pbat/sim/vbd/Kernels.h"
 
 #include <cuda/api.hpp>
 // #include <thrust/async/copy.h>
@@ -25,7 +27,7 @@ VbdImpl::VbdImpl(
     F(Fin),
     T(Tin),
     mPositionsAtT(Xin.cols()),
-    mKineticEnergyMinimalPositions(Xin.cols()),
+    mInertialTargetPositions(Xin.cols()),
     mChebyshevPositionsM2(Xin.cols()),
     mChebyshevPositionsM1(Xin.cols()),
     mVelocitiesAtT(Xin.cols()),
@@ -74,7 +76,7 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
     bdf.dt = sdt;
     bdf.dt2 = sdt2;
     bdf.m = mMass.Raw();
-    bdf.xtilde = mKineticEnergyMinimalPositions.Raw();
+    bdf.xtilde = mInertialTargetPositions.Raw();
     bdf.xt = mPositionsAtT.Raw();
     bdf.x = X.x.Raw();
     bdf.T = T.inds.Raw();
@@ -98,6 +100,7 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
     mStream.device().make_current();
     for (auto s = 0; s < substeps; ++s)
     {
+        using namespace pbat::math::linalg::mini;
         // Store previous positions
         for (auto d = 0; d < X.x.Dimensions(); ++d)
         {
@@ -113,40 +116,56 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
             thrust::device.on(mStream.handle()),
             thrust::make_counting_iterator<GpuIndex>(0),
             thrust::make_counting_iterator<GpuIndex>(nVertices),
-            kernels::FKineticEnergyMinimum{
-                sdt,
-                sdt2,
-                X.x.Raw(),
-                mVelocities.Raw(),
-                mExternalAcceleration.Raw(),
-                mKineticEnergyMinimalPositions.Raw()});
+            [xt = mPositionsAtT.Raw(),
+             vt = mVelocities.Raw(),
+             aext = mExternalAcceleration.Raw(),
+             xtilde = mInertialTargetPositions.Raw(),
+             dt = sdt,
+             dt2 = sdt2] PBAT_DEVICE(auto i) {
+                using pbat::sim::vbd::kernels::InertialTarget;
+                auto y = InertialTarget(
+                    FromBuffers<3, 1>(xt, i),
+                    FromBuffers<3, 1>(vt, i),
+                    FromBuffers<3, 1>(aext, i),
+                    dt,
+                    dt2);
+                ToBuffers(y, xtilde, i);
+            });
         // Initialize block coordinate descent's, i.e. BCD's, solution
         e = thrust::async::for_each(
             thrust::device.on(mStream.handle()),
             thrust::make_counting_iterator<GpuIndex>(0),
             thrust::make_counting_iterator<GpuIndex>(nVertices),
-            kernels::FAdaptiveInitialization{
-                sdt,
-                sdt2,
-                mPositionsAtT.Raw(),
-                mVelocitiesAtT.Raw(),
-                mVelocities.Raw(),
-                mExternalAcceleration.Raw(),
-                X.x.Raw(),
-                mInitializationStrategy});
+            [xt = mPositionsAtT.Raw(),
+             vtm1 = mVelocitiesAtT.Raw(),
+             vt = mVelocities.Raw(),
+             aext = mExternalAcceleration.Raw(),
+             x = X.x.Raw(),
+             dt = sdt,
+             dt2 = sdt2,
+             strategy = mInitializationStrategy] PBAT_DEVICE(auto i) {
+                using pbat::sim::vbd::kernels::InitialPositionsForSolve;
+                auto x0 = InitialPositionsForSolve(
+                    FromBuffers<3, 1>(xt, i),
+                    FromBuffers<3, 1>(vtm1, i),
+                    FromBuffers<3, 1>(vt, i),
+                    FromBuffers<3, 1>(aext, i),
+                    dt,
+                    dt2,
+                    strategy);
+                ToBuffers(x0, x, i);
+            });
         // Initialize Chebyshev semi-iterative method
-        kernels::FChebyshev fChebyshev{
-            rho,
-            mChebyshevPositionsM2.Raw(),
-            mChebyshevPositionsM1.Raw(),
-            X.x.Raw()};
+        GpuScalar rho2 = rho * rho;
+        GpuScalar omega{};
         auto kDynamicSharedMemoryCapacity = static_cast<cuda::memory::shared::size_t>(
             mGpuThreadBlockSize * bdf.ExpectedSharedMemoryPerThreadInBytes());
         // Minimize Backward Euler, i.e. BDF1, objective
         for (auto k = 0; k < iterations; ++k)
         {
+            using pbat::sim::vbd::kernels::ChebyshevOmega;
             if (bUseChebyshevAcceleration)
-                fChebyshev.SetIteration(k);
+                omega = ChebyshevOmega(k, rho2, omega);
 
             for (auto& partition : mPartitions)
             {
@@ -171,7 +190,17 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
                     thrust::device.on(mStream.handle()),
                     thrust::make_counting_iterator<GpuIndex>(0),
                     thrust::make_counting_iterator<GpuIndex>(nVertices),
-                    fChebyshev);
+                    [k = k,
+                     omega = omega,
+                     xkm2 = mChebyshevPositionsM2.Raw(),
+                     xkm1 = mChebyshevPositionsM1.Raw(),
+                     xk = X.x.Raw()] PBAT_DEVICE(auto i) {
+                        using pbat::sim::vbd::kernels::ChebyshevUpdate;
+                        auto xkm2i = FromBuffers<3, 1>(xkm2, i);
+                        auto xkm1i = FromBuffers<3, 1>(xkm1, i);
+                        auto xki = FromBuffers<3, 1>(xk, i);
+                        ChebyshevUpdate(k, omega, xkm2i, xkm1i, xki);
+                    });
             }
         }
         // Update velocities
@@ -187,7 +216,13 @@ void VbdImpl::Step(GpuScalar dt, GpuIndex iterations, GpuIndex substeps, GpuScal
             thrust::device.on(mStream.handle()),
             thrust::make_counting_iterator<GpuIndex>(0),
             thrust::make_counting_iterator<GpuIndex>(nVertices),
-            kernels::FUpdateVelocity{sdt, mPositionsAtT.Raw(), X.x.Raw(), mVelocities.Raw()});
+            [xt = mPositionsAtT.Raw(), x = X.x.Raw(), v = mVelocities.Raw(), dt = dt] PBAT_DEVICE(
+                auto i) {
+                using pbat::sim::vbd::kernels::IntegrateVelocity;
+                auto vtp1 =
+                    IntegrateVelocity(FromBuffers<3, 1>(xt, i), FromBuffers<3, 1>(x, i), dt);
+                ToBuffers(vtp1, v, i);
+            });
     }
     mStream.synchronize();
 }
@@ -395,7 +430,7 @@ std::vector<common::Buffer<GpuIndex>> const& VbdImpl::GetPartitions() const
 #include <span>
 #include <vector>
 
-TEST_CASE("[gpu][xpbd] Xpbd")
+TEST_CASE("[gpu][vbd] Vbd")
 {
     using pbat::GpuIndex;
     using pbat::GpuIndexMatrixX;
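
A note on the Chebyshev changes above: the FChebyshev functor is replaced by ChebyshevOmega and ChebyshevUpdate from pbat/sim/vbd/Kernels.h, whose definitions are not shown on this page. For orientation only, the standard Chebyshev semi-iterative acceleration that such helpers typically implement, with $\rho$ the estimated spectral radius (rho2 = $\rho^2$ above), is

$$\omega^{(1)} = 1, \qquad \omega^{(2)} = \frac{2}{2 - \rho^2}, \qquad \omega^{(k)} = \frac{4}{4 - \rho^2\,\omega^{(k-1)}} \quad (k \geq 3),$$
$$x^{(k)} \leftarrow \omega^{(k)}\left(\hat{x}^{(k)} - x^{(k-2)}\right) + x^{(k-2)},$$

where $\hat{x}^{(k)}$ is the iterate produced by the k-th sweep over the vertex partitions and $x^{(k-2)}$ is the accelerated iterate held in mChebyshevPositionsM2; the exact form used by pbat should be read off pbat/sim/vbd/Kernels.h rather than from this sketch.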
