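Fixes and improvements to the ACT layer: remove OpenMP pragmas from CPU loops, parallelize CUDA kernels over the feature dimension, enable depth scaling by default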

Cydral · Cydral · commit ab29fc41346f · 2025-09-13T18:31:33.000+02:00
diff --git a/dlib/cuda/cpu_dlib.cpp b/dlib/cuda/cpu_dlib.cpp
@@ -3238,7 +3238,6 @@ namespace dlib
             const long d_model = feature_dim / input_data.k();
             const long num_channels = input_data.k();
 
-            #pragma omp parallel for
             for (long pos = 0; pos < batch_size * seq_len; ++pos) {
                 const long n = pos / seq_len;
                 const long s = pos % seq_len;
@@ -3281,7 +3280,6 @@ namespace dlib
             float* remain = remainders.host();
             float* steps = n_steps.host();
 
-            #pragma omp parallel for
             for (long pos = 0; pos < batch_size * seq_len; ++pos) {
                 if (cum_halt[pos] < halt_threshold) {
                     const long n = pos / seq_len;
@@ -3319,7 +3317,6 @@ namespace dlib
             const float* remain = remainders.host();
             float* out_ptr = output.host();
 
-            #pragma omp parallel for
             for (long pos = 0; pos < batch_size * seq_len; ++pos) {
                 float r = remain[pos];
                 if (r > 1e-6f) {
@@ -3350,7 +3347,6 @@ namespace dlib
             const float* steps = n_steps.host();
             float* grad_ptr = gradients.host();
 
-            #pragma omp parallel for
             for (long pos = 0; pos < batch_size * seq_len; ++pos)
             {
                 const float scale = 1.0f + scale_factor * (steps[pos] / max_steps);
diff --git a/dlib/cuda/cuda_dlib.cu b/dlib/cuda/cuda_dlib.cu
@@ -2749,23 +2749,35 @@ namespace dlib
             size_t feature_dim
         )
         {
-            for (auto pos : grid_stride_range(0, batch_size * seq_len))
+            const long total_positions = batch_size * seq_len;
+
+            for (auto pos : grid_stride_range_y(0, total_positions))
+                for (auto i : grid_stride_range(0, 1))
+                    logits[pos] = b_halt;
+            __syncthreads();
+
+            for (auto pos : grid_stride_range_y(0, total_positions))
             {
-                const size_t n = pos / seq_len;
-                const size_t s = pos % seq_len;
+                const long n = pos / seq_len;
+                const long s = pos % seq_len;
 
-                float logit = b_halt;
+                float temp = 0;
+                for (auto feat_idx : grid_stride_range(0, feature_dim))
+                {
+                    const long c = feat_idx / d_model;
+                    const long d = feat_idx % d_model;
 
-                for (size_t c = 0; c < num_channels; ++c) {
-                    for (size_t d = 0; d < d_model; ++d) {
-                        const size_t in_idx = ((n * num_channels + c) * seq_len + s) * d_model + d;
-                        const size_t weight_idx = c * d_model + d;
-                        logit += input_data[in_idx] * W_halt[weight_idx];
-                    }
+                    const long in_idx = ((n * num_channels + c) * seq_len + s) * d_model + d;
+                    temp += input_data[in_idx] * W_halt[feat_idx];
                 }
 
-                logits[pos] = logit;
-                halt_probs[pos] = 1.0f / (1.0f + ::expf(-logit));
+                warp_reduce_atomic_add(logits[pos], temp);
+            }
+            __syncthreads();
+
+            for (auto pos : grid_stride_range(0, total_positions))
+            {
+                halt_probs[pos] = 1.0f / (1.0f + expf(-logits[pos]));
             }
         }
 
@@ -2783,8 +2795,11 @@ namespace dlib
             const long d_model = feature_dim / input_data.k();
             const long num_channels = input_data.k();
 
+            halt_probs.set_size(total_positions, 1, 1, 1);
+            logits.set_size(total_positions, 1, 1, 1);
+
             launch_kernel(_cuda_compute_act_halt_probabilities,
-                max_jobs(total_positions),
+                max_jobs(feature_dim, total_positions),
                 halt_probs.device(),
                 logits.device(),
                 input_data.device(),
@@ -2814,7 +2829,8 @@ namespace dlib
         {
             for (auto pos : grid_stride_range(0, batch_size * seq_len))
             {
-                if (cumulative_halting[pos] < halt_threshold) {
+                if (cumulative_halting[pos] < halt_threshold)
+                {
                     const size_t n = pos / seq_len;
                     const size_t s = pos % seq_len;
 
@@ -2930,17 +2946,21 @@ namespace dlib
             float scale_factor
         )
         {
-            for (auto pos : grid_stride_range(0, batch_size * seq_len))
+            const long total_positions = batch_size * seq_len;
+            const long feature_dim = num_channels * d_model;
+
+            for (auto pos : grid_stride_range_y(0, total_positions))
             {
+                const long n = pos / seq_len;
+                const long s = pos % seq_len;
                 const float scale = 1.0f + scale_factor * (n_steps[pos] / max_steps);
-                const size_t n = pos / seq_len;
-                const size_t s = pos % seq_len;
 
-                for (size_t c = 0; c < num_channels; ++c) {
-                    for (size_t d = 0; d < d_model; ++d) {
-                        const size_t idx = ((n * num_channels + c) * seq_len + s) * d_model + d;
-                        gradients[idx] *= scale;
-                    }
+                for (auto feat_idx : grid_stride_range(0, feature_dim))
+                {
+                    const long c = feat_idx / d_model;
+                    const long d = feat_idx % d_model;
+                    const long idx = ((n * num_channels + c) * seq_len + s) * d_model + d;
+                    gradients[idx] *= scale;
                 }
             }
         }
@@ -2956,8 +2976,11 @@ namespace dlib
             float scale_factor
         )
         {
+            const long total_positions = batch_size * seq_len;
+            const long feature_dim = num_channels * d_model;
+
             launch_kernel(_cuda_apply_act_depth_scaling,
-                max_jobs(batch_size * seq_len),
+                max_jobs(feature_dim, total_positions),
                 gradients.device(),
                 n_steps.device(),
                 batch_size,
diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
@@ -5720,7 +5720,7 @@ namespace dlib
             max_steps_(max_steps),
             halt_threshold_(0.99f),     // theta in Graves' notation
             ponder_penalty_(0.01f),     // lambda (ponder cost weight)
-            enable_depth_scaling_(false),
+            enable_depth_scaling_(true),
             batch_size_(0),
             seq_len_(0),
             d_model_(0),
@@ -5857,8 +5857,7 @@ namespace dlib
                     halting_probs_, logits_, input, params,
                     batch_size_, seq_len_, feature_dim_);
 
-                // CRITICAL: Capture effective weights before state update
-                // This ensures numerical precision in backward pass
+                // Capture effective weights before state update
                 const float* p_halt = halting_probs_.host();
                 const float* cum_halt = cum_halt_ptr;
                 const float* remainders = remainders_ptr;
@@ -5911,10 +5910,6 @@ namespace dlib
         void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) {                     
             tensor& input_grad = sub.get_gradient_input();
 
-            // Propagate gradients to input using instrumented effective weights
-            // This approach ensures numerical precision by using the exact weights
-            // computed during the forward pass, avoiding reconstruction errors
-
             const float* grad_in = gradient_input.host();
             const float* eff_weights = true_effective_weights_.host();
             float* grad_out = input_grad.host();