
Commit 85eb2c9 ("Update"), parent 44830f7

6 files changed: +205, -157 lines

dlib/cuda/cpu_dlib.h (65 additions, 59 deletions)
```diff
@@ -772,88 +772,94 @@ namespace dlib
 
     class compute_loss_cross_entropy_per_logit
     {
-        /*! The point of this class is to compute the loss for loss_cross_entropy_per_logit_
-            on the cpu to provide an analogous implementation of the cuda version
+        /*!
+            Computes cross-entropy loss for causal language modeling
+            Uses all sequence positions (except last) for training
+            Each position t predicts the token at position t+1
        !*/
    public:
-        compute_loss_cross_entropy_per_logit()
-        {
-        }
-
+        compute_loss_cross_entropy_per_logit() {}
+
        template <typename const_label_iterator>
        void operator()(
            const_label_iterator truth,
+            const tensor& input_tensor,
            const tensor& output_tensor,
            tensor& grad,
            double& loss
        ) const
        {
-            DLIB_CASSERT(output_tensor.k() == 1,
-                "output_tensor.k() = " << output_tensor.k());
-
+            DLIB_CASSERT(output_tensor.k() == 1);
+            DLIB_CASSERT(input_tensor.k() == 1);
+            DLIB_CASSERT(input_tensor.nc() == 1);
+
            const long batch_size = output_tensor.num_samples();
            const long seq_len = output_tensor.nr();
            const long vocab_size = output_tensor.nc();
-
-            // The loss we output is the average loss over the mini-batch
-            const double scale = 1.0 / batch_size;
+
+            // Normalization over all positions
+            const double scale = 1.0 / (batch_size * seq_len);
+
            loss = 0.0;
-
            const float* out_data = output_tensor.host();
+            const float* in_data = input_tensor.host();
            float* g = grad.host();
-
-            // Zero out all gradients first. Gradients will only be non-zero at the
-            // last position (seq_len-1) of each sequence where the loss is computed
+
            std::fill(g, g + grad.size(), 0.0f);
-
-            // Compute loss and gradients only for the last position of each sequence.
-            // This implements the standard next token prediction objective used in
-            // autoregressive language models
+
            for (long i = 0; i < batch_size; ++i)
            {
-                const unsigned long target_class = *(truth + i);
-
-                // The network must produce a number of outputs that is equal to the number
-                // of labels when using this type of loss
-                DLIB_CASSERT(target_class < static_cast<unsigned long>(vocab_size),
-                    "target_class: " << target_class << ", vocab_size: " << vocab_size);
-
-                // Compute softmax for numerical stability using the log-sum-exp trick.
-                // First, find the maximum value for this position to prevent overflow
-                float max_val = out_data[tensor_index(output_tensor, i, 0, seq_len - 1, 0)];
-                for (long c = 1; c < vocab_size; ++c)
-                {
-                    const float val = out_data[tensor_index(output_tensor, i, 0, seq_len - 1, c)];
-                    max_val = std::max(max_val, val);
-                }
-
-                // Compute exp(x - max) and sum for the softmax denominator
-                float sum_exp = 0;
-                for (long c = 0; c < vocab_size; ++c)
-                {
-                    const unsigned long idx = tensor_index(output_tensor, i, 0, seq_len - 1, c);
-                    const float exp_val = std::exp(out_data[idx] - max_val);
-                    g[idx] = exp_val; // Temporarily store exp values
-                    sum_exp += exp_val;
-                }
-
-                // Normalize to get softmax probabilities, compute loss, and set gradients
-                for (long c = 0; c < vocab_size; ++c)
-                {
-                    const unsigned long idx = tensor_index(output_tensor, i, 0, seq_len - 1, c);
-                    const float softmax_val = g[idx] / sum_exp;
-
-                    if (static_cast<unsigned long>(c) == target_class)
-                    {
-                        // Cross-entropy loss: -log(p(target_class))
-                        loss += scale * (-std::log(std::max(softmax_val, 1e-10f)));
-                        // Gradient for the target class: scale * (p - 1)
-                        g[idx] = scale * (softmax_val - 1.0f);
-                    }
-                    else
-                    {
-                        // Gradient for non-target classes: scale * p
-                        g[idx] = scale * softmax_val;
-                    }
-                }
+                // Loop over all positions (0 to seq_len-1)
+                for (long t = 0; t < seq_len; ++t)
+                {
+                    unsigned long target_class;
+
+                    // Extract target token
+                    if (t < seq_len - 1) {
+                        // For positions 0 to seq_len-2: target from input_tensor[t+1]
+                        target_class = static_cast<unsigned long>(
+                            in_data[tensor_index(input_tensor, i, 0, t + 1, 0)]
+                        );
+                    } else {
+                        // For last position (seq_len-1): target from truth
+                        target_class = *(truth + i);
+                    }
+
+                    DLIB_CASSERT(target_class < static_cast<unsigned long>(vocab_size));
+
+                    // Find max logit for numerical stability
+                    float max_val = out_data[tensor_index(output_tensor, i, 0, t, 0)];
+                    for (long c = 1; c < vocab_size; ++c)
+                    {
+                        const float val = out_data[tensor_index(output_tensor, i, 0, t, c)];
+                        max_val = std::max(max_val, val);
+                    }
+
+                    // Compute softmax denominator
+                    float sum_exp = 0.0f;
+                    for (long c = 0; c < vocab_size; ++c)
+                    {
+                        const unsigned long idx = tensor_index(output_tensor, i, 0, t, c);
+                        const float exp_val = std::exp(out_data[idx] - max_val);
+                        g[idx] = exp_val;
+                        sum_exp += exp_val;
+                    }
+
+                    // Compute loss and gradients
+                    for (long c = 0; c < vocab_size; ++c)
+                    {
+                        const unsigned long idx = tensor_index(output_tensor, i, 0, t, c);
+                        const float softmax_val = g[idx] / sum_exp;
+
+                        if (static_cast<unsigned long>(c) == target_class)
+                        {
+                            loss += scale * (-std::log(std::max(softmax_val, 1e-10f)));
+                            g[idx] = scale * (softmax_val - 1.0f);
+                        }
+                        else
+                        {
+                            g[idx] = scale * softmax_val;
+                        }
+                    }
+                }
            }
        }
```

dlib/cuda/cuda_dlib.cu (46 additions, 35 deletions)
```diff
@@ -3114,6 +3114,7 @@ namespace dlib
        float* loss_out,
        float* g,
        const unsigned long* truth,
+        const float* input_data,
        const float* out_data,
        size_t batch_size,
        size_t seq_len,
```
```diff
@@ -3125,54 +3126,63 @@
 
        for (auto sample_idx : grid_stride_range(0, batch_size))
        {
-            const unsigned long target_class = truth[sample_idx];
-
-            const size_t last_pos = seq_len - 1;
-
-            float max_val = out_data[sample_idx * seq_len * vocab_size + last_pos * vocab_size + 0];
-            for (size_t c = 1; c < vocab_size; ++c)
-            {
-                const size_t idx = sample_idx * seq_len * vocab_size + last_pos * vocab_size + c;
-                max_val = ::max(max_val, out_data[idx]);
-            }
-
-            float sum_exp = 0.0f;
-            for (size_t c = 0; c < vocab_size; ++c)
-            {
-                const size_t idx = sample_idx * seq_len * vocab_size + last_pos * vocab_size + c;
-                const float exp_val = ::exp(out_data[idx] - max_val);
-                g[idx] = exp_val;
-                sum_exp += exp_val;
-            }
-
-            for (size_t c = 0; c < vocab_size; ++c)
-            {
-                const size_t idx = sample_idx * seq_len * vocab_size + last_pos * vocab_size + c;
-                const float softmax_val = g[idx] / sum_exp;
-
-                if (c == target_class)
-                {
-                    total_loss += -::log(::max(softmax_val, 1e-10f));
-                    g[idx] = scale * (softmax_val - 1.0f);
-                }
-                else
-                {
-                    g[idx] = scale * softmax_val;
-                }
-            }
+            for (size_t t = 0; t < seq_len; ++t)
+            {
+                unsigned long target_class;
+
+                if (t < seq_len - 1) {
+                    const size_t input_idx = sample_idx * seq_len + (t + 1);
+                    target_class = static_cast<unsigned long>(input_data[input_idx]);
+                }
+                else {
+                    target_class = truth[sample_idx];
+                }
+
+                const size_t base_idx = sample_idx * seq_len * vocab_size + t * vocab_size;
+                float max_val = out_data[base_idx + 0];
+                for (size_t c = 1; c < vocab_size; ++c)
+                {
+                    max_val = ::max(max_val, out_data[base_idx + c]);
+                }
+
+                float sum_exp = 0.0f;
+                for (size_t c = 0; c < vocab_size; ++c)
+                {
+                    const size_t idx = base_idx + c;
+                    const float exp_val = ::exp(out_data[idx] - max_val);
+                    g[idx] = exp_val;
+                    sum_exp += exp_val;
+                }
+
+                for (size_t c = 0; c < vocab_size; ++c)
+                {
+                    const size_t idx = base_idx + c;
+                    const float softmax_val = g[idx] / sum_exp;
+
+                    if (c == target_class)
+                    {
+                        total_loss += -::log(::max(softmax_val, 1e-10f));
+                        g[idx] = scale * (softmax_val - 1.0f);
+                    }
+                    else
+                    {
+                        g[idx] = scale * softmax_val;
+                    }
+                }
+            }
        }
 
        warp_reduce_atomic_add(*loss_out, total_loss);
    }
 
-    void compute_loss_cross_entropy_per_logit::
-    do_work(
-        cuda_data_ptr<float> loss_work_buffer,
-        cuda_data_ptr<const unsigned long> truth_buffer,
-        const tensor& subnetwork_output,
-        tensor& gradient,
-        double& loss
-    )
+    void compute_loss_cross_entropy_per_logit::do_work(
+        cuda_data_ptr<float> loss_work_buffer,
+        cuda_data_ptr<const unsigned long> truth_buffer,
+        const tensor& input_tensor,
+        const tensor& subnetwork_output,
+        tensor& gradient,
+        double& loss
+    )
    {
        CHECK_CUDA(cudaMemset(gradient.device(), 0, gradient.size() * sizeof(float)));
        CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float)));
```
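Written as an equation, the objective the rewritten kernel (and the CPU path above) computes appears to be the standard causal-LM cross-entropy, averaged over both the batch and the sequence. The notation below is mine, not from the commit: $B$ is the batch size, $T$ the sequence length, $z_{i,t}$ the logit vector for sample $i$ at position $t$, $x_{i,t}$ the input token ids, and $\mathrm{truth}_i$ the per-sample label.

$$
\mathcal{L} \;=\; -\frac{1}{B\,T}\sum_{i=1}^{B}\sum_{t=0}^{T-1} \log \operatorname{softmax}(z_{i,t})_{\,y_{i,t}},
\qquad
y_{i,t} =
\begin{cases}
x_{i,\,t+1} & \text{if } t < T-1 \\
\mathrm{truth}_i & \text{if } t = T-1
\end{cases}
$$

The per-logit gradient is $\frac{\partial \mathcal{L}}{\partial z_{i,t,c}} = \frac{1}{B\,T}\big(\operatorname{softmax}(z_{i,t})_c - \mathbf{1}[c = y_{i,t}]\big)$, which is exactly the `scale * (softmax_val - 1.0f)` / `scale * softmax_val` pair in the code. Note that the kernel accumulates the unscaled $-\log$ terms into `total_loss` and applies the averaging only to the gradient, while the CPU path applies `scale` to both.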
```diff
@@ -3181,12 +3191,13 @@ namespace dlib
        const long seq_len = subnetwork_output.nr();
        const long vocab_size = subnetwork_output.nc();
 
-        const double scale = 1.0 / batch_size;
+        const double scale = 1.0 / (batch_size * seq_len);
 
        launch_kernel(_cuda_compute_loss_cross_entropy_per_logit, max_jobs(batch_size),
            loss_work_buffer.data(),
            gradient.device(),
            truth_buffer.data(),
+            input_tensor.device(),
            subnetwork_output.device(),
            batch_size,
            seq_len,
```
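A quick worked example of the normalization change (the numbers are illustrative, not from the commit): with batch_size = 32 and seq_len = 128, the old scale was 1/32 = 0.03125, while the new scale is 1/(32 * 128) = 1/4096, roughly 0.000244. Each position's gradient contribution is therefore 128 times smaller, but 128 positions per sample now contribute instead of one, so the overall gradient magnitude stays roughly comparable and the reported loss becomes an average per predicted token rather than per sample.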

dlib/cuda/cuda_dlib.h (38 additions, 42 deletions)
```diff
@@ -667,55 +667,51 @@ namespace dlib
 
    // ----------------------------------------------------------------------------------------
 
-    class compute_loss_cross_entropy_per_logit
-    {
-        /*!
-            The point of this class is to compute the loss computed by
-            loss_cross_entropy_per_logit_, but to do so with CUDA.
-        !*/
-    public:
-
-        compute_loss_cross_entropy_per_logit()
-        {
-        }
-
-        template <typename const_label_iterator>
-        void operator() (
-            const_label_iterator truth,
-            const tensor& subnetwork_output,
-            tensor& gradient,
-            double& loss
-        ) const
-        {
-            const size_t bytes_per_sample = sizeof(unsigned long);
-            buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_sample + sizeof(float));
-
-            cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
-            buf = buf+sizeof(float);
-
-            for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
-            {
-                const unsigned long t = *truth;
-                memcpy(buf + i*bytes_per_sample, &t, bytes_per_sample);
-            }
-
-            auto truth_buf = static_pointer_cast<const unsigned long>(buf, subnetwork_output.num_samples());
-
-            do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss);
-        }
-
-    private:
-
-        static void do_work(
-            cuda_data_ptr<float> loss_work_buffer,
-            cuda_data_ptr<const unsigned long> truth_buffer,
-            const tensor& subnetwork_output,
-            tensor& gradient,
-            double& loss
-        );
-
-        mutable cuda_data_void_ptr buf;
-    };
+    class compute_loss_cross_entropy_per_logit
+    {
+        /*!
+            The point of this class is to compute the loss computed by
+            loss_cross_entropy_per_logit_, but to do so with CUDA
+        !*/
+    public:
+        compute_loss_cross_entropy_per_logit() {}
+
+        template <typename const_label_iterator>
+        void operator() (
+            const_label_iterator truth,
+            const tensor& input_tensor,      // Source tokens
+            const tensor& subnetwork_output, // Logits
+            tensor& gradient,
+            double& loss
+        ) const
+        {
+            const size_t bytes_per_sample = sizeof(unsigned long);
+            buf = device_global_buffer(subnetwork_output.num_samples() * bytes_per_sample + sizeof(float));
+            cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
+            buf = buf + sizeof(float);
+
+            for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
+            {
+                const unsigned long t = *truth;
+                memcpy(buf + i * bytes_per_sample, &t, bytes_per_sample);
+            }
+
+            auto truth_buf = static_pointer_cast<const unsigned long>(buf, subnetwork_output.num_samples());
+            do_work(loss_buf, truth_buf, input_tensor, subnetwork_output, gradient, loss);
+        }
+
+    private:
+        static void do_work(
+            cuda_data_ptr<float> loss_work_buffer,
+            cuda_data_ptr<const unsigned long> truth_buffer,
+            const tensor& input_tensor,
+            const tensor& subnetwork_output,
+            tensor& gradient,
+            double& loss
+        );
+
+        mutable cuda_data_void_ptr buf;
+    };
 
    // ----------------------------------------------------------------------------------------
 
```