using namespace sycl;

-// SSM_CONV kernel: State Space Model Convolution 1D
-// This implements a sliding window convolution with history context
static void kernel_ssm_conv(
    queue &q,
-    const float *src_data,           // input sequence [d_conv-1+n_t, d_inner, n_s]
-    const float *weights,            // convolution weights [d_conv, d_inner]
-    float *dst_data,                 // output [d_inner, n_t, n_s]
-    int d_conv,                      // convolution window size
-    int d_inner,                     // number of inner channels
-    int n_t,                         // number of tokens to process
-    int n_s,                         // batch size (number of sequences)
-    int ncs __attribute__((unused)), // input sequence length (d_conv-1+n_t)
-    int src_stride_inner,            // stride between channels in src
-    int src_stride_seq,              // stride between sequences in src
-    int dst_stride_token,            // stride between tokens in dst
-    int dst_stride_seq               // stride between sequences in dst
+    const float *src_data,
+    const float *weights,
+    float *dst_data,
+    int d_conv,
+    int d_inner,
+    int n_t,
+    int n_s,
+    int ncs __attribute__((unused)),
+    int src_stride_inner,
+    int src_stride_seq,
+    int dst_stride_token,
+    int dst_stride_seq
) {
-    // Each work item handles one (channel, token, sequence) combination
    const size_t total_work = d_inner * n_t * n_s;
    const size_t work_group_size = 256;
    const size_t num_work_groups = (total_work + work_group_size - 1) / work_group_size;
@@ -34,31 +31,18 @@ static void kernel_ssm_conv(

    if (idx >= total_work) return;

-    // Decode indices: idx = seq * (d_inner * n_t) + token * d_inner + channel
    const int channel = idx % d_inner;
    const int token = (idx / d_inner) % n_t;
    const int seq = idx / (d_inner * n_t);

-    // Calculate input starting position for this token and channel
-    // Input layout: [d_conv-1+n_t, d_inner, n_s]
-    // Following CPU implementation: s[i0 + i1*ncs] where i0 is conv position, i1 is channel
-    // Note: s pointer is offset by token position for sliding window
    const float *s = src_data + seq * src_stride_seq + channel * src_stride_inner + token;
-
-    // Get weights for this channel
-    // Weights layout: [d_conv, d_inner]
-    // Following CPU implementation: c[i0 + i1*nc] where i0 is conv position, i1 is channel
    const float *c = weights + channel * d_conv;

-    // Perform dot product: sum(input_window * weights)
-    // Following CPU implementation exactly
    float sumf = 0.0f;
    for (int i0 = 0; i0 < d_conv; ++i0) {
-        sumf += s[i0] * c[i0]; // s[i0 + i1*ncs] * c[i0 + i1*nc]
+        sumf += s[i0] * c[i0];
    }

-    // Write result to output
-    // Output layout: [d_inner, n_t, n_s]
    const size_t dst_idx = seq * dst_stride_seq +
                           token * dst_stride_token +
                           channel;
@@ -68,41 +52,34 @@ static void kernel_ssm_conv(
}

void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * src0 = dst->src[0]; // conv_x: input sequence
-    ggml_tensor * src1 = dst->src[1]; // conv1d.weight: convolution weights
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    // Extract dimensions following CPU implementation
-    const int d_conv = src1->ne[0];   // convolution window size
-    const int ncs = src0->ne[0];      // d_conv - 1 + n_t (input sequence length)
-    const int d_inner = src0->ne[1];  // number of inner channels
-    const int n_t = dst->ne[1];       // number of tokens to process
-    const int n_s = dst->ne[2];       // batch size (number of sequences)
+    const int d_conv = src1->ne[0];
+    const int ncs = src0->ne[0];
+    const int d_inner = src0->ne[1];
+    const int n_t = dst->ne[1];
+    const int n_s = dst->ne[2];

-    // Verify dimensions match CPU implementation exactly
-    GGML_ASSERT(src0->ne[0] == d_conv - 1 + n_t); // input length
-    GGML_ASSERT(src0->ne[1] == d_inner);          // channels match
-    GGML_ASSERT(src1->ne[1] == d_inner);          // weight channels match
-    GGML_ASSERT(dst->ne[0] == d_inner);           // output channels
-    GGML_ASSERT(dst->ne[1] == n_t);               // output tokens
-    GGML_ASSERT(dst->ne[2] == n_s);               // output sequences
+    GGML_ASSERT(src0->ne[0] == d_conv - 1 + n_t);
+    GGML_ASSERT(src0->ne[1] == d_inner);
+    GGML_ASSERT(src1->ne[1] == d_inner);
+    GGML_ASSERT(dst->ne[0] == d_inner);
+    GGML_ASSERT(dst->ne[1] == n_t);
+    GGML_ASSERT(dst->ne[2] == n_s);

-    // Verify stride assumptions (from CPU implementation)
    GGML_ASSERT(src0->nb[0] == sizeof(float));
    GGML_ASSERT(src1->nb[0] == sizeof(float));
    GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));

-    // Calculate strides based on tensor layout (in elements, not bytes)
-    // src0: [d_conv-1+n_t, d_inner, n_s] - input sequence
-    const int src_stride_inner = ncs;          // stride between channels in elements
-    const int src_stride_seq = ncs * d_inner;  // stride between sequences in elements
-
-    // dst: [d_inner, n_t, n_s] - output
-    const int dst_stride_token = d_inner;      // stride between tokens in elements
-    const int dst_stride_seq = d_inner * n_t;  // stride between sequences in elements
+    const int src_stride_inner = ncs;
+    const int src_stride_seq = ncs * d_inner;
+    const int dst_stride_token = d_inner;
+    const int dst_stride_seq = d_inner * n_t;

    try {
        queue *q = ctx.stream();
@@ -113,7 +90,6 @@ void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

        GGML_ASSERT(src_data && weights && dst_data);

-        // Launch kernel
        kernel_ssm_conv(
            *q, src_data, weights, dst_data,
            d_conv, d_inner, n_t, n_s, ncs,
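Note: the comments stripped by this change describe the computation the kernel performs. For reference, a minimal host-side C++ sketch of the same per-element convolution, written against the strides set up in ggml_sycl_ssm_conv (the name ssm_conv_reference and the plain nested loops are illustrative assumptions, not code from this patch):

// Illustrative sketch only: sliding-window conv over a padded input of
// length ncs = d_conv - 1 + n_t, one dot product per (seq, token, channel).
static void ssm_conv_reference(const float *src, const float *w, float *dst,
                               int d_conv, int d_inner, int n_t, int n_s) {
    const int ncs = d_conv - 1 + n_t;  // padded input length per channel
    for (int seq = 0; seq < n_s; ++seq) {
        for (int token = 0; token < n_t; ++token) {
            for (int ch = 0; ch < d_inner; ++ch) {
                // src layout [ncs, d_inner, n_s]: window starts at `token`
                const float *s = src + seq * ncs * d_inner + ch * ncs + token;
                // weights layout [d_conv, d_inner]
                const float *c = w + ch * d_conv;
                float sumf = 0.0f;
                for (int i0 = 0; i0 < d_conv; ++i0) {
                    sumf += s[i0] * c[i0];
                }
                // dst layout [d_inner, n_t, n_s]
                dst[seq * d_inner * n_t + token * d_inner + ch] = sumf;
            }
        }
    }
}

Each SYCL work item in kernel_ssm_conv computes one (channel, token, sequence) output, i.e. one iteration of the three outer loops above.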