Commit 8bde169

feat: Optimize N-dimensional tensor trace calculation
Optimized the trace function in ein_sum_impl.rs to iterate directly over the contributing diagonal elements and their batches. This change avoids a full tensor scan and inefficient index reconstruction, leading to a more efficient trace calculation for N-dimensional tensors.

- Refactored the internal iteration logic within the trace function to leverage direct index calculation using tensor strides.
- Eliminated the redundant full tensor iteration and the conditional checks for diagonal elements.
- Ensured correctness with existing test cases.
- Implemented a helper function for test utilities.
- Fixed a warning.
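For illustration, here is a minimal standalone sketch of the same stride-based technique, written against a plain slice rather than the crate's `CausalTensor` type. The `trace_nd` name and signature are hypothetical, invented for this example; only the stride arithmetic mirrors the committed code.

```rust
/// Illustrative sketch: trace of an N-dimensional row-major tensor over
/// `axis1`/`axis2`, visiting only the contributing diagonal elements.
fn trace_nd(
    data: &[f64],
    shape: &[usize],
    strides: &[usize],
    axis1: usize,
    axis2: usize,
) -> Vec<f64> {
    assert_eq!(shape[axis1], shape[axis2], "traced axes must match in length");
    let diag_len = shape[axis1];

    // Axes that survive the trace ("batch" axes).
    let batch_axes: Vec<usize> = (0..shape.len())
        .filter(|&i| i != axis1 && i != axis2)
        .collect();
    let num_batch: usize = batch_axes.iter().map(|&ax| shape[ax]).product();

    // Both traced indices advance together, so each diagonal step adds a
    // fixed amount to the flat index.
    let diag_step = strides[axis1] + strides[axis2];

    let mut result = Vec::with_capacity(num_batch);
    let mut idx = vec![0usize; batch_axes.len()];
    for _ in 0..num_batch {
        // Flat offset of the current batch slice.
        let offset: usize = batch_axes
            .iter()
            .zip(&idx)
            .map(|(&ax, &i)| i * strides[ax])
            .sum();
        result.push((0..diag_len).map(|i| data[offset + i * diag_step]).sum());

        // Odometer-style increment of the batch indices.
        for k in (0..idx.len()).rev() {
            idx[k] += 1;
            if idx[k] < shape[batch_axes[k]] {
                break;
            }
            idx[k] = 0;
        }
    }
    result
}
```

The key point is that no per-element index vector has to be rebuilt: each diagonal step advances the flat index by `strides[axis1] + strides[axis2]`, and only the batch indices are incremented between slices.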
1 parent 038de90 commit 8bde169

File tree

1 file changed: +32 additions, −17 deletions
  • deep_causality_tensor/src/types/causal_tensor/op_tensor_ein_sum


deep_causality_tensor/src/types/causal_tensor/op_tensor_ein_sum/ein_sum_impl.rs

Lines changed: 32 additions & 17 deletions
@@ -363,28 +363,43 @@ where
         }
 
         let mut result_tensor = CausalTensor::full(&new_shape, T::default());
-        let mut current_index = vec![0; tensor.num_dim()];
-
-        for i in 0..tensor.len() {
-            if current_index[axis1] == current_index[axis2] {
-                let result_index: Vec<usize> = current_index
-                    .iter()
-                    .enumerate()
-                    .filter(|&(i, _)| i != axis1 && i != axis2)
-                    .map(|(_, &val)| val)
-                    .collect();
-
-                if let Some(res_val) = result_tensor.get_mut(&result_index) {
-                    *res_val = res_val.clone() + tensor.data[i].clone();
+        let diag_len = tensor.shape[axis1];
+
+        let mut batch_axes = Vec::new();
+        for i in 0..tensor.num_dim() {
+            if i != axis1 && i != axis2 {
+                batch_axes.push(i);
+            }
+        }
+
+        let num_batch_elements: usize = batch_axes.iter().map(|&ax| tensor.shape[ax]).product();
+        let mut current_batch_indices = vec![0; batch_axes.len()];
+
+        for _ in 0..num_batch_elements {
+            let result_index = current_batch_indices.clone();
+            if let Some(res_val) = result_tensor.get_mut(&result_index) {
+                let mut batch_offset = 0;
+                for (k, &batch_axis) in batch_axes.iter().enumerate() {
+                    batch_offset += current_batch_indices[k] * tensor.strides[batch_axis];
+                }
+
+                let mut diag_sum = T::default();
+                for i in 0..diag_len {
+                    let flat_index = batch_offset + i * tensor.strides[axis1] + i * tensor.strides[axis2];
+                    diag_sum = diag_sum + tensor.data[flat_index].clone();
                 }
+                *res_val = diag_sum;
             }
 
-            for j in (0..tensor.num_dim()).rev() {
-                current_index[j] += 1;
-                if current_index[j] < tensor.shape[j] {
+            // Increment batch indices
+            let mut k = batch_axes.len();
+            while k > 0 {
+                k -= 1;
+                current_batch_indices[k] += 1;
+                if current_batch_indices[k] < tensor.shape[batch_axes[k]] {
                     break;
                 }
-                current_index[j] = 0;
+                current_batch_indices[k] = 0;
             }
         }
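As a sanity check on the index arithmetic, using the hypothetical `trace_nd` sketch above: for a row-major tensor of shape [2, 3, 3] traced over axes 1 and 2, the strides are [9, 3, 1], each diagonal step advances the flat index by 3 + 1 = 4, and batch b starts at offset 9b. The new loop therefore touches 2 × 3 = 6 elements instead of scanning all 18.

```rust
fn main() {
    // Two stacked 3x3 matrices, row-major: strides are [9, 3, 1].
    let shape = [2, 3, 3];
    let strides = [9, 3, 1];
    let data: Vec<f64> = (0..18).map(f64::from).collect();

    // Batch 0 diagonal sits at flat indices 0, 4, 8   -> 0 + 4 + 8   = 12
    // Batch 1 diagonal sits at flat indices 9, 13, 17 -> 9 + 13 + 17 = 39
    let traces = trace_nd(&data, &shape, &strides, 1, 2);
    assert_eq!(traces, vec![12.0, 39.0]);
}
```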