Skip to content

Commit c4c9447

Browse files
author
Pratyai Mazumder
committed
Merge remote-tracking branch 'origin/main' into f2dace-windmill
2 parents 44b6b63 + 93d8049 commit c4c9447

File tree

30 files changed

+2512
-106
lines changed

30 files changed

+2512
-106
lines changed

.github/workflows/general-ci.yml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ jobs:
5959
else
6060
export DACE_optimizer_automatic_simplification=${{ matrix.simplify }}
6161
fi
62-
pytest -n auto --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "not gpu and not verilator and not tensorflow and not mkl and not sve and not papi and not mlir and not lapack and not fpga and not mpi and not rtl_hardware and not scalapack and not datainstrument and not long"
62+
pytest -n auto --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "not gpu and not verilator and not tensorflow and not mkl and not sve and not papi and not mlir and not lapack and not fpga and not mpi and not rtl_hardware and not scalapack and not datainstrument and not long and not sequential"
6363
./codecov
6464
6565
- name: Test OpenBLAS LAPACK
@@ -78,6 +78,22 @@ jobs:
7878
pytest -n 1 --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "lapack"
7979
./codecov
8080
81+
- name: Run sequential tests
82+
run: |
83+
export NOSTATUSBAR=1
84+
export DACE_testing_serialization=1
85+
export DACE_testing_deserialize_exception=1
86+
export DACE_cache=unique
87+
if [ "${{ matrix.simplify }}" = "autoopt" ]; then
88+
export DACE_optimizer_automatic_simplification=1
89+
export DACE_optimizer_autooptimize=1
90+
echo "Auto-optimization heuristics"
91+
else
92+
export DACE_optimizer_automatic_simplification=${{ matrix.simplify }}
93+
fi
94+
pytest -n 1 --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "sequential"
95+
./codecov
96+
8197
- name: Run other tests
8298
run: |
8399
export NOSTATUSBAR=1

dace/cli/dacelab.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
2-
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
2+
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
33

44
import argparse
55
from dace.frontend.octave import parse

dace/codegen/compiled_sdfg.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
import subprocess
88
from typing import Any, Callable, Dict, List, Tuple, Optional, Type, Union
99
import warnings
10+
import tempfile
11+
import pickle
12+
import sys
1013

1114
import numpy as np
1215
import sympy as sp
@@ -414,6 +417,59 @@ def __call__(self, *args, **kwargs):
414417
# Return values are cached in `self._lastargs`.
415418
return self.fast_call(argtuple, initargtuple, do_gpu_check=True)
416419

420+
def safe_call(self, *args, **kwargs):
    """
    Forwards the Python call to the compiled ``SDFG`` in a separate process to
    avoid crashes in the main process.

    The SDFG, its shared-library path, and the call arguments are pickled to a
    temporary file; a child Python process loads them, reconstructs the
    ``CompiledSDFG``, runs it, and pickles the (potentially mutated) arguments
    back to the same file. Writable array arguments are then copied back
    in-place so the caller observes output values as if the SDFG ran locally.

    :raise RuntimeError: If the child process exits with a nonzero return code.

    .. note:: All arguments must be picklable. ``pickle`` is used here only
              for IPC with a child process we spawn ourselves — never feed
              this path untrusted data.
    """
    # Serialize the request (library path, SDFG, and arguments) for the child.
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f:
        pickle.dump(
            {
                'library_path': self._lib._library_filename,
                'sdfg': self.sdfg,
                'args': args,
                'kwargs': kwargs
            }, f)
        temp_path = f.name

    # The child receives the request path via sys.argv[1] rather than having it
    # f-string-interpolated into the code (which breaks on paths containing
    # quotes or backslashes). It runs the SDFG and overwrites the file with the
    # resulting arguments.
    child_code = '''
import pickle
import sys
from dace.codegen import compiled_sdfg as csd

temp_path = sys.argv[1]
with open(temp_path, "rb") as f:
    data = pickle.load(f)

lib = csd.ReloadableDLL(data['library_path'], data['sdfg'].name)
obj = csd.CompiledSDFG(data['sdfg'], lib, data['sdfg'].arg_names)
obj(*data['args'], **data['kwargs'])

with open(temp_path, "wb") as f:
    pickle.dump({'args': data['args'], 'kwargs': data['kwargs']}, f)
'''

    try:
        # Call the SDFG in a separate process.
        result = subprocess.run([sys.executable, '-c', child_code, temp_path])

        # Check the return code *before* reading the file back: on failure the
        # file still holds the request payload (or a partially-written result),
        # and copying that back would silently clobber the caller's outputs.
        if result.returncode != 0:
            raise RuntimeError(f'SDFG execution failed with return code {result.returncode}.')

        # Receive the result and copy mutated (output) arguments back into the
        # caller's containers in-place.
        with open(temp_path, 'rb') as f:
            data = pickle.load(f)
        for i, arg in enumerate(args):
            if hasattr(arg, '__setitem__'):
                arg[slice(None)] = data['args'][i]
        for key, value in kwargs.items():
            if hasattr(value, '__setitem__'):
                value[slice(None)] = data['kwargs'][key]
    finally:
        # Always clean up the temporary file, even if the child process failed
        # or unpickling raised.
        os.remove(temp_path)
417473
def fast_call(
418474
self,
419475
callargs: Tuple[Any, ...],

dace/codegen/targets/cpp.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,8 +1187,11 @@ def _subscript_expr(self, slicenode: ast.AST, target: str) -> symbolic.SymbolicT
11871187
# - Soft-squeeze the slice (remove unit-modes) to match the treatment of the strides above.
11881188
if target not in self.constants:
11891189
desc = self.sdfg.arrays[dname]
1190-
if isinstance(desc, data.Array) and data._prod(desc.shape) != 1:
1191-
elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1]
1190+
if sum(1 for s in desc.shape if s != 1) != len(visited_slice.elts):
1191+
if isinstance(desc, data.Array) and data._prod(desc.shape) != 1:
1192+
elts = [e for i, e in enumerate(visited_slice.elts) if desc.shape[i] != 1]
1193+
else:
1194+
elts = visited_slice.elts
11921195
else:
11931196
elts = visited_slice.elts
11941197
if len(strides) != len(elts):

dace/frontend/python/newast.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,9 @@ def add_indirection_subgraph(sdfg: SDFG,
536536
for i, idx in enumerate(nonsqz_dims):
537537
newsubset[idx] = '__i%d' % i
538538

539+
# Squeeze size-1 dimensions out of expression
540+
newsubset = [s for shp, s in zip(array.shape, newsubset) if shp != 1]
541+
539542
tasklet.code = CodeBlock(
540543
code.format(arr='__ind_' + local_name, index=', '.join([symbolic.symstr(s) for s in newsubset])))
541544

dace/frontend/python/replacements/linalg.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,6 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op
2828

2929
if len(arr1.shape) > 1 and len(arr2.shape) > 1: # matrix * matrix
3030

31-
if len(arr1.shape) > 3 or len(arr2.shape) > 3:
32-
raise SyntaxError('Matrix multiplication of tensors of dimensions > 3 not supported')
33-
3431
res = symbolic.equal(arr1.shape[-1], arr2.shape[-2])
3532
if res is None:
3633
warnings.warn(
@@ -41,10 +38,12 @@ def _matmult(visitor: ProgramVisitor, sdfg: SDFG, state: SDFGState, op1: str, op
4138

4239
from dace.libraries.blas.nodes.matmul import _get_batchmm_opts
4340

44-
# Determine batched multiplication
41+
# Determine batched multiplication (supports N-D tensors)
4542
bopt = _get_batchmm_opts(arr1.shape, arr1.strides, arr2.shape, arr2.strides, None, None)
4643
if bopt:
47-
output_shape = (bopt['b'], arr1.shape[-2], arr2.shape[-1])
44+
# Multi-dimensional batch: use batch_dims if available, otherwise use flattened batch size
45+
batch_dims = bopt.get('batch_dims', [bopt['b']])
46+
output_shape = tuple(batch_dims) + (arr1.shape[-2], arr2.shape[-1])
4847
else:
4948
output_shape = (arr1.shape[-2], arr2.shape[-1])
5049

dace/libraries/blas/nodes/batched_matmul.py

Lines changed: 63 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,12 @@ def make_sdfg(node, parent_state, parent_sdfg):
3232
UserWarning)
3333
elif not res:
3434
raise SyntaxError("Matrix sizes must match")
35+
36+
# Determine output shape based on batch options
3537
if bopt:
36-
shape_c = (bopt['b'], shape_a[-2], shape_b[-1])
38+
# Use batch dimensions from bopt (may be multi-dimensional)
39+
batch_dims = bopt.get('batch_dims', [bopt['b']])
40+
shape_c = tuple(batch_dims) + (shape_a[-2], shape_b[-1])
3741
else:
3842
shape_c = (shape_a[-2], shape_b[-1])
3943

@@ -64,16 +68,46 @@ def make_sdfg(node, parent_state, parent_sdfg):
6468

6569
state = sdfg.add_state_after(init_state, node.label + "_state")
6670

67-
state.add_mapped_tasklet(
68-
'_BatchedBatchedMatMult_', {
69-
'__i%d' % i: '0:%s' % s
70-
for i, s in enumerate([bopt['b'], array_a.shape[-2], array_b.shape[-1], array_a.shape[-1]])
71-
}, {
72-
'__a': dace.Memlet.simple("_a", ('__i1, __i3' if len(array_a.shape) == 2 else '__i0, __i1, __i3')),
73-
'__b': dace.Memlet.simple("_b", ('__i3, __i2' if len(array_b.shape) == 2 else '__i0, __i3, __i2'))
74-
},
75-
'__c = __a * __b', {'__c': dace.Memlet.simple("_c", '__i0, __i1, __i2', wcr_str='lambda x, y: x + y')},
76-
external_edges=True)
71+
# Calculate number of batch dimensions in output
72+
num_batch_dims = len(shape_c) - 2
73+
74+
# Build map parameters: batch dimensions + M, N, K
75+
map_params = {}
76+
for i in range(num_batch_dims):
77+
map_params['__i%d' % i] = '0:%s' % symstr(shape_c[i])
78+
79+
# M, N, K dimensions
80+
map_params['__im'] = '0:%s' % symstr(shape_a[-2])
81+
map_params['__in'] = '0:%s' % symstr(shape_b[-1])
82+
map_params['__ik'] = '0:%s' % symstr(shape_a[-1])
83+
84+
# Build memlet access patterns
85+
# For A: if 2D, use [M, K]; if 3D+, use [batch_indices..., M, K]
86+
if len(array_a.shape) == 2:
87+
memlet_a = '__im, __ik'
88+
else:
89+
# Use output batch indices
90+
a_batch_indices = ', '.join(['__i%d' % i for i in range(len(array_a.shape) - 2)])
91+
memlet_a = f'{a_batch_indices}, __im, __ik'
92+
93+
# For B: if 2D, use [K, N]; if 3D+, use [batch_indices..., K, N]
94+
if len(array_b.shape) == 2:
95+
memlet_b = '__ik, __in'
96+
else:
97+
b_batch_indices = ', '.join(['__i%d' % i for i in range(len(array_b.shape) - 2)])
98+
memlet_b = f'{b_batch_indices}, __ik, __in'
99+
100+
# For C: always has batch dimensions
101+
c_indices = ', '.join(['__i%d' % i for i in range(num_batch_dims)]) + ', __im, __in'
102+
103+
state.add_mapped_tasklet('_BatchedMatMult_',
104+
map_params, {
105+
'__a': dace.Memlet.simple("_a", memlet_a),
106+
'__b': dace.Memlet.simple("_b", memlet_b)
107+
},
108+
'__c = __a * __b',
109+
{'__c': dace.Memlet.simple("_c", c_indices, wcr_str='lambda x, y: x + y')},
110+
external_edges=True)
77111

78112
return sdfg
79113

@@ -441,20 +475,31 @@ def validate(self, sdfg, state):
441475
raise ValueError("Expected exactly one output from "
442476
"batched matrix-matrix product")
443477
out_memlet = out_edges[0].data
444-
# Function is symmetric, edge order does not matter
445-
if len(size0) not in [2, 3]:
446-
raise ValueError("Batched matrix-matrix product only supported on matrices")
447-
if len(size1) != 3:
448-
raise ValueError("Batched matrix-matrix product only supported on matrices")
478+
479+
# Both inputs must be at least 2D
480+
if len(size0) < 2:
481+
raise ValueError(f"First input must be at least 2D, got shape with {len(size0)} dimensions")
482+
if len(size1) < 2:
483+
raise ValueError(f"Second input must be at least 2D, got shape with {len(size1)} dimensions")
484+
485+
# At least one input must have batch dimensions (3D or higher) for batched operation
486+
if len(size0) <= 2 and len(size1) <= 2:
487+
raise ValueError(
488+
"Batched matrix-matrix product requires at least one input to have batch dimensions (3D or higher)")
489+
490+
# Validate K-dimension compatibility
449491
res = equal(size0[-1], size1[-2])
450492
if res is None:
451493
warnings.warn(
452494
f'First tensor\'s last mode {size0[-1]} and second tensor\'s second-last mode {size1[-2]} '
453495
f'may not match', UserWarning)
454496
elif not res:
455497
raise ValueError("Inputs to matrix-matrix product must agree in the k-dimension")
456-
if len(out_memlet.subset) != 3:
457-
raise ValueError("batched matrix-matrix product only supported on matrices")
498+
499+
# Output must have batch dimensions
500+
if len(out_memlet.subset) < 3:
501+
raise ValueError(
502+
f"Batched matrix-matrix product output must be at least 3D, got {len(out_memlet.subset)} dimensions")
458503

459504

460505
# Numpy replacement

0 commit comments

Comments
 (0)