Applied Code Rabbit automatic recommendations

diaconuccalin · diaconuccalin · commit f65282b70289 · 2025-11-19T00:44:23.000Z
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
@@ -79,13 +79,11 @@ def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkCo
         bufferA = ctxt.lookup(name = parseDict['A'])
         bufferB = ctxt.lookup(name = parseDict['B'])
 
-        tensorsShapeLen = len(bufferA.shape)
-
         # ===== EXTRACT TENSOR DIMS AS VARS =====
         ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
-                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transA'])
+                                                   dimIdx = (len(bufferA.shape) - 1) - parseDict['transA'])
         BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
-                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transB'])
+                                                  dimIdx = (len(bufferB.shape) - 2) + parseDict['transB'])
 
         # ===== ADD CONSTRAINTS =====
         # VIC: We don't want to deal with intermediate results between kernel calls
@@ -111,11 +109,15 @@ def serializeTilingSolution(
         buffB = ctxt.lookup(operatorRepresentation['B'])
         buffOut = ctxt.lookup(operatorRepresentation['data_out'])
 
+        transA = operatorRepresentation['transA']
+        transB = operatorRepresentation['transB']
+
         tensorsShapeLenA = len(buffA.shape)
         tensorsShapeLenB = len(buffB.shape)
         tensorsShapeOutput = len(buffOut.shape)
 
-        NSize = buffA.shape[-1]
+        # NSize depends on transA: if transA=0, N is last dim; if transA=1, N is second-to-last
+        NSize = buffA.shape[-1] if transA == 0 else buffA.shape[-2]
         NOffset = 0
 
         # Prepare input cubes lists
@@ -148,9 +150,13 @@ def serializeTilingSolution(
             replacements["batch"].append(BatchSize)
 
             # ===== Compute A cube information =====
-            #   Matrix offsets and shape
-            AMatrixOffsets = (MOffset, NOffset)
-            AMatrixShape = (MSize, NSize)
+            #   Matrix offsets and shape (swap based on transA)
+            if transA == 0:
+                AMatrixOffsets = (MOffset, NOffset)
+                AMatrixShape = (MSize, NSize)
+            else:
+                AMatrixOffsets = (NOffset, MOffset)
+                AMatrixShape = (NSize, MSize)
 
             #   Batch offset and shape (with broadcasting handling)
             ABatchOffsets = list()
@@ -170,9 +176,13 @@ def serializeTilingSolution(
             inputACubes.append(ACube)
 
             # ===== Compute B cube information =====
-            #   Matrix offsets and shape
-            BMatrixOffsets = (NOffset, OOffset)
-            BMatrixShape = (NSize, OSize)
+            #   Matrix offsets and shape (swap based on transB)
+            if transB == 0:
+                BMatrixOffsets = (NOffset, OOffset)
+                BMatrixShape = (NSize, OSize)
+            else:
+                BMatrixOffsets = (OOffset, NOffset)
+                BMatrixShape = (OSize, NSize)
 
             #   Batch offset and shape (with broadcasting handling)
             BBatchOffsets = list()
@@ -206,7 +216,8 @@ def serializeTilingSolution(
         }
 
         # Update load schedule lists
-        for a, b in zip(inputACubes, inputBCubes):
+        # Use strict=True to fail fast if the two lists have different lengths
+        for a, b in zip(inputACubes, inputBCubes, strict = True):
             inputLoadSchedule.append({"A": a, "B": b})
 
         for out in outputCubes:
diff --git a/TargetLibraries/PULPOpen/src/GELU.c b/TargetLibraries/PULPOpen/src/GELU.c
@@ -17,12 +17,12 @@ void PULP_GELU_fp32_fp32(float32_t *data_in, float32_t *data_out,
   int8_t log2Core = LOG2(NUM_CORES);
 
   // Split into chunks for each core
-  int16_t chunk = (dataSize >> log2Core) + ((dataSize & (NUM_CORES - 1)) != 0);
-  int16_t chunk_start = MIN(chunk * core_id, dataSize);
-  int16_t chunk_stop = MIN(chunk_start + chunk, dataSize);
+  int32_t chunk = (dataSize >> log2Core) + ((dataSize & (NUM_CORES - 1)) != 0);
+  int32_t chunk_start = MIN(chunk * core_id, dataSize);
+  int32_t chunk_stop = MIN(chunk_start + chunk, dataSize);
 
   // Compute GELU on the assigned chunk
-  for (uint32_t i = chunk_start; i < chunk_stop; i++) {
+  for (int32_t i = chunk_start; i < chunk_stop; i++) {
     float32_t x = data_in[i];
     float32_t cdf = 0.5f * (1.0f + tanhf((sqrtf(2.0f / (float)M_PI) *
                                           (x + 0.044715f * powf(x, 3.0f)))));