Improve Profiling (#138)

Xeratec · web-flow · commit f7927222d99e · 2025-12-24T12:50:06.000+01:00
This PR improves the log output for tiled executions by splitting the reported time into kernel execution time and pre- and post-kernel time. This makes it possible to directly assess the control overhead of an execution.

As the new "Siracusa (Tiled, L3) FloatGEMM" example below shows, the L2-L1 overhead is minimal, while the L3-L2 overhead is rather large. This makes sense, as the DMA is implemented in a blocking fashion.

## Added
- Calculate non-kernel overhead and show total time spent during profiling

## Changed
- Profile all memory levels
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 ## Unreleased (Planned Release Target: v0.2.1)
 
 ### List of Pull Requests
+- Improve Profiling [#138](https://github.com/pulp-platform/Deeploy/pull/138)
 - FP32 ReduceMean operator improvement [#137](https://github.com/pulp-platform/Deeploy/pull/137)
 - Support for RMSNorm (Pow and Sqrt operators) [#136](https://github.com/pulp-platform/Deeploy/pull/136)
 - Demo TinyViT compatibility with tiled Siracusa [#124](https://github.com/pulp-platform/Deeploy/pull/124)
@@ -81,6 +82,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy`
 - PULPClusterEngine now accepts a `n_cores` parameter to set the number of cores used
 - annotateNCores method to PULPDeployer that adds an `n_cores` key to all PULPClusterEngine templates' operatorRepresentations
+- Calculate non-kernel overhead and show total time spent during profiling
 
 ### Changed
 - Structure of Tests subdir for improved ordering
@@ -123,6 +125,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Added missing shape annotation to the testTypeInferenceDifferentTypes
 - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode
 - changed `_mapNode` to `_selectEngine` which reduces the responsibility of that function to, as the name states, just engine selection
+- Print kernel profiling information for all memory levels
 
 ### Fixed
 - Fixed ReduceMean parallelization and tiling issues described in Issue [#134](https://github.com/pulp-platform/Deeploy/issues/134).
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
@@ -276,17 +276,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
         teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Deinitialize DMA future"}))
         teardownStatements.extend(f.deinit() for f in ingressFutures | egressFutures)
 
-        metaInfo = TilingMetaInfo(
-            nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
-            nodeOps = operatorRepresentation['nodeOps'],
-            numTiles = operatorRepresentation['numTiles'],
-            totalNumTiles = len(tilingSchedule.outputLoadSchedule),
-            tileIdxPtr = operatorRepresentation['tileIdxPtr'],
-            tileIdxVar = "TILING_I",
-            # TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel.
-            #       The current implementation does this by checking whether we are at the lowest memory level,
-            #       which is hardcoded by the value "L1". Change this to be memory level agnostic.
-            kernelLevelTiling = self.localMemory == "L1")
+        metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
+                                  nodeOps = operatorRepresentation['nodeOps'],
+                                  numTiles = operatorRepresentation['numTiles'],
+                                  totalNumTiles = len(tilingSchedule.outputLoadSchedule),
+                                  tileIdxPtr = operatorRepresentation['tileIdxPtr'],
+                                  tileIdxVar = "TILING_I",
+                                  kernelLevelTiling = True)
 
         executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements,
                                                     openLoopStatements, closeLoopStatements, setupStatements,
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
@@ -117,17 +117,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
 
         closeLoopStatements = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})]
 
-        metaInfo = TilingMetaInfo(
-            nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
-            nodeOps = operatorRepresentation['nodeOps'],
-            numTiles = operatorRepresentation['numTiles'],
-            totalNumTiles = len(tilingSchedule.outputLoadSchedule),
-            tileIdxPtr = operatorRepresentation['tileIdxPtr'],
-            tileIdxVar = "TILING_I",
-            # TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel.
-            #       The current implementation does this by checking whether we are at the lowest memory level,
-            #       which is hardcoded by the value "L1". Change this to be memory level agnostic.
-            kernelLevelTiling = self.localMemory == "L1")
+        metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
+                                  nodeOps = operatorRepresentation['nodeOps'],
+                                  numTiles = operatorRepresentation['numTiles'],
+                                  totalNumTiles = len(tilingSchedule.outputLoadSchedule),
+                                  tileIdxPtr = operatorRepresentation['tileIdxPtr'],
+                                  tileIdxVar = "TILING_I",
+                                  kernelLevelTiling = True)
 
         executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements,
                                                     openLoopStatements, closeLoopStatements, setupStatements,
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
@@ -79,14 +79,27 @@ class ProfilingPrototypeMixIn(ABC):
 
     _printLoopSetup = NodeTemplate("""
     StopTimer();
+    printf("===== Profiling ${nodeName} =====\\n");
     for (int ${profileIdxVar} = ((*${tileIdxPtr} > 0) ? ${numTiles}[(*${tileIdxPtr} - 1)] : 0);
         ${profileIdxVar} < ${numTiles}[*${tileIdxPtr}];
         ${profileIdxVar}++){
     """)
 
-    _printCycleDifference = NodeTemplate(r"""
-    printf("%s%u] %s%u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \
-    ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr});
+    _measurementDeclaration = NodeTemplate("""
+    uint32_t ${measurement} = ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}];
+    """)
+
+    _printCycleDifference = NodeTemplate("""
+    printf("%s%u] %s%6u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \
+    ${measurement}, ${suffixStr});
+    """)
+
+    _printCycleContribution = NodeTemplate("""
+    uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput};
+    uint32_t dma = ${measurementInput} + ${measurementOutput};
+    float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total;
+    float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total;
+    printf("%s%u] Total      :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\\n", ${prefixStr}, ${profileIdxVar}, total, kernel_percentage, overhead_percentage    , ${measurementKernel}, dma);
     """)
 
     _printLoopTeardown = NodeTemplate("""
@@ -151,13 +164,37 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
             "tileIdxPtr": tileIdxPtr,
         })
 
+        executionBlock.addRight(
+            cls._measurementDeclaration, {
+                "measurement": f"{nodeName}_ingress_dma_wait_measurement",
+                "measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements",
+                "measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements",
+                "profileIdxVar": profileIdxVar,
+            })
+
+        if metaInfo.kernelLevelTiling:
+            executionBlock.addRight(
+                cls._measurementDeclaration, {
+                    "measurement": f"{nodeName}_kernel_measurement",
+                    "measurementsStart": f"{nodeName}_kernel_start_measurements",
+                    "measurementsEnd": f"{nodeName}_kernel_end_measurements",
+                    "profileIdxVar": profileIdxVar,
+                })
+
+        executionBlock.addRight(
+            cls._measurementDeclaration, {
+                "measurement": f"{nodeName}_egress_dma_wait_measurement",
+                "measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements",
+                "measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements",
+                "profileIdxVar": profileIdxVar,
+            })
+
         executionBlock.addRight(
             cls._printCycleDifference, {
                 "prefixStr": f"{nodeName}_prefix",
                 "suffixStr": f"{nodeName}_suffix",
-                "flavorStr": "Input DMA took ",
-                "measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements",
-                "measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements",
+                "flavorStr": "Pre-Kernel :",
+                "measurement": f"{nodeName}_ingress_dma_wait_measurement",
                 "profileIdxVar": profileIdxVar,
             })
 
@@ -166,22 +203,32 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
                 cls._printCycleDifference, {
                     "prefixStr": f"{nodeName}_prefix",
                     "suffixStr": f"{nodeName}_suffix",
-                    "flavorStr": "Kernel took ",
-                    "measurementsStart": f"{nodeName}_kernel_start_measurements",
-                    "measurementsEnd": f"{nodeName}_kernel_end_measurements",
+                    "flavorStr": "Kernel     :",
+                    "measurement": f"{nodeName}_kernel_measurement",
                     "profileIdxVar": profileIdxVar,
                 })
 
         executionBlock.addRight(
             cls._printCycleDifference, {
                 "prefixStr": f"{nodeName}_prefix",
                 "suffixStr": f"{nodeName}_suffix",
-                "flavorStr": "Output DMA took ",
-                "measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements",
-                "measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements",
+                "flavorStr": "Post-Kernel:",
+                "measurement": f"{nodeName}_egress_dma_wait_measurement",
                 "profileIdxVar": profileIdxVar,
             })
 
+        # Total Time: Input + Kernel + Output
+        # Overhead: (Input + Output) / Total
+        if metaInfo.kernelLevelTiling:
+            executionBlock.addRight(
+                cls._printCycleContribution, {
+                    "prefixStr": f"{nodeName}_prefix",
+                    "measurementInput": f"{nodeName}_ingress_dma_wait_measurement",
+                    "measurementKernel": f"{nodeName}_kernel_measurement",
+                    "measurementOutput": f"{nodeName}_egress_dma_wait_measurement",
+                    "profileIdxVar": profileIdxVar,
+                })
+
         executionBlock.addRight(cls._printLoopTeardown, {})
 
         return executionBlock