Skip to content

Commit f792722

Browse files
authored
Improve Profiling (#138)
This PR improves the log output for tiled executions to split it into kernel execution and pre- and post-kernel time. This is useful to directly assess the control overhead of an execution. As you can see in the new "Siracusa (Tiled, L3) FloatGEMM" example below, we can conclude that the L2-L1 overhead is minimal while the L3-L2 overhead is rather large. This makes sense as the DMA is implemented in a blocking fashion. ## Added - Calculate non-kernel overhead and show total time spent during profiling ## Changed - Profile all memory levels
1 parent 6db943f commit f792722

File tree

4 files changed

+76
-34
lines changed

4 files changed

+76
-34
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
44
## Unreleased (Planned Release Target: v0.2.1)
55

66
### List of Pull Requests
7+
- Improve Profiling [#138](https://github.com/pulp-platform/Deeploy/pull/138)
78
- FP32 ReduceMean operator improvement [#137](https://github.com/pulp-platform/Deeploy/pull/137)
89
- Support for RMSNorm (Pow and Sqrt operators) [#136](https://github.com/pulp-platform/Deeploy/pull/136)
910
- Demo TinyViT compatibility with tiled Siracusa [#124](https://github.com/pulp-platform/Deeploy/pull/124)
@@ -81,6 +82,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
8182
- Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy`
8283
- PULPClusterEngine now accepts a `n_cores` parameter to set the number of cores used
8384
- annotateNCores method to PULPDeployer that adds an `n_cores` key to all PULPClusterEngine templates' operatorRepresentations
85+
- Calculate non-kernel overhead and show total time spent during profiling
8486

8587
### Changed
8688
- Structure of Tests subdir for improved ordering
@@ -123,6 +125,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
123125
- Added missing shape annotation to the testTypeInferenceDifferentTypes
124126
- Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode
125127
- changed `_mapNode` to `_selectEngine` which reduces the responsibility of that function to, as the name states, just engine selection
128+
- Print kernel profiling information for all memory levels
126129

127130
### Fixed
128131
- Fixed ReduceMean parallelization and tiling issues described in Issue [#134](https://github.com/pulp-platform/Deeploy/issues/134).

Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -276,17 +276,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
276276
teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Deinitialize DMA future"}))
277277
teardownStatements.extend(f.deinit() for f in ingressFutures | egressFutures)
278278

279-
metaInfo = TilingMetaInfo(
280-
nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
281-
nodeOps = operatorRepresentation['nodeOps'],
282-
numTiles = operatorRepresentation['numTiles'],
283-
totalNumTiles = len(tilingSchedule.outputLoadSchedule),
284-
tileIdxPtr = operatorRepresentation['tileIdxPtr'],
285-
tileIdxVar = "TILING_I",
286-
# TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel.
287-
# The current implementation does this by checking whether we are at the lowest memory level,
288-
# which is hardcoded by the value "L1". Change this to be memory level agnostic.
289-
kernelLevelTiling = self.localMemory == "L1")
279+
metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
280+
nodeOps = operatorRepresentation['nodeOps'],
281+
numTiles = operatorRepresentation['numTiles'],
282+
totalNumTiles = len(tilingSchedule.outputLoadSchedule),
283+
tileIdxPtr = operatorRepresentation['tileIdxPtr'],
284+
tileIdxVar = "TILING_I",
285+
kernelLevelTiling = True)
290286

291287
executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements,
292288
openLoopStatements, closeLoopStatements, setupStatements,

Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -117,17 +117,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
117117

118118
closeLoopStatements = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})]
119119

120-
metaInfo = TilingMetaInfo(
121-
nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
122-
nodeOps = operatorRepresentation['nodeOps'],
123-
numTiles = operatorRepresentation['numTiles'],
124-
totalNumTiles = len(tilingSchedule.outputLoadSchedule),
125-
tileIdxPtr = operatorRepresentation['tileIdxPtr'],
126-
tileIdxVar = "TILING_I",
127-
# TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel.
128-
# The current implementation does this by checking whether we are at the lowest memory level,
129-
# which is hardcoded by the value "L1". Change this to be memory level agnostic.
130-
kernelLevelTiling = self.localMemory == "L1")
120+
metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
121+
nodeOps = operatorRepresentation['nodeOps'],
122+
numTiles = operatorRepresentation['numTiles'],
123+
totalNumTiles = len(tilingSchedule.outputLoadSchedule),
124+
tileIdxPtr = operatorRepresentation['tileIdxPtr'],
125+
tileIdxVar = "TILING_I",
126+
kernelLevelTiling = True)
131127

132128
executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements,
133129
openLoopStatements, closeLoopStatements, setupStatements,

Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py

Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,27 @@ class ProfilingPrototypeMixIn(ABC):
7979

8080
_printLoopSetup = NodeTemplate("""
8181
StopTimer();
82+
printf("===== Profiling ${nodeName} =====\\n");
8283
for (int ${profileIdxVar} = ((*${tileIdxPtr} > 0) ? ${numTiles}[(*${tileIdxPtr} - 1)] : 0);
8384
${profileIdxVar} < ${numTiles}[*${tileIdxPtr}];
8485
${profileIdxVar}++){
8586
""")
8687

87-
_printCycleDifference = NodeTemplate(r"""
88-
printf("%s%u] %s%u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \
89-
${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr});
88+
_measurementDeclaration = NodeTemplate("""
89+
uint32_t ${measurement} = ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}];
90+
""")
91+
92+
_printCycleDifference = NodeTemplate("""
93+
printf("%s%u] %s%6u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \
94+
${measurement}, ${suffixStr});
95+
""")
96+
97+
_printCycleContribution = NodeTemplate("""
98+
uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput};
99+
uint32_t dma = ${measurementInput} + ${measurementOutput};
100+
float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total;
101+
float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total;
102+
printf("%s%u] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\\n", ${prefixStr}, ${profileIdxVar}, total, kernel_percentage, overhead_percentage , ${measurementKernel}, dma);
90103
""")
91104

92105
_printLoopTeardown = NodeTemplate("""
@@ -151,13 +164,37 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
151164
"tileIdxPtr": tileIdxPtr,
152165
})
153166

167+
executionBlock.addRight(
168+
cls._measurementDeclaration, {
169+
"measurement": f"{nodeName}_ingress_dma_wait_measurement",
170+
"measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements",
171+
"measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements",
172+
"profileIdxVar": profileIdxVar,
173+
})
174+
175+
if metaInfo.kernelLevelTiling:
176+
executionBlock.addRight(
177+
cls._measurementDeclaration, {
178+
"measurement": f"{nodeName}_kernel_measurement",
179+
"measurementsStart": f"{nodeName}_kernel_start_measurements",
180+
"measurementsEnd": f"{nodeName}_kernel_end_measurements",
181+
"profileIdxVar": profileIdxVar,
182+
})
183+
184+
executionBlock.addRight(
185+
cls._measurementDeclaration, {
186+
"measurement": f"{nodeName}_egress_dma_wait_measurement",
187+
"measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements",
188+
"measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements",
189+
"profileIdxVar": profileIdxVar,
190+
})
191+
154192
executionBlock.addRight(
155193
cls._printCycleDifference, {
156194
"prefixStr": f"{nodeName}_prefix",
157195
"suffixStr": f"{nodeName}_suffix",
158-
"flavorStr": "Input DMA took ",
159-
"measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements",
160-
"measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements",
196+
"flavorStr": "Pre-Kernel :",
197+
"measurement": f"{nodeName}_ingress_dma_wait_measurement",
161198
"profileIdxVar": profileIdxVar,
162199
})
163200

@@ -166,22 +203,32 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
166203
cls._printCycleDifference, {
167204
"prefixStr": f"{nodeName}_prefix",
168205
"suffixStr": f"{nodeName}_suffix",
169-
"flavorStr": "Kernel took ",
170-
"measurementsStart": f"{nodeName}_kernel_start_measurements",
171-
"measurementsEnd": f"{nodeName}_kernel_end_measurements",
206+
"flavorStr": "Kernel :",
207+
"measurement": f"{nodeName}_kernel_measurement",
172208
"profileIdxVar": profileIdxVar,
173209
})
174210

175211
executionBlock.addRight(
176212
cls._printCycleDifference, {
177213
"prefixStr": f"{nodeName}_prefix",
178214
"suffixStr": f"{nodeName}_suffix",
179-
"flavorStr": "Output DMA took ",
180-
"measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements",
181-
"measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements",
215+
"flavorStr": "Post-Kernel:",
216+
"measurement": f"{nodeName}_egress_dma_wait_measurement",
182217
"profileIdxVar": profileIdxVar,
183218
})
184219

220+
# Total Time: Input + Kernel + Output
221+
# Overhead: (Input + Output) / Total
222+
if metaInfo.kernelLevelTiling:
223+
executionBlock.addRight(
224+
cls._printCycleContribution, {
225+
"prefixStr": f"{nodeName}_prefix",
226+
"measurementInput": f"{nodeName}_ingress_dma_wait_measurement",
227+
"measurementKernel": f"{nodeName}_kernel_measurement",
228+
"measurementOutput": f"{nodeName}_egress_dma_wait_measurement",
229+
"profileIdxVar": profileIdxVar,
230+
})
231+
185232
executionBlock.addRight(cls._printLoopTeardown, {})
186233

187234
return executionBlock

0 commit comments

Comments
 (0)