Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
## Unreleased (Planned Release Target: v0.2.1)

### List of Pull Requests
- Add tile transfer annotation [#127](https://github.com/pulp-platform/Deeploy/pull/127)
- Refactor Logging for Improved Debugging [#115](https://github.com/pulp-platform/Deeploy/pull/115)
- Add reuse-tool as an SPDX license header linter [#113](https://github.com/pulp-platform/Deeploy/pull/113)
- Bug fixes, API Cleanup and Reduce Compiler Warning on PULP [#112](https://github.com/pulp-platform/Deeploy/pull/112)
Expand Down Expand Up @@ -46,6 +47,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
- Buffer utilities: `checkNumLevels` validation and `sizeInBytes` method
- Per–memory-level usage tracking and worst-case reporting in `NetworkContext`
- Memory/I/O summaries and input/output logging in deployers
- Added transfer annotation of tiled execution blocks

### Changed
- Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`.
Expand Down Expand Up @@ -73,6 +75,8 @@ This file contains the changelog for the Deeploy project. The changelog is divid
- Changed types and added correct casts to fix many compiler warnings in the PULP target library
- Use [reuse-tool](https://github.com/fsfe/reuse-tool) in pre-commit, CI, and Makefile for SPDX license header linting
- Deployer workflow now uses `prepare(...)` instead of `generateFunction(...)`.
- Refactored `computeTileHyperRectangles`
- `wrapTilingSolution` now uses the transfer annotation

### Fixed
- Prevent node duplication for graphs generated via GraphSurgeon
Expand All @@ -83,6 +87,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
- Corrected method usage in `importDeeployState` to call `NetworkContext.importNetworkContext` instead of the incorrect method name
- Correctly return `signProp` from `setupDeployer` instead of hardcoding the value to `False` in `testMVP.py`
- Fixed `Unsqueeze` Op. when using ONNX opset 13 or higher (from attribute to input)
- Fixed compiler warning by casting the external pointer in L3Dma to uint32_t

### Removed
- Delete outdated and unused `.gitlab-ci.yml` file
Expand Down
1 change: 1 addition & 0 deletions Deeploy/DeeployTypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1458,6 +1458,7 @@ def __init__(self, operatorCodeSnippet: Optional[CodeSnippet] = None):
) #: Sequence[CodeSnippet]: ordered list of code snippets that need to be generated to implemented the associated operator

self.patternMemoryConstraint: Optional = None #: Optional[PatternMemoryConstraint]: Tiling information of the operator which is annotated in the midend
self.transfers: Optional = None #: Optional[Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]]: Tiling transfers

def addLeft(self, template: NodeTemplate, operatorRepresentation: OperatorRepresentation):
"""Adds a code snippet that is generated BEFORE any of the other code snippets in this ExecutionBlock
Expand Down
2 changes: 1 addition & 1 deletion Deeploy/Targets/PULPOpen/DMA/L3Dma.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class L3Dma(AsyncDma):
_transferTemplates = {
2:
NodeTemplate(
"pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
"pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t)${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
)
}
_waitingStrategy = PerTensorWaitingStrategy(L3DmaFuture)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import TilingHoistingMixIn
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PrototypeTilingMixIn
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme, \
calculateFlatOffset, minimizeRectangle, minimizeVariableReplacement, padOffset, padShape, stridesFromShape

Expand Down Expand Up @@ -241,8 +242,13 @@ def apply(self,
assert isinstance(buffer, VariableBuffer)
unraveledOpRepr[key] = ctxt.unravelReference(buffer).name

variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution(
nodeMemoryConstraint, self.localMemory, ctxt, unraveledOpRepr)
tileConstraint: TileConstraint = template.tileConstraint
transfers = {
tensorName: memTransfers[self.localMemory]
for tensorName, memTransfers in baseExecutionBlock.transfers.items()
}
variableReplacement, tilingSchedules = tileConstraint.wrapTilingSolution(nodeMemoryConstraint, self.localMemory,
ctxt, unraveledOpRepr, transfers)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Guard transfers before use; helpful error if level missing.

Avoids hard KeyErrors when transfers aren’t annotated or the level is unavailable.

-        tileConstraint: TileConstraint = template.tileConstraint
-        transfers = {
-            tensorName: memTransfers[self.localMemory]
-            for tensorName, memTransfers in baseExecutionBlock.transfers.items()
-        }
+        tileConstraint: TileConstraint = template.tileConstraint
+        assert baseExecutionBlock.transfers is not None, \
+            "Missing transfers on execution block; ensure Tiler annotated transfers before codegen."
+        transfers = {}
+        for tensorName, memTransfers in baseExecutionBlock.transfers.items():
+            if self.localMemory not in memTransfers:
+                raise KeyError(f"No transfers for memory level '{self.localMemory}' on tensor '{tensorName}'.")
+            transfers[tensorName] = memTransfers[self.localMemory]
🤖 Prompt for AI Agents
In Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py
around lines 245 to 252, the comprehension unconditionally indexes
memTransfers[self.localMemory] which can raise KeyError when transfers aren’t
annotated or the requested memory level is missing; change this to guard access
(e.g., check self.localMemory in memTransfers or use
memTransfers.get(self.localMemory)) and build transfers only for tensors that
actually have that level, and if any expected tensor is missing the level raise
a clear error including the tensorName and the missing level (or optionally
log/collect missing entries and include them in the exception) before calling
tileConstraint.wrapTilingSolution so the failure is descriptive instead of a raw
KeyError.

minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
_ReferenceBuffer
from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import TilingHoistingMixIn
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilerExtension import Tiler
from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, minimizeVariableReplacement

Expand Down Expand Up @@ -133,8 +134,13 @@ def apply(self,
for key, value in operatorRepresentation.items()
}

variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution(
nodeMemoryConstraint, self.targetMemLevel, ctxt, unraveledOpRepr)
tileConstr: TileConstraint = template.tileConstraint
transfers = {
tensorName: memTransfers[self.targetMemLevel]
for tensorName, memTransfers in baseExecutionBlock.transfers.items()
}
variableReplacement, tilingSchedules = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel,
ctxt, unraveledOpRepr, transfers)

minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation)
operatorRepresentation.update(newOpRepr)
Expand Down Expand Up @@ -233,8 +239,13 @@ def apply(self,
for key, value in operatorRepresentation.items()
}

variableReplacement, _ = template.tileConstraint.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel,
ctxt, unraveledOpRepr)
tileConstr: TileConstraint = template.tileConstraint
transfers = {
tensorName: memTransfers[self.targetMemLevel]
for tensorName, memTransfers in baseExecutionBlock.transfers.items()
}
variableReplacement, _ = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, ctxt,
unraveledOpRepr, transfers)

minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation)
operatorRepresentation.update(newOpRepr)
Expand Down
83 changes: 8 additions & 75 deletions Deeploy/TilingExtension/TileConstraint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,15 @@
#
# SPDX-License-Identifier: Apache-2.0

import copy
from abc import abstractmethod
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
from ortools.constraint_solver.pywrapcp import IntVar

from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint, TensorMemoryConstraint
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TilerModel import TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, MemoryTransfer, \
TilingSchedule, VariableReplacementScheme, computeTileHyperRectangles
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme


class TileConstraint():
Expand Down Expand Up @@ -91,81 +88,17 @@ def sanitizeTilingSchedule(tilingSchedule: TilingSchedule) -> TilingSchedule:

@classmethod
def wrapTilingSolution(
cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]:

def getMemoryTransfer(tensorConstraint: TensorMemoryConstraint, sourceCube: HyperRectangle,
sourceMemoryLevel: str, targetMemoryLevel: str) -> MemoryTransfer:

size = np.prod(sourceCube.dims)
sourceConstraint = MemoryConstraint(sourceMemoryLevel, size)
sourceConstraint.shape = sourceCube.dims

destConstraint = copy.copy(tensorConstraint.memoryConstraints[targetMemoryLevel])

if any(dim1 > dim2 for dim1, dim2 in zip(destConstraint.shape, sourceConstraint.shape)):
destConstraint.shape = sourceConstraint.shape

return MemoryTransfer(sourceConstraint, destConstraint)

def _offsetAdd(offsetA: Tuple[int, ...], offsetB: Tuple[int, ...]) -> Tuple[int, ...]:
return tuple(dimA + dimB for dimA, dimB in zip(offsetA, offsetB))

def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List[AbsoluteHyperRectangle],
sourceMemoryLevel: str,
targetMemoryLevel: str) -> Tuple[List[AbsoluteHyperRectangle], List[int]]:
solution = []
solutionLengths = []

for sourceCube in sourceCubes:
memTransfer = getMemoryTransfer(tensorConstraint, sourceCube.rectangle, sourceMemoryLevel,
targetMemoryLevel)
solutionCubes = computeTileHyperRectangles(memTransfer)
solutionAbsoluteCubes = [
AbsoluteHyperRectangle(rectangle = cube,
absoluteOffset = _offsetAdd(sourceCube.absoluteOffset, cube.offset))
for cube in solutionCubes
]
solution += solutionAbsoluteCubes
solutionLengths.append(len(solutionAbsoluteCubes))

return solution, solutionLengths

cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation,
transfers: Dict[str,
List[List[AbsoluteHyperRectangle]]]) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]:
assert len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!"
Comment on lines 90 to 95
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Validate transfers structure early; clarify expected shape.

wrapTilingSolution indexes transfers[outVar], implying transfers must be var-keyed (not memory-level keyed). Add a guard and a clear error to prevent silent KeyErrors.

-        assert len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!"
-        outVar, _ = next(iter(tilingSolution.outputTensorMemoryConstraints.items()))
+        assert len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!"
+        outVar, _ = next(iter(tilingSolution.outputTensorMemoryConstraints.items()))
+        if outVar not in transfers:
+            raise KeyError(f"Missing transfers for output '{outVar}'. Expected var-keyed mapping.")

As per coding guidelines

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In Deeploy/TilingExtension/TileConstraint.py around lines 90 to 95,
wrapTilingSolution assumes transfers is keyed by output variable names and
shaped as Dict[str, List[List[AbsoluteHyperRectangle]]] which can cause a silent
KeyError or confusing failures; add an explicit guard that checks transfers is a
dict, that the expected output variable key (e.g., the single output variable
from tilingSolution.outputTensorMemoryConstraints) exists in transfers, and that
its value is a list of lists (and optionally non-empty) of
AbsoluteHyperRectangle; if any check fails, raise a clear ValueError or
AssertionError with a message describing the expected shape ("transfers must be
a dict keyed by output var name mapping to List[List[AbsoluteHyperRectangle]]")
and include the missing key name so callers get an immediate, informative error.


outVar, outTensorConstraint = next(iter(tilingSolution.outputTensorMemoryConstraints.items()))
memoryPath = list(outTensorConstraint.memoryConstraints.keys())

assert targetMemLevel in memoryPath, \
f"Target memory level {targetMemLevel} does not exist in the memory path {memoryPath}"

targetIdx = memoryPath.index(targetMemLevel)

if targetIdx == 0:
# SCHEREMO: Watch out - this happens if inputs are in L(N+1) but outputs only in L(N)
targetIdx = 1

fullShape = ctxt.lookup(outVar).shape
initialOffset = (0,) * len(fullShape)
outputCubes = [
AbsoluteHyperRectangle(rectangle = HyperRectangle(offset = initialOffset, dims = tuple(fullShape)),
absoluteOffset = initialOffset)
]

for source, target in zip(memoryPath[:targetIdx], memoryPath[1:targetIdx + 1]):
outputCubes, solutionLengths = getCubeTransfers(outTensorConstraint, outputCubes, source, target)

arrayOfCubes = []
_idx = 0
for idxLen in solutionLengths:
arrayOfCubes += [outputCubes[_idx:_idx + idxLen]]
_idx += idxLen
outVar, _ = next(iter(tilingSolution.outputTensorMemoryConstraints.items()))

varReplacements = []
tilingSchedules = []

for _outputCubes in arrayOfCubes:

for _outputCubes in transfers[outVar]:
varReplacement, tilingSchedule = cls.serializeTilingSolution(tilingSolution, _outputCubes, targetMemLevel,
ctxt, operatorRepresentation)
sanitizedTilingSchedule = cls.sanitizeTilingSchedule(tilingSchedule)
Expand Down
30 changes: 30 additions & 0 deletions Deeploy/TilingExtension/TilerExtension.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from Deeploy.TilingExtension.MemoryScheduler import MemoryBlock, MemoryScheduler
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilerModel import TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, computeTileHyperRectangles

TilingSolution = List[PatternMemoryConstraints]
MemoryMap = Dict[str, List[List[MemoryBlock]]]
Expand Down Expand Up @@ -940,6 +941,34 @@ def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]]
assert stepIdx in range(lifetime[0], lifetime[-1] +
1), f"Invalid memory map! Buffer {tensor.name} is not alive at step {stepIdx}!"

def getTransfers(self, tensorMc: TensorMemoryConstraint) -> Dict[str, List[List[AbsoluteHyperRectangle]]]:
    """Compute the per-memory-level tile transfer rectangles for one tensor.

    Walks the tensor's memory-constraint path pairwise (external level ->
    local level) and tiles each external rectangle into local-shaped tiles
    via ``computeTileHyperRectangles``, recording the result under the
    *local* memory level's name.

    Returns a dict mapping each local memory level to a list with one entry
    per external rectangle, each entry being the list of absolute tile
    rectangles produced for that external rectangle.
    """
    transfers: Dict[str, List[List[AbsoluteHyperRectangle]]] = {}
    # Iterate consecutive (external, local) level pairs along the memory path.
    # NOTE(review): assumes memoryConstraints preserves insertion order from
    # external-most to local-most level — confirm against how it is built.
    mcs = list(tensorMc.memoryConstraints.items())
    for (externalMemory, externalMc), (localMemory, localMc) in zip(mcs[:-1], mcs[1:]):
        # TODO: Should we also use externalMemory as a key in the transfers?
        if externalMemory not in transfers:
            # First hop: the whole external buffer is a single rectangle
            # of the external constraint's shape at absolute offset zero.
            assert externalMc.shape is not None
            shape = externalMc.shape
            zeroOffset = (0,) * len(shape)
            externalAbsoluteRectangles = [AbsoluteHyperRectangle(HyperRectangle(zeroOffset, shape), zeroOffset)]
        else:
            # Later hop: flatten all tiles produced for the previous level
            # into a single list of external rectangles to re-tile.
            externalAbsoluteRectangles = [rect for _list in transfers[externalMemory] for rect in _list]

        # Tile each external rectangle into local-shaped tiles. Each tile's
        # absolute offset is the enclosing external rectangle's absolute
        # offset plus the tile's offset within that rectangle.
        transfers[localMemory] = [[
            AbsoluteHyperRectangle(rect, tuple(a + b
                                               for a, b in zip(extAbsRect.absoluteOffset, rect.offset)))
            for rect in computeTileHyperRectangles(extAbsRect.rectangle.dims, localMc.shape)
        ]
                                  for extAbsRect in externalAbsoluteRectangles]
    return transfers

def getIoTransfers(self,
                   patternMc: PatternMemoryConstraints) -> Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]:
    """Collect the transfer rectangles for every tensor of a pattern's node.

    Only single-node (layer-wise) patterns are supported; the per-tensor
    transfers are computed by ``getTransfers`` and keyed by tensor name.
    """
    assert len(patternMc.nodeConstraints) == 1, "Only layerwise supported for now!"
    ioTransfers: Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]] = {}
    for tensorName, tensorMc in patternMc.nodeConstraints[0].tensorMemoryConstraints.items():
        ioTransfers[tensorName] = self.getTransfers(tensorMc)
    return ioTransfers

Comment on lines +966 to +971
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

Layer-wise assertion will break multi-node patterns at runtime.

tile() calls getIoTransfers() for every pattern, but getIoTransfers() asserts a single step. This will raise on schedules where a pattern has multiple nodes. Either support multi-step or avoid asserting here.

Proposed minimal fix: default to the last step (layer-wise remains unchanged), enabling non-layer-wise schedules.

-    def getIoTransfers(self,
-                       patternMc: PatternMemoryConstraints) -> Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]:
-        assert len(patternMc.nodeConstraints) == 1, "Only layerwise supported for now!"
-        tMcs = patternMc.nodeConstraints[0].tensorMemoryConstraints
-        return {name: self.getTransfers(mc) for name, mc in tMcs.items()}
+    def getIoTransfers(self,
+                       patternMc: PatternMemoryConstraints) -> Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]:
+        # Prefer layer-wise; if not, use the last step to represent the pattern's effective IO.
+        step_idx = -1
+        tMcs = patternMc.nodeConstraints[step_idx].tensorMemoryConstraints
+        return {name: self.getTransfers(mc) for name, mc in tMcs.items()}

Based on learnings

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def getIoTransfers(self,
patternMc: PatternMemoryConstraints) -> Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]:
assert len(patternMc.nodeConstraints) == 1, "Only layerwise supported for now!"
tMcs = patternMc.nodeConstraints[0].tensorMemoryConstraints
return {name: self.getTransfers(mc) for name, mc in tMcs.items()}
def getIoTransfers(self,
patternMc: PatternMemoryConstraints) -> Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]:
# Prefer layer-wise; if not, use the last step to represent the pattern's effective IO.
step_idx = -1
tMcs = patternMc.nodeConstraints[step_idx].tensorMemoryConstraints
return {name: self.getTransfers(mc) for name, mc in tMcs.items()}
🤖 Prompt for AI Agents
In Deeploy/TilingExtension/TilerExtension.py around lines 966 to 971, the code
currently asserts that patternMc.nodeConstraints has length 1 which will raise
for multi-node patterns; remove the assert and instead select the last node
constraint when multiple steps exist (e.g., node_mc =
patternMc.nodeConstraints[-1]) then use node_mc.tensorMemoryConstraints to build
the transfers so layer-wise behavior remains for single-step but multi-step
patterns default to the last step; ensure variable names and types match the
existing code and return the same dict shape.


class TilerDeployerWrapper(NetworkDeployerWrapper):

Expand Down Expand Up @@ -996,6 +1025,7 @@ def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optio
# SCHEREMO: Annotate execution block with solution
for layer, pattern in zip(self.layerBinding.values(), tilingSolution):
layer.mapper.binder.executionBlock.patternMemoryConstraint = pattern
layer.mapper.binder.executionBlock.transfers = self.tiler.getIoTransfers(pattern)

# SCHEREMO: Code generation STUB

Expand Down
33 changes: 10 additions & 23 deletions Deeploy/TilingExtension/TilingCodegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,6 @@

from Deeploy.AbstractDataTypes import Pointer
from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer
from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint


@dataclass
class MemoryTransfer():
source: MemoryConstraint
destination: MemoryConstraint


@dataclass
Expand Down Expand Up @@ -242,18 +235,12 @@ def calculateFlatOffsetInBytes(tile: HyperRectangle, referenceBuffer: VariableBu
(referenceBuffer._type.referencedType.typeWidth // 8))


def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRectangle]:
assert memoryTransfer.source.shape is not None, "Source transfer shape cannot be undefined!"
assert memoryTransfer.destination.shape is not None, "Destination transfer shape cannot be undefined!"

assert len(memoryTransfer.source.shape) == len(memoryTransfer.destination.shape), \
f"Source and target of memory transfer {memoryTransfer} don't have the same number of dimensions!"

largeShape = memoryTransfer.source.shape
smallShape = memoryTransfer.destination.shape
def computeTileHyperRectangles(externalShape: Tuple[int, ...], localShape: Tuple[int, ...]) -> List[HyperRectangle]:
assert len(externalShape) == len(localShape), \
f"External and local memory shapes don't have the same number of dimensions! External {externalShape} vs. Local {localShape}"

for dimIdx, (dimSizeSmall, dimSizeLarge) in enumerate(zip(smallShape, largeShape)):
assert dimSizeSmall <= dimSizeLarge, f"smallShape[{dimIdx}] should not be bigger then largeShape[{dimIdx}]. ({dimSizeSmall} > {dimSizeLarge})"
# LMACAN: The local shape dimensions are of the local buffer so if the external tile is smaller, that's fine
localShape = tuple(min(ext, loc) for ext, loc in zip(externalShape, localShape))

def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]:
tileCount = np.prod(tileIndexEnd)
Expand All @@ -270,18 +257,18 @@ def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]:
tileHyperRectangles = []

tileIndexEnd = [
int(np.ceil(dimSizeLarge / dimSizeSmall)) for dimSizeLarge, dimSizeSmall in zip(largeShape, smallShape)
int(np.ceil(dimSizeLarge / dimSizeSmall)) for dimSizeLarge, dimSizeSmall in zip(externalShape, localShape)
]
for tileIndex in nextTileIndex(tileIndexEnd):
tileOffset = tuple(dimIdx * dimSizeSmall for dimIdx, dimSizeSmall in zip(tileIndex, smallShape))
for dimIdx, (dimOffset, dimSizeLarge) in enumerate(zip(tileOffset, largeShape)):
tileOffset = tuple(dimIdx * dimSizeSmall for dimIdx, dimSizeSmall in zip(tileIndex, localShape))
for dimIdx, (dimOffset, dimSizeLarge) in enumerate(zip(tileOffset, externalShape)):
assert dimOffset >= 0, f"tileOffset[{dimIdx}] shoud not be smaller then zero ({dimOffset} < 0)"
assert dimOffset < dimSizeLarge, f"tileOffset[{dimIdx}] should not be bigger or equal then largeShape[{dimIdx}] ({dimOffset} >= {dimSizeLarge})"

tileSize = tuple(
min(dimSizeSmall, dimSizeLarge - dimOffset)
for dimSizeSmall, dimSizeLarge, dimOffset in zip(smallShape, largeShape, tileOffset))
for dimIdx, (dimSize, dimSizeSmall) in enumerate(zip(tileSize, smallShape)):
for dimSizeSmall, dimSizeLarge, dimOffset in zip(localShape, externalShape, tileOffset))
for dimIdx, (dimSize, dimSizeSmall) in enumerate(zip(tileSize, localShape)):
assert dimSize > 0, f"tileOffset[{dimIdx}] shoud not be smaller or equal then zero ({dimSize} <= 0)"
assert dimSize <= dimSizeSmall, f"tileSize[{dimIdx}] should not be bigger then smallShape[{dimIdx}] ({dimSize} > {dimSizeSmall})"

Expand Down
Loading