Implement PR feedback

Xeratec · Xeratec · commit 16821235c7e6 · 2025-10-30T17:22:32.000+01:00
diff --git a/Deeploy/Targets/PULPOpen/DMA/MchanDma.py b/Deeploy/Targets/PULPOpen/DMA/MchanDma.py
@@ -18,7 +18,7 @@ class MchanChannelFuture(Future):
     _allocTemplate = NodeTemplate("${name} = mchan_channel_alloc();")
 
     _waitTemplate = NodeTemplate("""
-if (${name} <= MCHAN_TRANSFER_ID_MAX) {
+if (${name} <= MCHAN_CHANNEL_ID_MAX) {
     mchan_channel_wait(${name});
     mchan_channel_free(${name});
 }
diff --git a/Deeploy/TilingExtension/AsyncDma.py b/Deeploy/TilingExtension/AsyncDma.py
@@ -51,9 +51,20 @@ def getFuture(self, tensorName: str, direction: DmaDirection) -> Future:
 
 class PerTensorWaitingStrategy(AsyncDmaWaitingStrategy):
 
+    def __init__(self, FutureCls: Type[Future]) -> None:
+        super().__init__(FutureCls)
+        # map (tensorName, direction) -> Future instance so the same Future
+        # object is returned for repeated requests for the same tensor/direction
+        self._futures: Dict[Tuple[str, DmaDirection], Future] = {}
+
     def getFuture(self, tensorName: str, direction: DmaDirection) -> Future:
-        _ = direction
-        return self.FutureCls(tensorName)
+        key = (tensorName, direction)
+        if key not in self._futures:
+            # include direction in the future name to avoid accidental name
+            # collisions between directions for the same tensor
+            future_name = f"{tensorName}_{direction}"
+            self._futures[key] = self.FutureCls(future_name)
+        return self._futures[key]
 
 
 class DirectionWaitingStrategy(AsyncDmaWaitingStrategy):
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
@@ -8,7 +8,7 @@
 from Deeploy.AbstractDataTypes import VoidType
 from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation, \
     VariableBuffer, _ReferenceBuffer
-from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, EmptyFuture, Future
+from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
@@ -140,18 +140,15 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
             nextLocalBufferReference = self._hoistReference(ctxt, f"{tensorName}_next", l1BuffersReferences[1])
 
             future = self.dma.getFuture(tensorName, "ExternalToLocal")
-            # Extract the future that is not already in the set of ingress futures
-            _future = set([future]) - ingressFutures
-            _future = _future.pop() if len(_future) > 0 else EmptyFuture("")
-            ingressFutures.add(future)
 
             # 2) Load initial input tiles
             anydimAdapter = AnydimAsyncDmaTransferAdapter(self.dma)
             initialDmaTransferCalls = anydimAdapter.transfer(ctxt, externalBufferRef, localBuffer, rectangles[0].dims,
                                                              stridesFromShape(externalBufferShape),
                                                              stridesFromShape(rectangles[0].dims), "ExternalToLocal",
-                                                             _future, math.prod(externalBufferShape))
-            setupStatements.append(_future.alloc())
+                                                             future, math.prod(externalBufferShape))
+            if future not in ingressFutures:
+                setupStatements.append(future.alloc())
             setupStatements.extend(initialDmaTransferCalls)
 
             # 4.1) Choose buffers for current tile (inputs and outputs)
@@ -161,7 +158,9 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
 
             # 4.2.1) Wait for current input tile
             ingressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Wait for current input tile"}))
-            ingressDMAStatements.append(_future.wait())
+
+            if future not in ingressFutures:
+                ingressDMAStatements.append(future.wait())
 
             # 4.2.2) if there is a next tile:
             ingressDMAStatements.append(
@@ -175,10 +174,15 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
 
             # 4.2.4) Start transfer for next input tile
             ingressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Transfer next input tile"}))
+
+            # Allocate the future for the next transfer
+            if future not in ingressFutures:
+                ingressDMAStatements.append(future.alloc())
+
             ingressDMAStatements.extend(
                 self._generateDmaTransferCalls(ctxt, tensorName, rectangles, "TILING_I+1", nextLocalBufferReference,
-                                               externalBufferRef, "ExternalToLocal", _future))
-            # 4.2.5) Update external reference for next tile
+                                               externalBufferRef, "ExternalToLocal", future))
+            # 4.2.5) Update external reference for next tile
             referenceUpdate = self._generateExternalReferenceUpdate(ctxt, tensorName, rectangles, "TILING_I+1",
                                                                     externalBufferRef)
             if referenceUpdate is not None:
@@ -195,6 +199,9 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
             # Close the "if there is a next tile" block
             ingressDMAStatements.append(CodeSnippet(self._moveTileInCheckCloseStatement, {}))
 
+            # Add future to the set to prevent double wait/allocation
+            ingressFutures.add(future)
+
         # 4.4) Output Data Transfers
         # -----------------------------------
         for tensorName, rectangles in dictOfArrays(tilingSchedule.outputLoadSchedule).items():
@@ -227,19 +234,21 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
 
             # 4.4.1) Wait for previous output tile
             future = self.dma.getFuture(tensorName, "LocalToExternal")
-            # Extract the future that is not already in the set of ingress futures
-            _future = set([future]) - egressFutures
-            _future = _future.pop() if len(_future) > 0 else EmptyFuture("")
-            egressFutures.add(future)
 
             egressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Wait for previous output tile"}))
-            egressDMAStatements.append(_future.wait())
+
+            if future not in egressFutures:
+                egressDMAStatements.append(future.wait())
 
             # 4.4.2) Start transfer for current output tile
             dmaTransferCalls = self._generateDmaTransferCalls(ctxt, tensorName, rectangles, "TILING_I", localBuffer,
-                                                              externalBufferRef, "LocalToExternal", _future)
+                                                              externalBufferRef, "LocalToExternal", future)
 
             egressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Transfer current output tile"}))
+            # Allocate the future for the next transfer
+            if future not in egressFutures:
+                egressDMAStatements.append(future.alloc())
+
             egressDMAStatements.extend(dmaTransferCalls)
 
             # 4.4.3) Update outut reference for next tile
@@ -248,6 +257,9 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
             if referenceUpdate is not None:
                 egressDMAStatements.append(referenceUpdate)
 
+            # Add future to the set to prevent double wait/allocation
+            egressFutures.add(future)
+
         # 4.2.
         openLoopStatements += self._switch(buffer_choices, "TILING_I")
 
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py
@@ -131,15 +131,13 @@ def _generateDmaTransferCalls(self, ctxt: NetworkContext, tensorName: str, trans
                                               math.prod(externalBuffer.shape,))
 
         # Add allocation snippets
-        initSnippets = [future.alloc()] + initSnippets
         templates = [snippet.template for snippet in initSnippets]
         opReprUpdates = [[] for _ in range(len(initSnippets))]
 
         for rect in transfers:
             snippets = anydimAdapter.transfer(ctxt, externalBuffer, localBuffer, rect.dims,
                                               stridesFromShape(externalBuffer.shape), stridesFromShape(rect.dims),
                                               direction, future, math.prod(externalBuffer.shape))
-            snippets = [future.alloc()] + snippets
             for i, snippet in enumerate(snippets):
                 opReprUpdates[i].append(snippet.operatorRepresentation)
 
diff --git a/TargetLibraries/PULPOpen/inc/mchan_v6.h b/TargetLibraries/PULPOpen/inc/mchan_v6.h
@@ -36,7 +36,7 @@
 #include "pmsis.h"
 
 #define MCHAN_TRANSFER_LEN_SIZE (16)
-#define MCHAN_TRANSFER_ID_MAX (15)
+#define MCHAN_CHANNEL_ID_MAX (15)
 
 #define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0))
 #define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0))
@@ -71,17 +71,17 @@ static void mchan_transfer_2d_ext_strided(uint32_t cmd, void *loc, void *ext,
 static uint32_t mchan_channel_alloc() { return *cmd_ptr; }
 
 static void mchan_channel_free(uint32_t channel_id) {
-  assert(channel_id <= MCHAN_TRANSFER_ID_MAX);
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
   *status_ptr = 1 << channel_id;
 }
 
 static uint32_t mchan_channel_is_busy(uint32_t channel_id) {
-  assert(channel_id <= MCHAN_TRANSFER_ID_MAX);
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
   return *status_ptr & (1 << channel_id);
 }
 
 static void mchan_channel_wait(uint32_t channel_id) {
-  assert(channel_id <= MCHAN_TRANSFER_ID_MAX);
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
 #if defined(MCHAN_EVENT)
   while (mchan_channel_is_busy(channel_id))
     eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT);
diff --git a/TargetLibraries/PULPOpen/inc/mchan_v7.h b/TargetLibraries/PULPOpen/inc/mchan_v7.h
@@ -36,7 +36,7 @@
 #include "pmsis.h"
 
 #define MCHAN_TRANSFER_LEN_SIZE (17)
-#define MCHAN_TRANSFER_ID_MAX (15)
+#define MCHAN_CHANNEL_ID_MAX (15)
 
 #define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0))
 #define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0))
@@ -97,17 +97,17 @@ static void mchan_transfer_2d_loc_strided_ext_strided(
 static uint32_t mchan_channel_alloc() { return *cmd_ptr; }
 
 static void mchan_channel_free(uint32_t channel_id) {
-  assert(channel_id <= MCHAN_TRANSFER_ID_MAX);
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
   *status_ptr = 1 << channel_id;
 }
 
 static uint32_t mchan_channel_is_busy(uint32_t channel_id) {
-  assert(channel_id <= MCHAN_TRANSFER_ID_MAX);
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
   return *status_ptr & (1 << channel_id);
 }
 
 static void mchan_channel_wait(uint32_t channel_id) {
-  assert(channel_id <= MCHAN_TRANSFER_ID_MAX);
+  assert(channel_id <= MCHAN_CHANNEL_ID_MAX);
 #if defined(MCHAN_EVENT)
   while (mchan_channel_is_busy(channel_id))
     eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT);

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ class MchanChannelFuture(Future):`
`18`	`18`	`_allocTemplate = NodeTemplate("${name} = mchan_channel_alloc();")`
`19`	`19`
`20`	`20`	`_waitTemplate = NodeTemplate("""`
`21`		`-if (${name} <= MCHAN_TRANSFER_ID_MAX) {`
	`21`	`+if (${name} <= MCHAN_CHANNEL_ID_MAX) {`
`22`	`22`	`mchan_channel_wait(${name});`
`23`	`23`	`mchan_channel_free(${name});`
`24`	`24`	`}`