88from Deeploy .AbstractDataTypes import VoidType
99from Deeploy .DeeployTypes import CodeSnippet , ExecutionBlock , NetworkContext , NodeTemplate , OperatorRepresentation , \
1010 VariableBuffer , _ReferenceBuffer
11- from Deeploy .TilingExtension .AsyncDma import AnydimAsyncDmaTransferAdapter , AsyncDma , EmptyFuture , Future
11+ from Deeploy .TilingExtension .AsyncDma import AnydimAsyncDmaTransferAdapter , AsyncDma , Future
1212from Deeploy .TilingExtension .CodeTransformationPasses .TilingCodeGeneration import TilingCodeGeneration
1313from Deeploy .TilingExtension .CodeTransformationPasses .TilingHoistingMixIn import dictOfArrays
1414from Deeploy .TilingExtension .CodeTransformationPasses .TilingPrototypes import ProfilingPrototypeMixIn , \
@@ -140,18 +140,15 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
140140 nextLocalBufferReference = self ._hoistReference (ctxt , f"{ tensorName } _next" , l1BuffersReferences [1 ])
141141
142142 future = self .dma .getFuture (tensorName , "ExternalToLocal" )
143- # Extract the future that is not already in the set of ingress futures
144- _future = set ([future ]) - ingressFutures
145- _future = _future .pop () if len (_future ) > 0 else EmptyFuture ("" )
146- ingressFutures .add (future )
147143
148144 # 2) Load initial input tiles
149145 anydimAdapter = AnydimAsyncDmaTransferAdapter (self .dma )
150146 initialDmaTransferCalls = anydimAdapter .transfer (ctxt , externalBufferRef , localBuffer , rectangles [0 ].dims ,
151147 stridesFromShape (externalBufferShape ),
152148 stridesFromShape (rectangles [0 ].dims ), "ExternalToLocal" ,
153- _future , math .prod (externalBufferShape ))
154- setupStatements .append (_future .alloc ())
149+ future , math .prod (externalBufferShape ))
150+ if future not in ingressFutures :
151+ setupStatements .append (future .alloc ())
155152 setupStatements .extend (initialDmaTransferCalls )
156153
157154 # 4.1) Choose buffers for current tile (inputs and outputs)
@@ -161,7 +158,9 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
161158
162159 # 4.2.1) Wait for current input tile
163160 ingressDMAStatements .append (CodeSnippet (self ._lineComment , {"comment" : "Wait for current input tile" }))
164- ingressDMAStatements .append (_future .wait ())
161+
162+ if future not in ingressFutures :
163+ ingressDMAStatements .append (future .wait ())
165164
166165 # 4.2.2) if there is a next tile:
167166 ingressDMAStatements .append (
@@ -175,10 +174,15 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
175174
176175 # 4.2.4) Start transfer for next input tile
177176 ingressDMAStatements .append (CodeSnippet (self ._lineComment , {"comment" : "Transfer next input tile" }))
177+
178+ # Allocate the future for the next transfer
179+ if future not in ingressFutures :
180+ ingressDMAStatements .append (future .alloc ())
181+
178182 ingressDMAStatements .extend (
179183 self ._generateDmaTransferCalls (ctxt , tensorName , rectangles , "TILING_I+1" , nextLocalBufferReference ,
180- externalBufferRef , "ExternalToLocal" , _future ))
181- # 4.2.5) Update external reference for next tile
184+ externalBufferRef , "ExternalToLocal" , future ))
185+ # 4.2.5) Update external reference for next tile
182186 referenceUpdate = self ._generateExternalReferenceUpdate (ctxt , tensorName , rectangles , "TILING_I+1" ,
183187 externalBufferRef )
184188 if referenceUpdate is not None :
@@ -195,6 +199,9 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
195199 # Close the "if there is a next tile" block
196200 ingressDMAStatements .append (CodeSnippet (self ._moveTileInCheckCloseStatement , {}))
197201
202+ # Add future to the set to prevent double wait/allocation
203+ ingressFutures .add (future )
204+
198205 # 4.4) Output Data Transfers
199206 # -----------------------------------
200207 for tensorName , rectangles in dictOfArrays (tilingSchedule .outputLoadSchedule ).items ():
@@ -227,19 +234,21 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
227234
228235 # 4.4.1) Wait for previous output tile
229236 future = self .dma .getFuture (tensorName , "LocalToExternal" )
230- # Extract the future that is not already in the set of ingress futures
231- _future = set ([future ]) - egressFutures
232- _future = _future .pop () if len (_future ) > 0 else EmptyFuture ("" )
233- egressFutures .add (future )
234237
235238 egressDMAStatements .append (CodeSnippet (self ._lineComment , {"comment" : "Wait for previous output tile" }))
236- egressDMAStatements .append (_future .wait ())
239+
240+ if future not in egressFutures :
241+ egressDMAStatements .append (future .wait ())
237242
238243 # 4.4.2) Start transfer for current output tile
239244 dmaTransferCalls = self ._generateDmaTransferCalls (ctxt , tensorName , rectangles , "TILING_I" , localBuffer ,
240- externalBufferRef , "LocalToExternal" , _future )
245+ externalBufferRef , "LocalToExternal" , future )
241246
242247 egressDMAStatements .append (CodeSnippet (self ._lineComment , {"comment" : "Transfer current output tile" }))
248+ # Allocate the future for the next transfer
249+ if future not in egressFutures :
250+ egressDMAStatements .append (future .alloc ())
251+
243252 egressDMAStatements .extend (dmaTransferCalls )
244253
245254 # 4.4.3) Update output reference for next tile
@@ -248,6 +257,9 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
248257 if referenceUpdate is not None :
249258 egressDMAStatements .append (referenceUpdate )
250259
260+ # Add future to the set to prevent double wait/allocation
261+ egressFutures .add (future )
262+
251263 # 4.2.
252264 openLoopStatements += self ._switch (buffer_choices , "TILING_I" )
253265
0 commit comments