@@ -225,8 +225,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
225225
226226 uint32_t bd_id = bd_op.getBdId ().value ();
227227 int64_t offset = bd_op.getOffsetInBytes ();
228- uint32_t len = bd_op.getLenInBytes ();
229- uint32_t len_addr_granularity = len * 8 / addr_granularity;
228+ uint64_t len = bd_op.getLenInBytes ();
229+ uint64_t len_addr_granularity = len * 8 / addr_granularity;
230230
231231 if (offset * 8 % addr_granularity != 0 ) {
232232 return bd_op->emitOpError (" Offset must be aligned to " )
@@ -253,7 +253,15 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
253253 llvm::SmallVector<int64_t , 4 >(4 , 0 );
254254 std::fill (padBefore.begin (), padBefore.end (), 0 );
255255 std::fill (padAfter.begin (), padAfter.end (), 0 );
256- int d2size = 0 ;
256+
257+ auto d0size = 0 ;
258+ auto d0stride = 0 ;
259+ auto d1size = 0 ;
260+ auto d1stride = 0 ;
261+ auto d2size = 0 ;
262+ auto d2stride = 0 ;
263+ auto iteration_size = 0 ;
264+ auto iteration_stride = 0 ;
257265
258266 if (dims && dims->size () > 0 ) {
259267 llvm::SmallVector<int64_t , 4 > input_sizes =
@@ -273,6 +281,12 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
273281 input_sizes[i] = (*dims)[j].getSize ();
274282 input_strides[i] = (*dims)[j].getStride ();
275283 }
284+
285+ // Do not check input_sizes[3] because a repeat can still be considered a
286+ // linear transfer
287+ bool isLinearTransfer = (input_sizes[0 ] >= 1 ) && (input_sizes[1 ] == 1 ) &&
288+ (input_sizes[2 ] == 1 );
289+
276290 if (dims->size () > 2 ) {
277291 d2size = (target_model.isMemTile (tile.getCol (), tile.getRow ()))
278292 ? (*dims)[2 ].getSize ()
@@ -302,16 +316,43 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
302316 }
303317 getHardwareStridesWraps (target_model, buffer_type, input_sizes,
304318 input_strides, sizes, strides);
319+
305320 if (failed (verifyStridesWraps (bd_op, buffer_type, tile.getCol (),
306321 tile.getRow (), input_sizes, input_strides,
307- sizes, strides))) {
322+ sizes, strides, isLinearTransfer ))) {
308323 return failure ();
309324 }
325+
326+ iteration_size = sizes[3 ];
327+ iteration_stride = strides[3 ];
328+
329+ if (!isLinearTransfer) {
330+ // d0_size, d0_stride
331+ d0size = sizes[0 ];
332+ d0stride = strides[0 ];
333+
334+ // d1_size, d1_stride
335+ d1size = sizes[1 ];
336+ d1stride = strides[1 ];
337+
338+ // d2_stride
339+ d2stride = strides[2 ];
340+ // d2_size set elsewhere
341+ }
342+ if (input_sizes[3 ] > 1 && input_strides[3 ] == 0 ) {
343+ // We allow users to encode the repeat_count as a dimension 3 stride
344+ // of 0. This must lower to an iteration wrap of 0, so no stride is
345+ // ever added. We then repeat the BD using the repeat_count in
346+ // NpuPushQueueOp.
347+ iteration_size = 0 ;
348+ iteration_stride = 0 ;
349+ }
350+
310351 // Ensure the total transfer length and the length expressed in the lowest
311352 // three dimensions of strides/wraps agree. (Fourth dimension is
312353 // iteration/repeat count and repeats the whole BD, so should not be
313354 // incorporated in length of a single BD invocation.)
314- uint32_t len_dims_addr_granularity = 1 ;
355+ uint64_t len_dims_addr_granularity = 1 ;
315356 for (size_t i = 0 ; i < 3 ; i++) {
316357 len_dims_addr_granularity *= sizes[i];
317358 }
@@ -352,11 +393,11 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
352393 bd_op.getLoc (), tile.getCol (), bd_id, len_addr_granularity, offset, 0 ,
353394 0 , 0 , 0 ,
354395 /* TODO: Strides/Wraps */
355- /* d0_size=*/ sizes[ 0 ] , /* d0_stride=*/ strides[ 0 ] ,
356- /* d1_size=*/ sizes[ 1 ] , /* d1_stride=*/ strides[ 1 ] ,
357- /* d2_size=*/ d2size, /* d2_stride=*/ strides[ 2 ] ,
358- /* iteration_current=*/ 0 , /* iteration_size=*/ sizes[ 3 ] ,
359- /* iteration_stride=*/ strides[ 3 ] ,
396+ /* d0_size=*/ d0size , /* d0_stride=*/ d0stride ,
397+ /* d1_size=*/ d1size , /* d1_stride=*/ d1stride ,
398+ /* d2_size=*/ d2size, /* d2_stride=*/ d2stride ,
399+ /* iteration_current=*/ 0 , /* iteration_size=*/ iteration_size ,
400+ /* iteration_stride=*/ iteration_stride ,
360401 /* TODO: Next BD */
361402 /* next_bd=*/ next_bd_id,
362403 /* row=*/ tile.getRow (),
@@ -368,7 +409,6 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
368409 /* d1_zero_before=*/ padBefore[1 ], /* d2_zero_before=*/ padBefore[2 ],
369410 /* d0_zero_after=*/ padAfter[0 ], /* d1_zero_after=*/ padAfter[1 ],
370411 /* d2_zero_after=*/ padAfter[2 ]);
371-
372412 return setAddressForSingleBD (builder, bd_op, tile);
373413 }
374414
0 commit comments