@@ -156,7 +156,7 @@ static Value createAlloc(scf::ForOp &forOp, Operation *loadOp,
156156}
157157
158158void createAsyncCopy (scf::ForOp forOp, tt::LoadOp loadOp, Value alloc,
159- Value insertIdx, Value extractIdx,
159+ Value insertIdx, Value extractIdx, int contiguity,
160160 CoarseSchedule &schedule) {
161161 OpBuilderForStage builder (loadOp.getLoc (), forOp, schedule);
162162 Value zero = arith::ConstantIntOp::create (builder, forOp.getLoc (), 0 , 32 );
@@ -176,7 +176,7 @@ void createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc,
176176 Value view = createSingleBufferView (builder, alloc, insertIdx);
177177 Operation *copy = ttg::AsyncCopyGlobalToLocalOp::create (
178178 builder, src, view, mask, other, loadOp.getCache (), loadOp.getEvict (),
179- loadOp.getIsVolatile ());
179+ loadOp.getIsVolatile (), contiguity );
180180 Operation *commit =
181181 ttg::AsyncCommitGroupOp::create (builder, copy->getResult (0 ));
182182
@@ -274,6 +274,7 @@ void createTMAAsyncGather(scf::ForOp forOp, tt::DescriptorGatherOp gatherOp,
274274
275275struct AsyncLoad {
276276 int stageDiff;
277+ int contiguity = 1 ;
277278 Value alloc;
278279 Value barrier;
279280 Operation *waitOp;
@@ -459,6 +460,7 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
459460 }
460461 SharedEncodingTrait sharedEncoding;
461462 bool canUseAsyncCp = false ;
463+ int contiguity = 1 ;
462464 if (!isa<RankedTensorType>(op.getResultTypes ()[0 ])) {
463465 canUseAsyncCp = op.getResultTypes ()[0 ].getIntOrFloatBitWidth () >= 32 ;
464466 sharedEncoding = ttg::SwizzledSharedEncodingAttr::get (
@@ -478,6 +480,15 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
478480 cast<RankedTensorType>(op.getResultTypes ()[0 ]), sharedEncoding);
479481
480482 canUseAsyncCp &= copyVecBytes >= 4 ;
483+ if (canUseAsyncCp) {
484+ auto loadOp = cast<tt::LoadOp>(op);
485+ auto ptr = loadOp.getPtr ();
486+ unsigned vec = axisInfoAnalysis.getContiguity (ptr);
487+ if (auto mask = loadOp.getMask ())
488+ vec = std::min<unsigned >(vec,
489+ axisInfoAnalysis.getMaskAlignment (mask));
490+ contiguity = vec;
491+ }
481492 }
482493 if (canUseAsyncCp || isTMALoad (&op)) {
483494 if (loadRequiresAdditionalBuffer (&op)) {
@@ -486,6 +497,7 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
486497 }
487498 auto &asyncLoad = asyncLoads[&op];
488499 asyncLoad.stageDiff = stageDiff;
500+ asyncLoad.contiguity = contiguity;
489501 asyncLoad.sharedEncoding = sharedEncoding;
490502 } else if (stageDiff > 1 ) {
491503 // Distance-1 loads can in most cases be pipelined in registers without
@@ -589,7 +601,7 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
589601 auto [insertIdx, extractIdx, phase, _] = loadGroups[asyncLoad.stageDiff ];
590602 if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
591603 createAsyncCopy (forOp, loadOp, asyncLoad.alloc , insertIdx, extractIdx,
592- schedule);
604+ asyncLoad. contiguity , schedule);
593605 hasAsyncLoads = true ;
594606 } else if (auto loadOp = dyn_cast<tt::DescriptorLoadOp>(op)) {
595607 createTMAAsyncLoad (forOp, loadOp, asyncLoad.alloc , insertIdx, extractIdx,
0 commit comments