@@ -393,10 +393,12 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
393
393
// CHECK: ttg.async_copy_global_to_local
394
394
// CHECK: ttg.async_commit_group
395
395
// CHECK: scf.for
396
- // CHECK: ttg.async_wait {{.*}} {num = 2 : i32}
396
+ // CHECK: ttg.async_wait {{.*}} {num = 1 : i32}
397
397
// CHECK: %[[NEXT_BUFFER_1:.*]] = tt.addptr %{{.*}}, {{.*}}
398
398
// CHECK: ttg.async_copy_global_to_local %[[NEXT_BUFFER_1]]
399
- // CHECK: %[[IND_BUFFER_0:.*]] = tt.load %{{.*}}, {{.*}}
399
+ // CHECK: ttg.async_wait {{.*}} {num = 1 : i32}
400
+ // CHECK: %[[IND_BUFFER_0_T:.*]] = ttg.local_load
401
+ // CHECK: %[[IND_BUFFER_0:.*]] = tt.unsplat %[[IND_BUFFER_0_T]] : tensor<1xi64
400
402
// CHECK: %[[IND_BUFFER_1:.*]] = arith.muli {{.*}}, %[[IND_BUFFER_0]]
401
403
// CHECK: %[[IND_BUFFER_2:.*]] = tt.splat %[[IND_BUFFER_1]]
402
404
// CHECK: %[[NEXT_BUFFER_0:.*]] = tt.addptr {{.*}}, %[[IND_BUFFER_2]]
@@ -406,9 +408,9 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
406
408
// AMD: %[[LOCAL_ALLOC_0:.*]] = ttg.local_alloc
407
409
// AMD: %[[LOCAL_ALLOC_1:.*]] = ttg.local_alloc
408
410
// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}}
411
+ // AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] {amd.pipeliner_part = "prologue"}
409
412
// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]]
410
413
// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] {amd.pipeliner_part = "prologue"}
411
- // AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]]
412
414
// AMD: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]]
413
415
// AMD: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]]
414
416
// AMD: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]]
@@ -418,29 +420,14 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
418
420
// AMD: ttg.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_11]]
419
421
// AMD: %[[MEMDESC_SUBVIEW_12:.*]] = ttg.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}]
420
422
// AMD: ttg.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_12]]
421
- // AMD: %[[CMPI_13:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}}
422
- // AMD: %[[ADDPTR_14:.*]] = tt.addptr %{{.*}}, %{{.*}}
423
- // AMD: %[[ADDPTR_15:.*]] = tt.addptr %{{.*}}, %{{.*}}
424
- // AMD: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]]
425
- // AMD: %[[LOAD_17:.*]] = tt.load %[[ADDPTR_14]], %[[SPLAT_16]] {amd.pipeliner_part = "prologue"}
426
- // AMD: %[[LOAD_18:.*]] = tt.load %[[ADDPTR_15]], %[[CMPI_13]]
427
- // AMD: %[[MULI_19:.*]] = arith.muli %{{.*}}, %[[LOAD_18]]
428
- // AMD: %[[SPLAT_20:.*]] = tt.splat %[[MULI_19]]
429
- // AMD: %[[ADDPTR_21:.*]] = tt.addptr %{{.*}}, %[[SPLAT_20]]
430
- // AMD: %[[SPLAT_22:.*]] = tt.splat %[[CMPI_13]]
431
- // AMD: %[[LOAD_23:.*]] = tt.load %[[ADDPTR_21]], %[[SPLAT_22]] {amd.pipeliner_part = "prologue"}
432
- // AMD: %[[MEMDESC_SUBVIEW_24:.*]] = ttg.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}]
433
- // AMD: ttg.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_24]]
434
- // AMD: %[[MEMDESC_SUBVIEW_25:.*]] = ttg.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}]
435
- // AMD: ttg.local_store %[[LOAD_23]], %[[MEMDESC_SUBVIEW_25]]
436
423
// AMD: %[[SUBI_26:.*]] = arith.subi %{{.*}}, %{{.*}}
437
- // AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %[[SUBI_26]] step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_14]], %[[ARG9:.*]] = %[[ADDPTR_15]], %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_11]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_24]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_12]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_25]])
424
+ // AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %[[SUBI_26]] step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_11]], %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_12]])
438
425
// AMD: %[[ADDPTR_38:.*]] = tt.addptr %[[ARG8]], %{{.*}}
439
426
// AMD: %[[ADDPTR_39:.*]] = tt.addptr %[[ARG9]], %{{.*}}
440
427
// AMD: %[[LOAD_40:.*]] = tt.load %[[ADDPTR_38]]
441
428
// AMD: %[[LOCAL_LOAD_41:.*]] = ttg.local_load %[[ARG11]]
442
429
// AMD: %[[LOAD_42:.*]] = tt.load %[[ADDPTR_39]]
443
- // AMD: %[[MULI_43:.*]] = arith.muli %{{.*}}, %[[LOAD_42]]
430
+ // AMD: %[[MULI_43:.*]] = arith.muli %{{.*}}, %[[ARG12]]
444
431
// AMD: %[[SPLAT_44:.*]] = tt.splat %[[MULI_43]]
445
432
// AMD: %[[ADDPTR_45:.*]] = tt.addptr %{{.*}}, %[[SPLAT_44]]
446
433
// AMD: %[[LOAD_46:.*]] = tt.load %[[ADDPTR_45]]
@@ -453,7 +440,7 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
453
440
// AMD: ttg.local_store %[[LOAD_40]], %[[MEMDESC_SUBVIEW_52]]
454
441
// AMD: %[[MEMDESC_SUBVIEW_53:.*]] = ttg.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_51]], %{{.*}}, %{{.*}}]
455
442
// AMD: ttg.local_store %[[LOAD_46]], %[[MEMDESC_SUBVIEW_53]]
456
- // AMD: scf.yield %[[DOT_48]], %[[ADDPTR_38]], %[[ADDPTR_39]], %[[SELECT_51]], %[[ARG12]], %[[MEMDESC_SUBVIEW_52]], %[[ARG14]], %[[MEMDESC_SUBVIEW_53]]
443
+ // AMD: scf.yield %[[DOT_48]], %[[ADDPTR_38]], %[[ADDPTR_39]], %[[SELECT_51]], %[[MEMDESC_SUBVIEW_52]], %[[LOAD_42]], %[[MEMDESC_SUBVIEW_53]]
457
444
// AMD: } {tt.num_stages = 3
458
445
// AMD: %[[CMPI_28:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
459
446
// AMD: %[[CMPI_29:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
@@ -466,8 +453,8 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
466
453
// AMD: scf.yield %{{.*}}#0
467
454
// AMD: }
468
455
// AMD: %[[SELECT_33:.*]] = arith.select %[[CMPI_28]], %[[IF_32]], %{{.*}}#0
469
- // AMD: %[[LOCAL_LOAD_34:.*]] = ttg.local_load %{{.*}}#5
470
- // AMD: %[[LOCAL_LOAD_35:.*]] = ttg.local_load %{{.*}}#7
456
+ // AMD: %[[LOCAL_LOAD_34:.*]] = ttg.local_load %{{.*}}
457
+ // AMD: %[[LOCAL_LOAD_35:.*]] = ttg.local_load %{{.*}}
471
458
// AMD: %[[IF_36:.*]] = scf.if %[[CMPI_29]]
472
459
// AMD: %[[DOT_38:.*]] = tt.dot %[[LOCAL_LOAD_34]], %[[LOCAL_LOAD_35]], %[[SELECT_33]]
473
460
// AMD: scf.yield %[[DOT_38]]
@@ -477,34 +464,6 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
477
464
// AMD: %[[SELECT_37:.*]] = arith.select %[[CMPI_29]], %[[IF_36]], %[[SELECT_33]]
478
465
// AMD: ttg.local_dealloc %[[LOCAL_ALLOC_0]]
479
466
// AMD: ttg.local_dealloc %[[LOCAL_ALLOC_1]]
480
-
481
- // AMD_PREFETCH-LABEL: tt.func @indirect_bmm_scalar
482
- // AMD_PREFETCH: ttg.local_alloc
483
- // AMD_PREFETCH: ttg.local_alloc
484
- // AMD_PREFETCH: tt.load
485
- // AMD_PREFETCH: tt.load
486
- // AMD_PREFETCH: tt.load
487
- // AMD_PREFETCH: ttg.local_store
488
- // AMD_PREFETCH: ttg.local_store
489
- // AMD_PREFETCH: tt.load
490
- // AMD_PREFETCH: ttg.local_load
491
- // AMD_PREFETCH: tt.load
492
- // AMD_PREFETCH: tt.load
493
- // AMD_PREFETCH: ttg.local_load
494
- // AMD_PREFETCH: scf.for
495
- // AMD_PREFETCH: ttg.local_store
496
- // AMD_PREFETCH: ttg.local_store
497
- // AMD_PREFETCH: tt.dot
498
- // AMD_PREFETCH: tt.load
499
- // AMD_PREFETCH: ttg.local_load
500
- // AMD_PREFETCH: tt.load
501
- // AMD_PREFETCH: tt.load
502
- // AMD_PREFETCH: ttg.local_load
503
- // AMD_PREFETCH: scf.yield
504
- // AMD_PREFETCH: tt.dot
505
- // AMD_PREFETCH: tt.dot
506
- // AMD_PREFETCH: tt.return
507
-
508
467
tt.func @indirect_bmm_scalar (%77: i64 {tt.divisibility =16 : i32 },
509
468
%76: index ,
510
469
%49: tensor <16 x16 x!tt.ptr <f16 >, #AL > {tt.divisibility =16 : i32 , tt.contiguity =2 : i32 },
0 commit comments