@@ -73,7 +73,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 :
 
 // -----
 
-// CHECK: #[[$MMA:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 2], instrShape = [16, 8]}>
+// CHECK: #[[$MMA:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [16, 8]}>
 #blocked = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
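The updated CHECK line in this hunk expects a more balanced warp tiling: with 8 warps, the anticipated nvidia_mma layout moves from [4, 2] to [2, 4] warps over the (M, N) output tile. As a reading aid, here is a minimal C++ sketch of a balancing heuristic that would produce tilings of this flavor; the function name pickWarpsPerCTA and the doubling strategy are illustrative assumptions, not code taken from Triton's actual pass.

#include <array>
#include <cstdint>

// Hypothetical sketch: distribute `numWarps` (a power of two) over the
// (M, N) output tile, doubling the warp count along whichever dimension
// currently leaves the larger per-warp slice. This illustrates the idea
// behind the expected warpsPerCTA values; it is not Triton's source.
std::array<int64_t, 2> pickWarpsPerCTA(int64_t m, int64_t n, int numWarps) {
  std::array<int64_t, 2> warps = {1, 1};
  while (warps[0] * warps[1] < numWarps) {
    // Grow the dimension whose per-warp slice is still larger.
    if (m / warps[0] >= n / warps[1])
      warps[0] *= 2;
    else
      warps[1] *= 2;
  }
  return warps;
}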
@@ -93,7 +93,7 @@ module attributes {"triton_gpu.target" = "cuda:89", "triton_gpu.num-ctas" = 1 :
 
 // -----
 
-// CHECK-DAG: #[[MMA:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}>
+// CHECK-DAG: #[[MMA:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}>
 // CHECK-DAG: #[[MMA1:.+]] = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1, 1], instrShape = [1, 16, 8]}>
 
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 2, 16], warpsPerCTA = [1, 4, 1], order = [2, 1, 0]}>
@@ -148,7 +148,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 :
 // -----
 
 // Verify that we use mmav2 when the k dim is too small for mmav3.
-// CHECK: #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 4], instrShape = [16, 8]}>
+// CHECK: #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 8], instrShape = [16, 8]}>
 #blocked = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [32, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: small_k_size
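This hunk's comment captures the version-selection logic being tested: on a cuda:90 (Hopper) target, the pass still falls back to MMAv2 when the dot's K dimension is too small for the MMAv3 instruction shape. A minimal C++ sketch of such a guard follows; the threshold kMinKV3 = 16 (a plausible minimum K for 16-bit operands) and the function name pickMmaVersion are assumptions for illustration, not values read from the Triton source.

#include <cstdint>

// Hypothetical version-selection guard: prefer MMAv3 (Hopper wgmma) on
// sm_90+, but fall back to MMAv2 when K is below the minimum K that the
// v3 instruction shape requires. kMinKV3 is an assumed threshold.
constexpr int64_t kMinKV3 = 16;

int pickMmaVersion(int computeCapability, int64_t k) {
  if (computeCapability >= 90 && k >= kMinKV3)
    return 3; // Hopper wgmma path
  return 2;   // Ampere-style mma.sync, instrShape [16, 8] as checked above
}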