diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index b32e533b64..aa1293bd49 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -102,6 +102,7 @@ add_llvm_executable(triton-tensor-layout triton-tensor-layout.cpp PARTIAL_SOURCE target_link_libraries(triton-tensor-layout PRIVATE TritonGPUIR TritonNvidiaGPUIR + TritonIntelGPUIR ${triton_libs} ${conversion_libs} ${dialect_libs} diff --git a/bin/triton-tensor-layout.cpp b/bin/triton-tensor-layout.cpp index 4087ac1350..b330cfb5aa 100644 --- a/bin/triton-tensor-layout.cpp +++ b/bin/triton-tensor-layout.cpp @@ -80,17 +80,9 @@ static cl::opt TensorStr( //===--------------------------------------------------------------------===// LogicalResult layoutPrint(RankedTensorType tensorType, raw_ostream &os) { - StringRef dialectName = tensorType.getEncoding().getDialect().getNamespace(); - // Dispatch to the corresponding dialect helper function to print the layout. - if (dialectName == "triton_gpu") { - os << triton::gpu::getLayoutStr(tensorType, UseHWPointOfView); - return success(); - } - - llvm::errs() << "Unsupported tensor layout attribute: " - << tensorType.getEncoding() << "\n"; - return failure(); + os << triton::gpu::getLayoutStr(tensorType, UseHWPointOfView); + return success(); } LogicalResult printLayoutFromFile(MLIRContext *context, StringRef filename, diff --git a/test/Conversion/intel/dot_layout_offset.mlir b/test/Conversion/intel/dot_layout_offset.mlir index 92129848d0..09615f4252 100644 --- a/test/Conversion/intel/dot_layout_offset.mlir +++ b/test/Conversion/intel/dot_layout_offset.mlir @@ -12,317 +12,307 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // CHECK: %[[THREAD_ID_I64:.*]] = llvm.call spir_funccc @_Z12get_local_idj // CHECK: %[[THREAD_ID_I32:.*]] = llvm.trunc %[[THREAD_ID_I64]] : i64 to i32 // CHECK: %[[VAL_145:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_145]] : i32 // CHECK: 
%[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_I32]], %[[VAL_145]] : i32 - // CHECK: %[[VAL_147:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_147]] : i32 + // CHECK-COUNT-3: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_149:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[WARP_ID_N:.*]] = llvm.urem %[[WARP_ID]], %[[VAL_149]] : i32 - // CHECK: %[[VAL_151:.*]] = llvm.udiv %[[WARP_ID]], %[[VAL_149]] : i32 + // CHECK: %[[VAL_150:.*]] = llvm.and %[[LANE_ID]], %[[VAL_149]] : i32 + // CHECK: %[[VAL_151:.*]] = llvm.icmp "eq" %[[VAL_150]], %[[CST_0]] : i32 // CHECK: %[[VAL_152:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[WARP_ID_M:.*]] = llvm.urem %[[VAL_151]], %[[VAL_152]] : i32 - // CHECK: %[[VAL_154:.*]] = llvm.udiv %[[VAL_151]], %[[VAL_152]] : i32 + // CHECK: %[[VAL_153:.*]] = llvm.select %[[VAL_151]], %[[CST_0]], %[[VAL_152]] : i1, i32 + // CHECK: %[[VAL_154:.*]] = llvm.xor %[[CST_0]], %[[VAL_153]] : i32 // CHECK: %[[VAL_155:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: %[[ROUNDED_WARP_ID_M:.*]] = llvm.urem %[[WARP_ID_M]], %[[VAL_155]] : i32 - // CHECK: %[[warpShape_M:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[warpOffset:.*]] = llvm.mul %[[ROUNDED_WARP_ID_M]], %[[warpShape_M]] : i32 - // CHECK: %[[VAL_159:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[laneRowIndex:.*]] = llvm.udiv %[[LANE_ID]], %[[VAL_159]] : i32 - // CHECK: %[[VAL_161:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_162:.*]] = llvm.urem %[[LANE_ID]], %[[VAL_161]] : i32 - // CHECK: %[[VAL_163:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[multiDimBase_N:.*]] = llvm.mul %[[VAL_162]], %[[VAL_163]] : i32 - // CHECK: %[[multiDimBase_M:.*]] = llvm.add %[[laneRowIndex]], %[[warpOffset]] : i32 - // CHECK: %[[VAL_166:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_167:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_168:.*]] = llvm.urem 
%[[VAL_166]], %[[VAL_167]] : i32 - // CHECK: %[[VAL_169:.*]] = llvm.udiv %[[VAL_166]], %[[VAL_167]] : i32 - // CHECK: %[[VAL_170:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_171:.*]] = llvm.urem %[[VAL_169]], %[[VAL_170]] : i32 - // CHECK: %[[VAL_172:.*]] = llvm.udiv %[[VAL_169]], %[[VAL_170]] : i32 - // CHECK: %[[VAL_173:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_174:.*]] = llvm.urem %[[VAL_171]], %[[VAL_173]] : i32 - // CHECK: %[[VAL_175:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_176:.*]] = llvm.urem %[[VAL_168]], %[[VAL_175]] : i32 - // CHECK: %[[VAL_177:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: %[[CTAOffset_M:.*]] = llvm.mul %[[VAL_174]], %[[VAL_177]] : i32 - // CHECK: %[[VAL_179:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: %[[CTAOffset_N:.*]] = llvm.mul %[[VAL_176]], %[[VAL_179]] : i32 - // CHECK: %[[VAL_181:.*]] = llvm.add %[[multiDimBase_M]], %[[CTAOffset_M]] : i32 - // CHECK: %[[VAL_182:.*]] = llvm.add %[[multiDimBase_N]], %[[CTAOffset_N]] : i32 + // CHECK: %[[VAL_156:.*]] = llvm.and %[[LANE_ID]], %[[VAL_155]] : i32 + // CHECK: %[[VAL_157:.*]] = llvm.icmp "eq" %[[VAL_156]], %[[CST_0]] : i32 + // CHECK: %[[VAL_158:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[VAL_159:.*]] = llvm.select %[[VAL_157]], %[[CST_0]], %[[VAL_158]] : i1, i32 + // CHECK: %[[VAL_160:.*]] = llvm.xor %[[VAL_154]], %[[VAL_159]] : i32 + // CHECK: %[[VAL_161:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: %[[VAL_162:.*]] = llvm.and %[[LANE_ID]], %[[VAL_161]] : i32 + // CHECK: %[[VAL_163:.*]] = llvm.icmp "eq" %[[VAL_162]], %[[CST_0]] : i32 + // CHECK: %[[VAL_164:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: %[[VAL_165:.*]] = llvm.select %[[VAL_163]], %[[CST_0]], %[[VAL_164]] : i1, i32 + // CHECK: %[[VAL_182:.*]] = llvm.xor %[[VAL_160]], %[[VAL_165]] : i32 + // CHECK: %[[VAL_167:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[VAL_168:.*]] = llvm.and %[[LANE_ID]], %[[VAL_167]] : i32 + // CHECK: 
%[[VAL_169:.*]] = llvm.icmp "eq" %[[VAL_168]], %[[CST_0]] : i32 + // CHECK: %[[VAL_170:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[VAL_171:.*]] = llvm.select %[[VAL_169]], %[[CST_0]], %[[VAL_170]] : i1, i32 + // CHECK: %[[VAL_181:.*]] = llvm.xor %[[VAL_182]], %[[VAL_171]] : i32 // COM: There are a total of [4, 2] repetitions of tensor shape [32, 32] per warp. // COM: The repetitions are clustered as [2, 1] for A operand. The repetition order is [0, 0], [1, 0], [0, 1], [1, 1], [2, 0], [3, 0], [2, 1], [3, 1] // COM: Offsets of rep [0, 0]. // CHECK: %[[VAL_183:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_184:.*]] = llvm.add %[[VAL_181]], %[[VAL_183]] : i32 + // CHECK: %[[VAL_184:.*]] = llvm.xor %[[CST_0]], %[[VAL_183]] : i32 // CHECK: %[[VAL_185:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_186:.*]] = llvm.add %[[VAL_182]], %[[VAL_185]] : i32 + // CHECK: %[[VAL_186:.*]] = llvm.xor %[[VAL_181]], %[[VAL_185]] : i32 // CHECK: %[[VAL_187:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_188:.*]] = llvm.add %[[VAL_181]], %[[VAL_187]] : i32 + // CHECK: %[[VAL_188:.*]] = llvm.xor %[[CST_0]], %[[VAL_187]] : i32 // CHECK: %[[VAL_189:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_190:.*]] = llvm.add %[[VAL_182]], %[[VAL_189]] : i32 + // CHECK: %[[VAL_190:.*]] = llvm.xor %[[VAL_181]], %[[VAL_189]] : i32 // CHECK: %[[VAL_191:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: %[[VAL_192:.*]] = llvm.add %[[VAL_181]], %[[VAL_191]] : i32 + // CHECK: %[[VAL_192:.*]] = llvm.xor %[[CST_0]], %[[VAL_191]] : i32 // CHECK: %[[VAL_193:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_194:.*]] = llvm.add %[[VAL_182]], %[[VAL_193]] : i32 + // CHECK: %[[VAL_194:.*]] = llvm.xor %[[VAL_181]], %[[VAL_193]] : i32 // CHECK: %[[VAL_195:.*]] = llvm.mlir.constant(3 : i32) : i32 - // CHECK: %[[VAL_196:.*]] = llvm.add %[[VAL_181]], %[[VAL_195]] : i32 + // CHECK: %[[VAL_196:.*]] = llvm.xor %[[CST_0]], %[[VAL_195]] : i32 // CHECK: 
%[[VAL_197:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_198:.*]] = llvm.add %[[VAL_182]], %[[VAL_197]] : i32 + // CHECK: %[[VAL_198:.*]] = llvm.xor %[[VAL_181]], %[[VAL_197]] : i32 // CHECK: %[[VAL_199:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: %[[VAL_200:.*]] = llvm.add %[[VAL_181]], %[[VAL_199]] : i32 + // CHECK: %[[VAL_200:.*]] = llvm.xor %[[CST_0]], %[[VAL_199]] : i32 // CHECK: %[[VAL_201:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_202:.*]] = llvm.add %[[VAL_182]], %[[VAL_201]] : i32 + // CHECK: %[[VAL_202:.*]] = llvm.xor %[[VAL_181]], %[[VAL_201]] : i32 // CHECK: %[[VAL_203:.*]] = llvm.mlir.constant(5 : i32) : i32 - // CHECK: %[[VAL_204:.*]] = llvm.add %[[VAL_181]], %[[VAL_203]] : i32 + // CHECK: %[[VAL_204:.*]] = llvm.xor %[[CST_0]], %[[VAL_203]] : i32 // CHECK: %[[VAL_205:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_206:.*]] = llvm.add %[[VAL_182]], %[[VAL_205]] : i32 + // CHECK: %[[VAL_206:.*]] = llvm.xor %[[VAL_181]], %[[VAL_205]] : i32 // CHECK: %[[VAL_207:.*]] = llvm.mlir.constant(6 : i32) : i32 - // CHECK: %[[VAL_208:.*]] = llvm.add %[[VAL_181]], %[[VAL_207]] : i32 + // CHECK: %[[VAL_208:.*]] = llvm.xor %[[CST_0]], %[[VAL_207]] : i32 // CHECK: %[[VAL_209:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_210:.*]] = llvm.add %[[VAL_182]], %[[VAL_209]] : i32 + // CHECK: %[[VAL_210:.*]] = llvm.xor %[[VAL_181]], %[[VAL_209]] : i32 // CHECK: %[[VAL_211:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_212:.*]] = llvm.add %[[VAL_181]], %[[VAL_211]] : i32 + // CHECK: %[[VAL_212:.*]] = llvm.xor %[[CST_0]], %[[VAL_211]] : i32 // CHECK: %[[VAL_213:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_214:.*]] = llvm.add %[[VAL_182]], %[[VAL_213]] : i32 + // CHECK: %[[VAL_214:.*]] = llvm.xor %[[VAL_181]], %[[VAL_213]] : i32 // COM: Offsets of rep [1, 0]. 
// CHECK: %[[VAL_215:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VAL_216:.*]] = llvm.add %[[VAL_181]], %[[VAL_215]] : i32 + // CHECK: %[[VAL_216:.*]] = llvm.xor %[[CST_0]], %[[VAL_215]] : i32 // CHECK: %[[VAL_217:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_218:.*]] = llvm.add %[[VAL_182]], %[[VAL_217]] : i32 + // CHECK: %[[VAL_218:.*]] = llvm.xor %[[VAL_181]], %[[VAL_217]] : i32 // CHECK: %[[VAL_219:.*]] = llvm.mlir.constant(9 : i32) : i32 - // CHECK: %[[VAL_220:.*]] = llvm.add %[[VAL_181]], %[[VAL_219]] : i32 + // CHECK: %[[VAL_220:.*]] = llvm.xor %[[CST_0]], %[[VAL_219]] : i32 // CHECK: %[[VAL_221:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_222:.*]] = llvm.add %[[VAL_182]], %[[VAL_221]] : i32 + // CHECK: %[[VAL_222:.*]] = llvm.xor %[[VAL_181]], %[[VAL_221]] : i32 // CHECK: %[[VAL_223:.*]] = llvm.mlir.constant(10 : i32) : i32 - // CHECK: %[[VAL_224:.*]] = llvm.add %[[VAL_181]], %[[VAL_223]] : i32 + // CHECK: %[[VAL_224:.*]] = llvm.xor %[[CST_0]], %[[VAL_223]] : i32 // CHECK: %[[VAL_225:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_226:.*]] = llvm.add %[[VAL_182]], %[[VAL_225]] : i32 + // CHECK: %[[VAL_226:.*]] = llvm.xor %[[VAL_181]], %[[VAL_225]] : i32 // CHECK: %[[VAL_227:.*]] = llvm.mlir.constant(11 : i32) : i32 - // CHECK: %[[VAL_228:.*]] = llvm.add %[[VAL_181]], %[[VAL_227]] : i32 + // CHECK: %[[VAL_228:.*]] = llvm.xor %[[CST_0]], %[[VAL_227]] : i32 // CHECK: %[[VAL_229:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_230:.*]] = llvm.add %[[VAL_182]], %[[VAL_229]] : i32 + // CHECK: %[[VAL_230:.*]] = llvm.xor %[[VAL_181]], %[[VAL_229]] : i32 // CHECK: %[[VAL_231:.*]] = llvm.mlir.constant(12 : i32) : i32 - // CHECK: %[[VAL_232:.*]] = llvm.add %[[VAL_181]], %[[VAL_231]] : i32 + // CHECK: %[[VAL_232:.*]] = llvm.xor %[[CST_0]], %[[VAL_231]] : i32 // CHECK: %[[VAL_233:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_234:.*]] = llvm.add %[[VAL_182]], %[[VAL_233]] : i32 + // CHECK: 
%[[VAL_234:.*]] = llvm.xor %[[VAL_181]], %[[VAL_233]] : i32 // CHECK: %[[VAL_235:.*]] = llvm.mlir.constant(13 : i32) : i32 - // CHECK: %[[VAL_236:.*]] = llvm.add %[[VAL_181]], %[[VAL_235]] : i32 + // CHECK: %[[VAL_236:.*]] = llvm.xor %[[CST_0]], %[[VAL_235]] : i32 // CHECK: %[[VAL_237:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_238:.*]] = llvm.add %[[VAL_182]], %[[VAL_237]] : i32 + // CHECK: %[[VAL_238:.*]] = llvm.xor %[[VAL_181]], %[[VAL_237]] : i32 // CHECK: %[[VAL_239:.*]] = llvm.mlir.constant(14 : i32) : i32 - // CHECK: %[[VAL_240:.*]] = llvm.add %[[VAL_181]], %[[VAL_239]] : i32 + // CHECK: %[[VAL_240:.*]] = llvm.xor %[[CST_0]], %[[VAL_239]] : i32 // CHECK: %[[VAL_241:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_242:.*]] = llvm.add %[[VAL_182]], %[[VAL_241]] : i32 + // CHECK: %[[VAL_242:.*]] = llvm.xor %[[VAL_181]], %[[VAL_241]] : i32 // CHECK: %[[VAL_243:.*]] = llvm.mlir.constant(15 : i32) : i32 - // CHECK: %[[VAL_244:.*]] = llvm.add %[[VAL_181]], %[[VAL_243]] : i32 + // CHECK: %[[VAL_244:.*]] = llvm.xor %[[CST_0]], %[[VAL_243]] : i32 // CHECK: %[[VAL_245:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_246:.*]] = llvm.add %[[VAL_182]], %[[VAL_245]] : i32 + // CHECK: %[[VAL_246:.*]] = llvm.xor %[[VAL_181]], %[[VAL_245]] : i32 // COM: Offsets of rep [0, 1]. 
// CHECK: %[[VAL_247:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_248:.*]] = llvm.add %[[VAL_181]], %[[VAL_247]] : i32 + // CHECK: %[[VAL_248:.*]] = llvm.xor %[[CST_0]], %[[VAL_247]] : i32 // CHECK: %[[VAL_249:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_250:.*]] = llvm.add %[[VAL_182]], %[[VAL_249]] : i32 + // CHECK: %[[VAL_250:.*]] = llvm.xor %[[VAL_181]], %[[VAL_249]] : i32 // CHECK: %[[VAL_251:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_252:.*]] = llvm.add %[[VAL_181]], %[[VAL_251]] : i32 + // CHECK: %[[VAL_252:.*]] = llvm.xor %[[CST_0]], %[[VAL_251]] : i32 // CHECK: %[[VAL_253:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_254:.*]] = llvm.add %[[VAL_182]], %[[VAL_253]] : i32 + // CHECK: %[[VAL_254:.*]] = llvm.xor %[[VAL_181]], %[[VAL_253]] : i32 // CHECK: %[[VAL_255:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: %[[VAL_256:.*]] = llvm.add %[[VAL_181]], %[[VAL_255]] : i32 + // CHECK: %[[VAL_256:.*]] = llvm.xor %[[CST_0]], %[[VAL_255]] : i32 // CHECK: %[[VAL_257:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_258:.*]] = llvm.add %[[VAL_182]], %[[VAL_257]] : i32 + // CHECK: %[[VAL_258:.*]] = llvm.xor %[[VAL_181]], %[[VAL_257]] : i32 // CHECK: %[[VAL_259:.*]] = llvm.mlir.constant(3 : i32) : i32 - // CHECK: %[[VAL_260:.*]] = llvm.add %[[VAL_181]], %[[VAL_259]] : i32 + // CHECK: %[[VAL_260:.*]] = llvm.xor %[[CST_0]], %[[VAL_259]] : i32 // CHECK: %[[VAL_261:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_262:.*]] = llvm.add %[[VAL_182]], %[[VAL_261]] : i32 + // CHECK: %[[VAL_262:.*]] = llvm.xor %[[VAL_181]], %[[VAL_261]] : i32 // CHECK: %[[VAL_263:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: %[[VAL_264:.*]] = llvm.add %[[VAL_181]], %[[VAL_263]] : i32 + // CHECK: %[[VAL_264:.*]] = llvm.xor %[[CST_0]], %[[VAL_263]] : i32 // CHECK: %[[VAL_265:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_266:.*]] = llvm.add %[[VAL_182]], %[[VAL_265]] : i32 + // CHECK: 
%[[VAL_266:.*]] = llvm.xor %[[VAL_181]], %[[VAL_265]] : i32 // CHECK: %[[VAL_267:.*]] = llvm.mlir.constant(5 : i32) : i32 - // CHECK: %[[VAL_268:.*]] = llvm.add %[[VAL_181]], %[[VAL_267]] : i32 + // CHECK: %[[VAL_268:.*]] = llvm.xor %[[CST_0]], %[[VAL_267]] : i32 // CHECK: %[[VAL_269:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_270:.*]] = llvm.add %[[VAL_182]], %[[VAL_269]] : i32 + // CHECK: %[[VAL_270:.*]] = llvm.xor %[[VAL_181]], %[[VAL_269]] : i32 // CHECK: %[[VAL_271:.*]] = llvm.mlir.constant(6 : i32) : i32 - // CHECK: %[[VAL_272:.*]] = llvm.add %[[VAL_181]], %[[VAL_271]] : i32 + // CHECK: %[[VAL_272:.*]] = llvm.xor %[[CST_0]], %[[VAL_271]] : i32 // CHECK: %[[VAL_273:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_274:.*]] = llvm.add %[[VAL_182]], %[[VAL_273]] : i32 + // CHECK: %[[VAL_274:.*]] = llvm.xor %[[VAL_181]], %[[VAL_273]] : i32 // CHECK: %[[VAL_275:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_276:.*]] = llvm.add %[[VAL_181]], %[[VAL_275]] : i32 + // CHECK: %[[VAL_276:.*]] = llvm.xor %[[CST_0]], %[[VAL_275]] : i32 // CHECK: %[[VAL_277:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_278:.*]] = llvm.add %[[VAL_182]], %[[VAL_277]] : i32 + // CHECK: %[[VAL_278:.*]] = llvm.xor %[[VAL_181]], %[[VAL_277]] : i32 // COM: Offsets of rep [1, 1]. 
// CHECK: %[[VAL_279:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VAL_280:.*]] = llvm.add %[[VAL_181]], %[[VAL_279]] : i32 + // CHECK: %[[VAL_280:.*]] = llvm.xor %[[CST_0]], %[[VAL_279]] : i32 // CHECK: %[[VAL_281:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_282:.*]] = llvm.add %[[VAL_182]], %[[VAL_281]] : i32 + // CHECK: %[[VAL_282:.*]] = llvm.xor %[[VAL_181]], %[[VAL_281]] : i32 // CHECK: %[[VAL_283:.*]] = llvm.mlir.constant(9 : i32) : i32 - // CHECK: %[[VAL_284:.*]] = llvm.add %[[VAL_181]], %[[VAL_283]] : i32 + // CHECK: %[[VAL_284:.*]] = llvm.xor %[[CST_0]], %[[VAL_283]] : i32 // CHECK: %[[VAL_285:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_286:.*]] = llvm.add %[[VAL_182]], %[[VAL_285]] : i32 + // CHECK: %[[VAL_286:.*]] = llvm.xor %[[VAL_181]], %[[VAL_285]] : i32 // CHECK: %[[VAL_287:.*]] = llvm.mlir.constant(10 : i32) : i32 - // CHECK: %[[VAL_288:.*]] = llvm.add %[[VAL_181]], %[[VAL_287]] : i32 + // CHECK: %[[VAL_288:.*]] = llvm.xor %[[CST_0]], %[[VAL_287]] : i32 // CHECK: %[[VAL_289:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_290:.*]] = llvm.add %[[VAL_182]], %[[VAL_289]] : i32 + // CHECK: %[[VAL_290:.*]] = llvm.xor %[[VAL_181]], %[[VAL_289]] : i32 // CHECK: %[[VAL_291:.*]] = llvm.mlir.constant(11 : i32) : i32 - // CHECK: %[[VAL_292:.*]] = llvm.add %[[VAL_181]], %[[VAL_291]] : i32 + // CHECK: %[[VAL_292:.*]] = llvm.xor %[[CST_0]], %[[VAL_291]] : i32 // CHECK: %[[VAL_293:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_294:.*]] = llvm.add %[[VAL_182]], %[[VAL_293]] : i32 + // CHECK: %[[VAL_294:.*]] = llvm.xor %[[VAL_181]], %[[VAL_293]] : i32 // CHECK: %[[VAL_295:.*]] = llvm.mlir.constant(12 : i32) : i32 - // CHECK: %[[VAL_296:.*]] = llvm.add %[[VAL_181]], %[[VAL_295]] : i32 + // CHECK: %[[VAL_296:.*]] = llvm.xor %[[CST_0]], %[[VAL_295]] : i32 // CHECK: %[[VAL_297:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_298:.*]] = llvm.add %[[VAL_182]], %[[VAL_297]] : i32 + // CHECK: 
%[[VAL_298:.*]] = llvm.xor %[[VAL_181]], %[[VAL_297]] : i32 // CHECK: %[[VAL_299:.*]] = llvm.mlir.constant(13 : i32) : i32 - // CHECK: %[[VAL_300:.*]] = llvm.add %[[VAL_181]], %[[VAL_299]] : i32 + // CHECK: %[[VAL_300:.*]] = llvm.xor %[[CST_0]], %[[VAL_299]] : i32 // CHECK: %[[VAL_301:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_302:.*]] = llvm.add %[[VAL_182]], %[[VAL_301]] : i32 + // CHECK: %[[VAL_302:.*]] = llvm.xor %[[VAL_181]], %[[VAL_301]] : i32 // CHECK: %[[VAL_303:.*]] = llvm.mlir.constant(14 : i32) : i32 - // CHECK: %[[VAL_304:.*]] = llvm.add %[[VAL_181]], %[[VAL_303]] : i32 + // CHECK: %[[VAL_304:.*]] = llvm.xor %[[CST_0]], %[[VAL_303]] : i32 // CHECK: %[[VAL_305:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_306:.*]] = llvm.add %[[VAL_182]], %[[VAL_305]] : i32 + // CHECK: %[[VAL_306:.*]] = llvm.xor %[[VAL_181]], %[[VAL_305]] : i32 // CHECK: %[[VAL_307:.*]] = llvm.mlir.constant(15 : i32) : i32 - // CHECK: %[[VAL_308:.*]] = llvm.add %[[VAL_181]], %[[VAL_307]] : i32 + // CHECK: %[[VAL_308:.*]] = llvm.xor %[[CST_0]], %[[VAL_307]] : i32 // CHECK: %[[VAL_309:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_310:.*]] = llvm.add %[[VAL_182]], %[[VAL_309]] : i32 + // CHECK: %[[VAL_310:.*]] = llvm.xor %[[VAL_181]], %[[VAL_309]] : i32 // COM: Offsets of rep [2, 0]. 
// CHECK: %[[VAL_311:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_312:.*]] = llvm.add %[[VAL_181]], %[[VAL_311]] : i32 + // CHECK: %[[VAL_312:.*]] = llvm.xor %[[CST_0]], %[[VAL_311]] : i32 // CHECK: %[[VAL_313:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_314:.*]] = llvm.add %[[VAL_182]], %[[VAL_313]] : i32 + // CHECK: %[[VAL_314:.*]] = llvm.xor %[[VAL_181]], %[[VAL_313]] : i32 // CHECK: %[[VAL_315:.*]] = llvm.mlir.constant(17 : i32) : i32 - // CHECK: %[[VAL_316:.*]] = llvm.add %[[VAL_181]], %[[VAL_315]] : i32 + // CHECK: %[[VAL_316:.*]] = llvm.xor %[[CST_0]], %[[VAL_315]] : i32 // CHECK: %[[VAL_317:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_318:.*]] = llvm.add %[[VAL_182]], %[[VAL_317]] : i32 + // CHECK: %[[VAL_318:.*]] = llvm.xor %[[VAL_181]], %[[VAL_317]] : i32 // CHECK: %[[VAL_319:.*]] = llvm.mlir.constant(18 : i32) : i32 - // CHECK: %[[VAL_320:.*]] = llvm.add %[[VAL_181]], %[[VAL_319]] : i32 + // CHECK: %[[VAL_320:.*]] = llvm.xor %[[CST_0]], %[[VAL_319]] : i32 // CHECK: %[[VAL_321:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_322:.*]] = llvm.add %[[VAL_182]], %[[VAL_321]] : i32 + // CHECK: %[[VAL_322:.*]] = llvm.xor %[[VAL_181]], %[[VAL_321]] : i32 // CHECK: %[[VAL_323:.*]] = llvm.mlir.constant(19 : i32) : i32 - // CHECK: %[[VAL_324:.*]] = llvm.add %[[VAL_181]], %[[VAL_323]] : i32 + // CHECK: %[[VAL_324:.*]] = llvm.xor %[[CST_0]], %[[VAL_323]] : i32 // CHECK: %[[VAL_325:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_326:.*]] = llvm.add %[[VAL_182]], %[[VAL_325]] : i32 + // CHECK: %[[VAL_326:.*]] = llvm.xor %[[VAL_181]], %[[VAL_325]] : i32 // CHECK: %[[VAL_327:.*]] = llvm.mlir.constant(20 : i32) : i32 - // CHECK: %[[VAL_328:.*]] = llvm.add %[[VAL_181]], %[[VAL_327]] : i32 + // CHECK: %[[VAL_328:.*]] = llvm.xor %[[CST_0]], %[[VAL_327]] : i32 // CHECK: %[[VAL_329:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_330:.*]] = llvm.add %[[VAL_182]], %[[VAL_329]] : i32 + // CHECK: 
%[[VAL_330:.*]] = llvm.xor %[[VAL_181]], %[[VAL_329]] : i32 // CHECK: %[[VAL_331:.*]] = llvm.mlir.constant(21 : i32) : i32 - // CHECK: %[[VAL_332:.*]] = llvm.add %[[VAL_181]], %[[VAL_331]] : i32 + // CHECK: %[[VAL_332:.*]] = llvm.xor %[[CST_0]], %[[VAL_331]] : i32 // CHECK: %[[VAL_333:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_334:.*]] = llvm.add %[[VAL_182]], %[[VAL_333]] : i32 + // CHECK: %[[VAL_334:.*]] = llvm.xor %[[VAL_181]], %[[VAL_333]] : i32 // CHECK: %[[VAL_335:.*]] = llvm.mlir.constant(22 : i32) : i32 - // CHECK: %[[VAL_336:.*]] = llvm.add %[[VAL_181]], %[[VAL_335]] : i32 + // CHECK: %[[VAL_336:.*]] = llvm.xor %[[CST_0]], %[[VAL_335]] : i32 // CHECK: %[[VAL_337:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_338:.*]] = llvm.add %[[VAL_182]], %[[VAL_337]] : i32 + // CHECK: %[[VAL_338:.*]] = llvm.xor %[[VAL_181]], %[[VAL_337]] : i32 // CHECK: %[[VAL_339:.*]] = llvm.mlir.constant(23 : i32) : i32 - // CHECK: %[[VAL_340:.*]] = llvm.add %[[VAL_181]], %[[VAL_339]] : i32 + // CHECK: %[[VAL_340:.*]] = llvm.xor %[[CST_0]], %[[VAL_339]] : i32 // CHECK: %[[VAL_341:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_342:.*]] = llvm.add %[[VAL_182]], %[[VAL_341]] : i32 + // CHECK: %[[VAL_342:.*]] = llvm.xor %[[VAL_181]], %[[VAL_341]] : i32 // COM: Offsets of rep [3, 0]. 
// CHECK: %[[VAL_343:.*]] = llvm.mlir.constant(24 : i32) : i32 - // CHECK: %[[VAL_344:.*]] = llvm.add %[[VAL_181]], %[[VAL_343]] : i32 + // CHECK: %[[VAL_344:.*]] = llvm.xor %[[CST_0]], %[[VAL_343]] : i32 // CHECK: %[[VAL_345:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_346:.*]] = llvm.add %[[VAL_182]], %[[VAL_345]] : i32 + // CHECK: %[[VAL_346:.*]] = llvm.xor %[[VAL_181]], %[[VAL_345]] : i32 // CHECK: %[[VAL_347:.*]] = llvm.mlir.constant(25 : i32) : i32 - // CHECK: %[[VAL_348:.*]] = llvm.add %[[VAL_181]], %[[VAL_347]] : i32 + // CHECK: %[[VAL_348:.*]] = llvm.xor %[[CST_0]], %[[VAL_347]] : i32 // CHECK: %[[VAL_349:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_350:.*]] = llvm.add %[[VAL_182]], %[[VAL_349]] : i32 + // CHECK: %[[VAL_350:.*]] = llvm.xor %[[VAL_181]], %[[VAL_349]] : i32 // CHECK: %[[VAL_351:.*]] = llvm.mlir.constant(26 : i32) : i32 - // CHECK: %[[VAL_352:.*]] = llvm.add %[[VAL_181]], %[[VAL_351]] : i32 + // CHECK: %[[VAL_352:.*]] = llvm.xor %[[CST_0]], %[[VAL_351]] : i32 // CHECK: %[[VAL_353:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_354:.*]] = llvm.add %[[VAL_182]], %[[VAL_353]] : i32 + // CHECK: %[[VAL_354:.*]] = llvm.xor %[[VAL_181]], %[[VAL_353]] : i32 // CHECK: %[[VAL_355:.*]] = llvm.mlir.constant(27 : i32) : i32 - // CHECK: %[[VAL_356:.*]] = llvm.add %[[VAL_181]], %[[VAL_355]] : i32 + // CHECK: %[[VAL_356:.*]] = llvm.xor %[[CST_0]], %[[VAL_355]] : i32 // CHECK: %[[VAL_357:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_358:.*]] = llvm.add %[[VAL_182]], %[[VAL_357]] : i32 + // CHECK: %[[VAL_358:.*]] = llvm.xor %[[VAL_181]], %[[VAL_357]] : i32 // CHECK: %[[VAL_359:.*]] = llvm.mlir.constant(28 : i32) : i32 - // CHECK: %[[VAL_360:.*]] = llvm.add %[[VAL_181]], %[[VAL_359]] : i32 + // CHECK: %[[VAL_360:.*]] = llvm.xor %[[CST_0]], %[[VAL_359]] : i32 // CHECK: %[[VAL_361:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_362:.*]] = llvm.add %[[VAL_182]], %[[VAL_361]] : i32 + // CHECK: 
%[[VAL_362:.*]] = llvm.xor %[[VAL_181]], %[[VAL_361]] : i32 // CHECK: %[[VAL_363:.*]] = llvm.mlir.constant(29 : i32) : i32 - // CHECK: %[[VAL_364:.*]] = llvm.add %[[VAL_181]], %[[VAL_363]] : i32 + // CHECK: %[[VAL_364:.*]] = llvm.xor %[[CST_0]], %[[VAL_363]] : i32 // CHECK: %[[VAL_365:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_366:.*]] = llvm.add %[[VAL_182]], %[[VAL_365]] : i32 + // CHECK: %[[VAL_366:.*]] = llvm.xor %[[VAL_181]], %[[VAL_365]] : i32 // CHECK: %[[VAL_367:.*]] = llvm.mlir.constant(30 : i32) : i32 - // CHECK: %[[VAL_368:.*]] = llvm.add %[[VAL_181]], %[[VAL_367]] : i32 + // CHECK: %[[VAL_368:.*]] = llvm.xor %[[CST_0]], %[[VAL_367]] : i32 // CHECK: %[[VAL_369:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_370:.*]] = llvm.add %[[VAL_182]], %[[VAL_369]] : i32 + // CHECK: %[[VAL_370:.*]] = llvm.xor %[[VAL_181]], %[[VAL_369]] : i32 // CHECK: %[[VAL_371:.*]] = llvm.mlir.constant(31 : i32) : i32 - // CHECK: %[[VAL_372:.*]] = llvm.add %[[VAL_181]], %[[VAL_371]] : i32 + // CHECK: %[[VAL_372:.*]] = llvm.xor %[[CST_0]], %[[VAL_371]] : i32 // CHECK: %[[VAL_373:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_374:.*]] = llvm.add %[[VAL_182]], %[[VAL_373]] : i32 + // CHECK: %[[VAL_374:.*]] = llvm.xor %[[VAL_181]], %[[VAL_373]] : i32 // COM: Offsets of rep [2, 1]. 
// CHECK: %[[VAL_375:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_376:.*]] = llvm.add %[[VAL_181]], %[[VAL_375]] : i32 + // CHECK: %[[VAL_376:.*]] = llvm.xor %[[CST_0]], %[[VAL_375]] : i32 // CHECK: %[[VAL_377:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_378:.*]] = llvm.add %[[VAL_182]], %[[VAL_377]] : i32 + // CHECK: %[[VAL_378:.*]] = llvm.xor %[[VAL_181]], %[[VAL_377]] : i32 // CHECK: %[[VAL_379:.*]] = llvm.mlir.constant(17 : i32) : i32 - // CHECK: %[[VAL_380:.*]] = llvm.add %[[VAL_181]], %[[VAL_379]] : i32 + // CHECK: %[[VAL_380:.*]] = llvm.xor %[[CST_0]], %[[VAL_379]] : i32 // CHECK: %[[VAL_381:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_382:.*]] = llvm.add %[[VAL_182]], %[[VAL_381]] : i32 + // CHECK: %[[VAL_382:.*]] = llvm.xor %[[VAL_181]], %[[VAL_381]] : i32 // CHECK: %[[VAL_383:.*]] = llvm.mlir.constant(18 : i32) : i32 - // CHECK: %[[VAL_384:.*]] = llvm.add %[[VAL_181]], %[[VAL_383]] : i32 + // CHECK: %[[VAL_384:.*]] = llvm.xor %[[CST_0]], %[[VAL_383]] : i32 // CHECK: %[[VAL_385:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_386:.*]] = llvm.add %[[VAL_182]], %[[VAL_385]] : i32 + // CHECK: %[[VAL_386:.*]] = llvm.xor %[[VAL_181]], %[[VAL_385]] : i32 // CHECK: %[[VAL_387:.*]] = llvm.mlir.constant(19 : i32) : i32 - // CHECK: %[[VAL_388:.*]] = llvm.add %[[VAL_181]], %[[VAL_387]] : i32 + // CHECK: %[[VAL_388:.*]] = llvm.xor %[[CST_0]], %[[VAL_387]] : i32 // CHECK: %[[VAL_389:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_390:.*]] = llvm.add %[[VAL_182]], %[[VAL_389]] : i32 + // CHECK: %[[VAL_390:.*]] = llvm.xor %[[VAL_181]], %[[VAL_389]] : i32 // CHECK: %[[VAL_391:.*]] = llvm.mlir.constant(20 : i32) : i32 - // CHECK: %[[VAL_392:.*]] = llvm.add %[[VAL_181]], %[[VAL_391]] : i32 + // CHECK: %[[VAL_392:.*]] = llvm.xor %[[CST_0]], %[[VAL_391]] : i32 // CHECK: %[[VAL_393:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_394:.*]] = llvm.add %[[VAL_182]], %[[VAL_393]] : i32 + // CHECK: 
%[[VAL_394:.*]] = llvm.xor %[[VAL_181]], %[[VAL_393]] : i32 // CHECK: %[[VAL_395:.*]] = llvm.mlir.constant(21 : i32) : i32 - // CHECK: %[[VAL_396:.*]] = llvm.add %[[VAL_181]], %[[VAL_395]] : i32 + // CHECK: %[[VAL_396:.*]] = llvm.xor %[[CST_0]], %[[VAL_395]] : i32 // CHECK: %[[VAL_397:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_398:.*]] = llvm.add %[[VAL_182]], %[[VAL_397]] : i32 + // CHECK: %[[VAL_398:.*]] = llvm.xor %[[VAL_181]], %[[VAL_397]] : i32 // CHECK: %[[VAL_399:.*]] = llvm.mlir.constant(22 : i32) : i32 - // CHECK: %[[VAL_400:.*]] = llvm.add %[[VAL_181]], %[[VAL_399]] : i32 + // CHECK: %[[VAL_400:.*]] = llvm.xor %[[CST_0]], %[[VAL_399]] : i32 // CHECK: %[[VAL_401:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_402:.*]] = llvm.add %[[VAL_182]], %[[VAL_401]] : i32 + // CHECK: %[[VAL_402:.*]] = llvm.xor %[[VAL_181]], %[[VAL_401]] : i32 // CHECK: %[[VAL_403:.*]] = llvm.mlir.constant(23 : i32) : i32 - // CHECK: %[[VAL_404:.*]] = llvm.add %[[VAL_181]], %[[VAL_403]] : i32 + // CHECK: %[[VAL_404:.*]] = llvm.xor %[[CST_0]], %[[VAL_403]] : i32 // CHECK: %[[VAL_405:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_406:.*]] = llvm.add %[[VAL_182]], %[[VAL_405]] : i32 + // CHECK: %[[VAL_406:.*]] = llvm.xor %[[VAL_181]], %[[VAL_405]] : i32 // COM: Offsets of rep [3, 1]. 
// CHECK: %[[VAL_407:.*]] = llvm.mlir.constant(24 : i32) : i32 - // CHECK: %[[VAL_408:.*]] = llvm.add %[[VAL_181]], %[[VAL_407]] : i32 + // CHECK: %[[VAL_408:.*]] = llvm.xor %[[CST_0]], %[[VAL_407]] : i32 // CHECK: %[[VAL_409:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_410:.*]] = llvm.add %[[VAL_182]], %[[VAL_409]] : i32 + // CHECK: %[[VAL_410:.*]] = llvm.xor %[[VAL_181]], %[[VAL_409]] : i32 // CHECK: %[[VAL_411:.*]] = llvm.mlir.constant(25 : i32) : i32 - // CHECK: %[[VAL_412:.*]] = llvm.add %[[VAL_181]], %[[VAL_411]] : i32 + // CHECK: %[[VAL_412:.*]] = llvm.xor %[[CST_0]], %[[VAL_411]] : i32 // CHECK: %[[VAL_413:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_414:.*]] = llvm.add %[[VAL_182]], %[[VAL_413]] : i32 + // CHECK: %[[VAL_414:.*]] = llvm.xor %[[VAL_181]], %[[VAL_413]] : i32 // CHECK: %[[VAL_415:.*]] = llvm.mlir.constant(26 : i32) : i32 - // CHECK: %[[VAL_416:.*]] = llvm.add %[[VAL_181]], %[[VAL_415]] : i32 + // CHECK: %[[VAL_416:.*]] = llvm.xor %[[CST_0]], %[[VAL_415]] : i32 // CHECK: %[[VAL_417:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_418:.*]] = llvm.add %[[VAL_182]], %[[VAL_417]] : i32 + // CHECK: %[[VAL_418:.*]] = llvm.xor %[[VAL_181]], %[[VAL_417]] : i32 // CHECK: %[[VAL_419:.*]] = llvm.mlir.constant(27 : i32) : i32 - // CHECK: %[[VAL_420:.*]] = llvm.add %[[VAL_181]], %[[VAL_419]] : i32 + // CHECK: %[[VAL_420:.*]] = llvm.xor %[[CST_0]], %[[VAL_419]] : i32 // CHECK: %[[VAL_421:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_422:.*]] = llvm.add %[[VAL_182]], %[[VAL_421]] : i32 + // CHECK: %[[VAL_422:.*]] = llvm.xor %[[VAL_181]], %[[VAL_421]] : i32 // CHECK: %[[VAL_423:.*]] = llvm.mlir.constant(28 : i32) : i32 - // CHECK: %[[VAL_424:.*]] = llvm.add %[[VAL_181]], %[[VAL_423]] : i32 + // CHECK: %[[VAL_424:.*]] = llvm.xor %[[CST_0]], %[[VAL_423]] : i32 // CHECK: %[[VAL_425:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_426:.*]] = llvm.add %[[VAL_182]], %[[VAL_425]] : i32 + // CHECK: 
%[[VAL_426:.*]] = llvm.xor %[[VAL_181]], %[[VAL_425]] : i32 // CHECK: %[[VAL_427:.*]] = llvm.mlir.constant(29 : i32) : i32 - // CHECK: %[[VAL_428:.*]] = llvm.add %[[VAL_181]], %[[VAL_427]] : i32 + // CHECK: %[[VAL_428:.*]] = llvm.xor %[[CST_0]], %[[VAL_427]] : i32 // CHECK: %[[VAL_429:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_430:.*]] = llvm.add %[[VAL_182]], %[[VAL_429]] : i32 + // CHECK: %[[VAL_430:.*]] = llvm.xor %[[VAL_181]], %[[VAL_429]] : i32 // CHECK: %[[VAL_431:.*]] = llvm.mlir.constant(30 : i32) : i32 - // CHECK: %[[VAL_432:.*]] = llvm.add %[[VAL_181]], %[[VAL_431]] : i32 + // CHECK: %[[VAL_432:.*]] = llvm.xor %[[CST_0]], %[[VAL_431]] : i32 // CHECK: %[[VAL_433:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_434:.*]] = llvm.add %[[VAL_182]], %[[VAL_433]] : i32 + // CHECK: %[[VAL_434:.*]] = llvm.xor %[[VAL_181]], %[[VAL_433]] : i32 // CHECK: %[[VAL_435:.*]] = llvm.mlir.constant(31 : i32) : i32 - // CHECK: %[[VAL_436:.*]] = llvm.add %[[VAL_181]], %[[VAL_435]] : i32 + // CHECK: %[[VAL_436:.*]] = llvm.xor %[[CST_0]], %[[VAL_435]] : i32 // CHECK: %[[VAL_437:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_438:.*]] = llvm.add %[[VAL_182]], %[[VAL_437]] : i32 + // CHECK: %[[VAL_438:.*]] = llvm.xor %[[VAL_181]], %[[VAL_437]] : i32 tt.print " x: " {hex = false, isSigned = array} : %cst : tensor<32x32xf16, #dot_operand_a> tt.return } diff --git a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir index 7bfff4fc36..48c9850418 100644 --- a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir +++ b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir @@ -6,46 +6,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { 
tt.func public @convert_dpas_to_dot_rep_cluster_1_2(%arg: tensor<1024x32xf16, #dpas>) { // COM: The repetitions order of dot layout and dpas layout are same when the GEMM tiling is clustered as repCluster [1, 2]. - // CHECK: %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16> - // CHECK: %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16> - // CHECK: %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16> - // CHECK: %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16> - // CHECK: %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16> - // CHECK: %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16> - // CHECK: %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : vector<8xf16> - // CHECK: %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16> - // CHECK: %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_217:.*]] = llvm.extractelement %[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16> - // CHECK: %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_233:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16> - // CHECK: %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_249:.*]] = llvm.extractelement 
%[[REP_2]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16> - // CHECK: %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_265:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16> - // CHECK: %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_281:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16> - // CHECK: %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_297:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16> - // CHECK: %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_313:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16> - // CHECK: %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16> - // CHECK: %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7] - // CHECK: %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15] - // CHECK: %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23] - // CHECK: %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31] - // CHECK: %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39] - // CHECK: %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47] - // CHECK: %[[VAL_386:.*]] = llvm.insertvalue %[[VAL_313]], {{.*}}[55] - // CHECK: %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63] + // CHECK-NOT: llvm.insertvalue + // CHECK-NOT: llvm.extractvalue %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>> tt.return } @@ -62,46 +24,135 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 // COM: - 0, 1, 2, 3, 4, 5, 6, 7. // COM: The repetitions order of dot layout when the GEMM tiling is clustered as repCluster [2, 2]: // COM: - 0, 2, 1, 3, 4, 6, 5, 7. 
- // CHECK: %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16> - // CHECK: %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16> - // CHECK: %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16> - // CHECK: %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16> - // CHECK: %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16> - // CHECK: %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16> - // CHECK: %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : vector<8xf16> - // CHECK: %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16> - // CHECK: %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_217:.*]] = llvm.extractelement %[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16> - // CHECK: %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_233:.*]] = llvm.extractelement %[[REP_2]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16> - // CHECK: %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_249:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16> - // CHECK: %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_265:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16> - 
// CHECK: %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_281:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16> - // CHECK: %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_297:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16> - // CHECK: %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_313:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16> - // CHECK: %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16> - // CHECK: %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7] - // CHECK: %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15] - // CHECK: %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23] - // CHECK: %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31] - // CHECK: %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39] - // CHECK: %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47] - // CHECK: %[[VAL_386:.*]] = llvm.insertvalue %[[VAL_313]], {{.*}}[55] - // CHECK: %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63] + // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] + // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][1] + // CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][2] + // CHECK: %[[VAL_4:.*]] = llvm.extractvalue %[[VAL_0]][3] + // CHECK: %[[VAL_5:.*]] = llvm.extractvalue %[[VAL_0]][4] + // CHECK: %[[VAL_6:.*]] = llvm.extractvalue %[[VAL_0]][5] + // CHECK: %[[VAL_7:.*]] = llvm.extractvalue %[[VAL_0]][6] + // CHECK: %[[VAL_8:.*]] = llvm.extractvalue %[[VAL_0]][7] + // CHECK: %[[VAL_9:.*]] = llvm.extractvalue %[[VAL_0]][8] + // CHECK: %[[VAL_10:.*]] = llvm.extractvalue %[[VAL_0]][9] + // CHECK: %[[VAL_11:.*]] = llvm.extractvalue %[[VAL_0]][10] + // CHECK: %[[VAL_12:.*]] = llvm.extractvalue %[[VAL_0]][11] + // CHECK: %[[VAL_13:.*]] = 
llvm.extractvalue %[[VAL_0]][12] + // CHECK: %[[VAL_14:.*]] = llvm.extractvalue %[[VAL_0]][13] + // CHECK: %[[VAL_15:.*]] = llvm.extractvalue %[[VAL_0]][14] + // CHECK: %[[VAL_16:.*]] = llvm.extractvalue %[[VAL_0]][15] + // CHECK: %[[VAL_17:.*]] = llvm.extractvalue %[[VAL_0]][16] + // CHECK: %[[VAL_18:.*]] = llvm.extractvalue %[[VAL_0]][17] + // CHECK: %[[VAL_19:.*]] = llvm.extractvalue %[[VAL_0]][18] + // CHECK: %[[VAL_20:.*]] = llvm.extractvalue %[[VAL_0]][19] + // CHECK: %[[VAL_21:.*]] = llvm.extractvalue %[[VAL_0]][20] + // CHECK: %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_0]][21] + // CHECK: %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_0]][22] + // CHECK: %[[VAL_24:.*]] = llvm.extractvalue %[[VAL_0]][23] + // CHECK: %[[VAL_25:.*]] = llvm.extractvalue %[[VAL_0]][24] + // CHECK: %[[VAL_26:.*]] = llvm.extractvalue %[[VAL_0]][25] + // CHECK: %[[VAL_27:.*]] = llvm.extractvalue %[[VAL_0]][26] + // CHECK: %[[VAL_28:.*]] = llvm.extractvalue %[[VAL_0]][27] + // CHECK: %[[VAL_29:.*]] = llvm.extractvalue %[[VAL_0]][28] + // CHECK: %[[VAL_30:.*]] = llvm.extractvalue %[[VAL_0]][29] + // CHECK: %[[VAL_31:.*]] = llvm.extractvalue %[[VAL_0]][30] + // CHECK: %[[VAL_32:.*]] = llvm.extractvalue %[[VAL_0]][31] + // CHECK: %[[VAL_33:.*]] = llvm.extractvalue %[[VAL_0]][32] + // CHECK: %[[VAL_34:.*]] = llvm.extractvalue %[[VAL_0]][33] + // CHECK: %[[VAL_35:.*]] = llvm.extractvalue %[[VAL_0]][34] + // CHECK: %[[VAL_36:.*]] = llvm.extractvalue %[[VAL_0]][35] + // CHECK: %[[VAL_37:.*]] = llvm.extractvalue %[[VAL_0]][36] + // CHECK: %[[VAL_38:.*]] = llvm.extractvalue %[[VAL_0]][37] + // CHECK: %[[VAL_39:.*]] = llvm.extractvalue %[[VAL_0]][38] + // CHECK: %[[VAL_40:.*]] = llvm.extractvalue %[[VAL_0]][39] + // CHECK: %[[VAL_41:.*]] = llvm.extractvalue %[[VAL_0]][40] + // CHECK: %[[VAL_42:.*]] = llvm.extractvalue %[[VAL_0]][41] + // CHECK: %[[VAL_43:.*]] = llvm.extractvalue %[[VAL_0]][42] + // CHECK: %[[VAL_44:.*]] = llvm.extractvalue %[[VAL_0]][43] + // CHECK: %[[VAL_45:.*]] = 
llvm.extractvalue %[[VAL_0]][44] + // CHECK: %[[VAL_46:.*]] = llvm.extractvalue %[[VAL_0]][45] + // CHECK: %[[VAL_47:.*]] = llvm.extractvalue %[[VAL_0]][46] + // CHECK: %[[VAL_48:.*]] = llvm.extractvalue %[[VAL_0]][47] + // CHECK: %[[VAL_49:.*]] = llvm.extractvalue %[[VAL_0]][48] + // CHECK: %[[VAL_50:.*]] = llvm.extractvalue %[[VAL_0]][49] + // CHECK: %[[VAL_51:.*]] = llvm.extractvalue %[[VAL_0]][50] + // CHECK: %[[VAL_52:.*]] = llvm.extractvalue %[[VAL_0]][51] + // CHECK: %[[VAL_53:.*]] = llvm.extractvalue %[[VAL_0]][52] + // CHECK: %[[VAL_54:.*]] = llvm.extractvalue %[[VAL_0]][53] + // CHECK: %[[VAL_55:.*]] = llvm.extractvalue %[[VAL_0]][54] + // CHECK: %[[VAL_56:.*]] = llvm.extractvalue %[[VAL_0]][55] + // CHECK: %[[VAL_57:.*]] = llvm.extractvalue %[[VAL_0]][56] + // CHECK: %[[VAL_58:.*]] = llvm.extractvalue %[[VAL_0]][57] + // CHECK: %[[VAL_59:.*]] = llvm.extractvalue %[[VAL_0]][58] + // CHECK: %[[VAL_60:.*]] = llvm.extractvalue %[[VAL_0]][59] + // CHECK: %[[VAL_61:.*]] = llvm.extractvalue %[[VAL_0]][60] + // CHECK: %[[VAL_62:.*]] = llvm.extractvalue %[[VAL_0]][61] + // CHECK: %[[VAL_63:.*]] = llvm.extractvalue %[[VAL_0]][62] + // CHECK: %[[VAL_64:.*]] = llvm.extractvalue %[[VAL_0]][63] + // CHECK: %[[VAL_65:.*]] = llvm.mlir.undef + // CHECK: %[[VAL_66:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_65]][0] + // CHECK: %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_66]][1] + // CHECK: %[[VAL_68:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_67]][2] + // CHECK: %[[VAL_69:.*]] = llvm.insertvalue %[[VAL_4]], %[[VAL_68]][3] + // CHECK: %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_5]], %[[VAL_69]][4] + // CHECK: %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_6]], %[[VAL_70]][5] + // CHECK: %[[VAL_72:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_71]][6] + // CHECK: %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_8]], %[[VAL_72]][7] + // CHECK: %[[VAL_74:.*]] = llvm.insertvalue %[[VAL_17]], %[[VAL_73]][8] + // CHECK: %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_74]][9] + // CHECK: 
%[[VAL_76:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_75]][10] + // CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_20]], %[[VAL_76]][11] + // CHECK: %[[VAL_78:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_77]][12] + // CHECK: %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_22]], %[[VAL_78]][13] + // CHECK: %[[VAL_80:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_79]][14] + // CHECK: %[[VAL_81:.*]] = llvm.insertvalue %[[VAL_24]], %[[VAL_80]][15] + // CHECK: %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_81]][16] + // CHECK: %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_10]], %[[VAL_82]][17] + // CHECK: %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_83]][18] + // CHECK: %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_12]], %[[VAL_84]][19] + // CHECK: %[[VAL_86:.*]] = llvm.insertvalue %[[VAL_13]], %[[VAL_85]][20] + // CHECK: %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_14]], %[[VAL_86]][21] + // CHECK: %[[VAL_88:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_87]][22] + // CHECK: %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_88]][23] + // CHECK: %[[VAL_90:.*]] = llvm.insertvalue %[[VAL_25]], %[[VAL_89]][24] + // CHECK: %[[VAL_91:.*]] = llvm.insertvalue %[[VAL_26]], %[[VAL_90]][25] + // CHECK: %[[VAL_92:.*]] = llvm.insertvalue %[[VAL_27]], %[[VAL_91]][26] + // CHECK: %[[VAL_93:.*]] = llvm.insertvalue %[[VAL_28]], %[[VAL_92]][27] + // CHECK: %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_93]][28] + // CHECK: %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_94]][29] + // CHECK: %[[VAL_96:.*]] = llvm.insertvalue %[[VAL_31]], %[[VAL_95]][30] + // CHECK: %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_32]], %[[VAL_96]][31] + // CHECK: %[[VAL_98:.*]] = llvm.insertvalue %[[VAL_33]], %[[VAL_97]][32] + // CHECK: %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_98]][33] + // CHECK: %[[VAL_100:.*]] = llvm.insertvalue %[[VAL_35]], %[[VAL_99]][34] + // CHECK: %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_100]][35] + // CHECK: %[[VAL_102:.*]] = llvm.insertvalue %[[VAL_37]], 
%[[VAL_101]][36] + // CHECK: %[[VAL_103:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_102]][37] + // CHECK: %[[VAL_104:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_103]][38] + // CHECK: %[[VAL_105:.*]] = llvm.insertvalue %[[VAL_40]], %[[VAL_104]][39] + // CHECK: %[[VAL_106:.*]] = llvm.insertvalue %[[VAL_49]], %[[VAL_105]][40] + // CHECK: %[[VAL_107:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_106]][41] + // CHECK: %[[VAL_108:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_107]][42] + // CHECK: %[[VAL_109:.*]] = llvm.insertvalue %[[VAL_52]], %[[VAL_108]][43] + // CHECK: %[[VAL_110:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_109]][44] + // CHECK: %[[VAL_111:.*]] = llvm.insertvalue %[[VAL_54]], %[[VAL_110]][45] + // CHECK: %[[VAL_112:.*]] = llvm.insertvalue %[[VAL_55]], %[[VAL_111]][46] + // CHECK: %[[VAL_113:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_112]][47] + // CHECK: %[[VAL_114:.*]] = llvm.insertvalue %[[VAL_41]], %[[VAL_113]][48] + // CHECK: %[[VAL_115:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_114]][49] + // CHECK: %[[VAL_116:.*]] = llvm.insertvalue %[[VAL_43]], %[[VAL_115]][50] + // CHECK: %[[VAL_117:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_116]][51] + // CHECK: %[[VAL_118:.*]] = llvm.insertvalue %[[VAL_45]], %[[VAL_117]][52] + // CHECK: %[[VAL_119:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_118]][53] + // CHECK: %[[VAL_120:.*]] = llvm.insertvalue %[[VAL_47]], %[[VAL_119]][54] + // CHECK: %[[VAL_121:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_120]][55] + // CHECK: %[[VAL_122:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_121]][56] + // CHECK: %[[VAL_123:.*]] = llvm.insertvalue %[[VAL_58]], %[[VAL_122]][57] + // CHECK: %[[VAL_124:.*]] = llvm.insertvalue %[[VAL_59]], %[[VAL_123]][58] + // CHECK: %[[VAL_125:.*]] = llvm.insertvalue %[[VAL_60]], %[[VAL_124]][59] + // CHECK: %[[VAL_126:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_125]][60] + // CHECK: %[[VAL_127:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_126]][61] + // CHECK: %[[VAL_128:.*]] = llvm.insertvalue 
%[[VAL_63]], %[[VAL_127]][62] + // CHECK: %[[VAL_129:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_128]][63] %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>> tt.return } @@ -118,46 +169,135 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 // COM: - 0, 1, 2, 3, 4, 5, 6, 7. // COM: The repetitions order of dot layout when the GEMM tiling is clustered as repCluster [4, 2]: // COM: - 0, 2, 4, 6, 1, 3, 5, 7. - // CHECK: %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16> - // CHECK: %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16> - // CHECK: %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16> - // CHECK: %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16> - // CHECK: %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16> - // CHECK: %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16> - // CHECK: %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : vector<8xf16> - // CHECK: %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16> - // CHECK: %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_217:.*]] = llvm.extractelement 
%[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16> - // CHECK: %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_233:.*]] = llvm.extractelement %[[REP_2]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16> - // CHECK: %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_249:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16> - // CHECK: %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_265:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16> - // CHECK: %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_281:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16> - // CHECK: %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_297:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16> - // CHECK: %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_313:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16> - // CHECK: %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16> - // CHECK: %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7] - // CHECK: %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15] - // CHECK: %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23] - // CHECK: %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31] - // CHECK: %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39] - // CHECK: %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47] - // CHECK: %[[VAL_386:.*]] = llvm.insertvalue %[[VAL_313]], {{.*}}[55] - // CHECK: %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63] + // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] + // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][1] + // CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][2] + // CHECK: %[[VAL_4:.*]] = 
llvm.extractvalue %[[VAL_0]][3] + // CHECK: %[[VAL_5:.*]] = llvm.extractvalue %[[VAL_0]][4] + // CHECK: %[[VAL_6:.*]] = llvm.extractvalue %[[VAL_0]][5] + // CHECK: %[[VAL_7:.*]] = llvm.extractvalue %[[VAL_0]][6] + // CHECK: %[[VAL_8:.*]] = llvm.extractvalue %[[VAL_0]][7] + // CHECK: %[[VAL_9:.*]] = llvm.extractvalue %[[VAL_0]][8] + // CHECK: %[[VAL_10:.*]] = llvm.extractvalue %[[VAL_0]][9] + // CHECK: %[[VAL_11:.*]] = llvm.extractvalue %[[VAL_0]][10] + // CHECK: %[[VAL_12:.*]] = llvm.extractvalue %[[VAL_0]][11] + // CHECK: %[[VAL_13:.*]] = llvm.extractvalue %[[VAL_0]][12] + // CHECK: %[[VAL_14:.*]] = llvm.extractvalue %[[VAL_0]][13] + // CHECK: %[[VAL_15:.*]] = llvm.extractvalue %[[VAL_0]][14] + // CHECK: %[[VAL_16:.*]] = llvm.extractvalue %[[VAL_0]][15] + // CHECK: %[[VAL_17:.*]] = llvm.extractvalue %[[VAL_0]][16] + // CHECK: %[[VAL_18:.*]] = llvm.extractvalue %[[VAL_0]][17] + // CHECK: %[[VAL_19:.*]] = llvm.extractvalue %[[VAL_0]][18] + // CHECK: %[[VAL_20:.*]] = llvm.extractvalue %[[VAL_0]][19] + // CHECK: %[[VAL_21:.*]] = llvm.extractvalue %[[VAL_0]][20] + // CHECK: %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_0]][21] + // CHECK: %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_0]][22] + // CHECK: %[[VAL_24:.*]] = llvm.extractvalue %[[VAL_0]][23] + // CHECK: %[[VAL_25:.*]] = llvm.extractvalue %[[VAL_0]][24] + // CHECK: %[[VAL_26:.*]] = llvm.extractvalue %[[VAL_0]][25] + // CHECK: %[[VAL_27:.*]] = llvm.extractvalue %[[VAL_0]][26] + // CHECK: %[[VAL_28:.*]] = llvm.extractvalue %[[VAL_0]][27] + // CHECK: %[[VAL_29:.*]] = llvm.extractvalue %[[VAL_0]][28] + // CHECK: %[[VAL_30:.*]] = llvm.extractvalue %[[VAL_0]][29] + // CHECK: %[[VAL_31:.*]] = llvm.extractvalue %[[VAL_0]][30] + // CHECK: %[[VAL_32:.*]] = llvm.extractvalue %[[VAL_0]][31] + // CHECK: %[[VAL_33:.*]] = llvm.extractvalue %[[VAL_0]][32] + // CHECK: %[[VAL_34:.*]] = llvm.extractvalue %[[VAL_0]][33] + // CHECK: %[[VAL_35:.*]] = llvm.extractvalue %[[VAL_0]][34] + // CHECK: %[[VAL_36:.*]] = llvm.extractvalue 
%[[VAL_0]][35] + // CHECK: %[[VAL_37:.*]] = llvm.extractvalue %[[VAL_0]][36] + // CHECK: %[[VAL_38:.*]] = llvm.extractvalue %[[VAL_0]][37] + // CHECK: %[[VAL_39:.*]] = llvm.extractvalue %[[VAL_0]][38] + // CHECK: %[[VAL_40:.*]] = llvm.extractvalue %[[VAL_0]][39] + // CHECK: %[[VAL_41:.*]] = llvm.extractvalue %[[VAL_0]][40] + // CHECK: %[[VAL_42:.*]] = llvm.extractvalue %[[VAL_0]][41] + // CHECK: %[[VAL_43:.*]] = llvm.extractvalue %[[VAL_0]][42] + // CHECK: %[[VAL_44:.*]] = llvm.extractvalue %[[VAL_0]][43] + // CHECK: %[[VAL_45:.*]] = llvm.extractvalue %[[VAL_0]][44] + // CHECK: %[[VAL_46:.*]] = llvm.extractvalue %[[VAL_0]][45] + // CHECK: %[[VAL_47:.*]] = llvm.extractvalue %[[VAL_0]][46] + // CHECK: %[[VAL_48:.*]] = llvm.extractvalue %[[VAL_0]][47] + // CHECK: %[[VAL_49:.*]] = llvm.extractvalue %[[VAL_0]][48] + // CHECK: %[[VAL_50:.*]] = llvm.extractvalue %[[VAL_0]][49] + // CHECK: %[[VAL_51:.*]] = llvm.extractvalue %[[VAL_0]][50] + // CHECK: %[[VAL_52:.*]] = llvm.extractvalue %[[VAL_0]][51] + // CHECK: %[[VAL_53:.*]] = llvm.extractvalue %[[VAL_0]][52] + // CHECK: %[[VAL_54:.*]] = llvm.extractvalue %[[VAL_0]][53] + // CHECK: %[[VAL_55:.*]] = llvm.extractvalue %[[VAL_0]][54] + // CHECK: %[[VAL_56:.*]] = llvm.extractvalue %[[VAL_0]][55] + // CHECK: %[[VAL_57:.*]] = llvm.extractvalue %[[VAL_0]][56] + // CHECK: %[[VAL_58:.*]] = llvm.extractvalue %[[VAL_0]][57] + // CHECK: %[[VAL_59:.*]] = llvm.extractvalue %[[VAL_0]][58] + // CHECK: %[[VAL_60:.*]] = llvm.extractvalue %[[VAL_0]][59] + // CHECK: %[[VAL_61:.*]] = llvm.extractvalue %[[VAL_0]][60] + // CHECK: %[[VAL_62:.*]] = llvm.extractvalue %[[VAL_0]][61] + // CHECK: %[[VAL_63:.*]] = llvm.extractvalue %[[VAL_0]][62] + // CHECK: %[[VAL_64:.*]] = llvm.extractvalue %[[VAL_0]][63] + // CHECK: %[[VAL_65:.*]] = llvm.mlir.undef + // CHECK: %[[VAL_66:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_65]][0] + // CHECK: %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_66]][1] + // CHECK: %[[VAL_68:.*]] = llvm.insertvalue %[[VAL_3]], 
%[[VAL_67]][2] + // CHECK: %[[VAL_69:.*]] = llvm.insertvalue %[[VAL_4]], %[[VAL_68]][3] + // CHECK: %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_5]], %[[VAL_69]][4] + // CHECK: %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_6]], %[[VAL_70]][5] + // CHECK: %[[VAL_72:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_71]][6] + // CHECK: %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_8]], %[[VAL_72]][7] + // CHECK: %[[VAL_74:.*]] = llvm.insertvalue %[[VAL_17]], %[[VAL_73]][8] + // CHECK: %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_74]][9] + // CHECK: %[[VAL_76:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_75]][10] + // CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_20]], %[[VAL_76]][11] + // CHECK: %[[VAL_78:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_77]][12] + // CHECK: %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_22]], %[[VAL_78]][13] + // CHECK: %[[VAL_80:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_79]][14] + // CHECK: %[[VAL_81:.*]] = llvm.insertvalue %[[VAL_24]], %[[VAL_80]][15] + // CHECK: %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_33]], %[[VAL_81]][16] + // CHECK: %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_82]][17] + // CHECK: %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_35]], %[[VAL_83]][18] + // CHECK: %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_84]][19] + // CHECK: %[[VAL_86:.*]] = llvm.insertvalue %[[VAL_37]], %[[VAL_85]][20] + // CHECK: %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_86]][21] + // CHECK: %[[VAL_88:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_87]][22] + // CHECK: %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_40]], %[[VAL_88]][23] + // CHECK: %[[VAL_90:.*]] = llvm.insertvalue %[[VAL_49]], %[[VAL_89]][24] + // CHECK: %[[VAL_91:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_90]][25] + // CHECK: %[[VAL_92:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_91]][26] + // CHECK: %[[VAL_93:.*]] = llvm.insertvalue %[[VAL_52]], %[[VAL_92]][27] + // CHECK: %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_93]][28] + // CHECK: %[[VAL_95:.*]] = llvm.insertvalue 
%[[VAL_54]], %[[VAL_94]][29] + // CHECK: %[[VAL_96:.*]] = llvm.insertvalue %[[VAL_55]], %[[VAL_95]][30] + // CHECK: %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_96]][31] + // CHECK: %[[VAL_98:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_97]][32] + // CHECK: %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_10]], %[[VAL_98]][33] + // CHECK: %[[VAL_100:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_99]][34] + // CHECK: %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_12]], %[[VAL_100]][35] + // CHECK: %[[VAL_102:.*]] = llvm.insertvalue %[[VAL_13]], %[[VAL_101]][36] + // CHECK: %[[VAL_103:.*]] = llvm.insertvalue %[[VAL_14]], %[[VAL_102]][37] + // CHECK: %[[VAL_104:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_103]][38] + // CHECK: %[[VAL_105:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_104]][39] + // CHECK: %[[VAL_106:.*]] = llvm.insertvalue %[[VAL_25]], %[[VAL_105]][40] + // CHECK: %[[VAL_107:.*]] = llvm.insertvalue %[[VAL_26]], %[[VAL_106]][41] + // CHECK: %[[VAL_108:.*]] = llvm.insertvalue %[[VAL_27]], %[[VAL_107]][42] + // CHECK: %[[VAL_109:.*]] = llvm.insertvalue %[[VAL_28]], %[[VAL_108]][43] + // CHECK: %[[VAL_110:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_109]][44] + // CHECK: %[[VAL_111:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_110]][45] + // CHECK: %[[VAL_112:.*]] = llvm.insertvalue %[[VAL_31]], %[[VAL_111]][46] + // CHECK: %[[VAL_113:.*]] = llvm.insertvalue %[[VAL_32]], %[[VAL_112]][47] + // CHECK: %[[VAL_114:.*]] = llvm.insertvalue %[[VAL_41]], %[[VAL_113]][48] + // CHECK: %[[VAL_115:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_114]][49] + // CHECK: %[[VAL_116:.*]] = llvm.insertvalue %[[VAL_43]], %[[VAL_115]][50] + // CHECK: %[[VAL_117:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_116]][51] + // CHECK: %[[VAL_118:.*]] = llvm.insertvalue %[[VAL_45]], %[[VAL_117]][52] + // CHECK: %[[VAL_119:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_118]][53] + // CHECK: %[[VAL_120:.*]] = llvm.insertvalue %[[VAL_47]], %[[VAL_119]][54] + // CHECK: %[[VAL_121:.*]] = llvm.insertvalue 
%[[VAL_48]], %[[VAL_120]][55] + // CHECK: %[[VAL_122:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_121]][56] + // CHECK: %[[VAL_123:.*]] = llvm.insertvalue %[[VAL_58]], %[[VAL_122]][57] + // CHECK: %[[VAL_124:.*]] = llvm.insertvalue %[[VAL_59]], %[[VAL_123]][58] + // CHECK: %[[VAL_125:.*]] = llvm.insertvalue %[[VAL_60]], %[[VAL_124]][59] + // CHECK: %[[VAL_126:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_125]][60] + // CHECK: %[[VAL_127:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_126]][61] + // CHECK: %[[VAL_128:.*]] = llvm.insertvalue %[[VAL_63]], %[[VAL_127]][62] + // CHECK: %[[VAL_129:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_128]][63] %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>> tt.return } diff --git a/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir b/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir index 2bb504d76f..1ecb0a5a2c 100644 --- a/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir +++ b/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir @@ -69,14 +69,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 // CHECK-DAG: %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 // CHECK-DAG: %[[CST_5:.*]] = llvm.mlir.constant(5 : i32) : i32 // CHECK-DAG: %[[CST_6:.*]] = llvm.mlir.constant(6 : i32) : i32 // CHECK-DAG: %[[CST_7:.*]] = llvm.mlir.constant(7 : i32) : i32 + // CHECK-DAG: %[[CST_8:.*]] = 
llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 // CHECK-DAG: %[[CST_17:.*]] = llvm.mlir.constant(17 : i32) : i32 // CHECK-DAG: %[[CST_18:.*]] = llvm.mlir.constant(18 : i32) : i32 // CHECK-DAG: %[[CST_19:.*]] = llvm.mlir.constant(19 : i32) : i32 @@ -86,43 +85,46 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-DAG: %[[CST_23:.*]] = llvm.mlir.constant(23 : i32) : i32 // CHECK: %[[THREAD_ID:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]]) // CHECK: %[[THREAD_ID_32:.*]] = llvm.trunc %[[THREAD_ID]] : i64 to i32 - // CHECK: %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_32]], %[[CST_16]] : i32 // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_32]], %[[CST_16]] : i32 - // CHECK: %[[VAL_29:.*]] = llvm.udiv %[[WARP_ID]], %[[CST_2]] : i32 - // CHECK: %[[WARP_ID_X:.*]] = llvm.urem %[[VAL_29]], %[[CST_2]] : i32 - // CHECK: %[[ROUNDED_WARP_ID_X:.*]] = llvm.urem %[[WARP_ID_X]], %[[CST_4]] : i32 - // CHECK: %[[WARP_OFFSET:.*]] = llvm.mul %[[ROUNDED_WARP_ID_X]], %[[CST_8]] : i32 - // CHECK: %[[LANE_ID_X:.*]] = llvm.udiv %[[LANE_ID]], %[[CST_16]] : i32 - // CHECK: %[[LANE_ID_Y:.*]] = llvm.urem %[[LANE_ID]], %[[CST_16]] : i32 - // CHECK: %[[OFFSET_Y:.*]] = llvm.mul %[[LANE_ID_Y]], %[[CST_2]] : i32 - // CHECK: %[[OFFSET_x:.*]] = llvm.add %[[LANE_ID_X]], %[[WARP_OFFSET]] : i32 - // CHECK: %[[VAL_37:.*]] = llvm.urem %[[CST_0]], %[[CST_1]] : i32 - // CHECK: %[[VAL_38:.*]] = llvm.udiv %[[CST_0]], %[[CST_1]] : i32 - // CHECK: %[[VAL_39:.*]] = llvm.urem %[[VAL_38]], %[[CST_1]] : i32 - // CHECK: %[[VAL_40:.*]] = llvm.urem %[[VAL_39]], %[[CST_1]] : i32 - // CHECK: %[[VAL_41:.*]] = llvm.urem %[[VAL_37]], %[[CST_1]] : i32 - // CHECK: %[[CTA_OFFSET_X:.*]] = llvm.mul %[[VAL_40]], %[[CST_32]] : i32 - // CHECK: %[[CTA_OFFSET_Y:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32 - // CHECK: %[[VAL_44:.*]] = llvm.add %[[OFFSET_x]], %[[CTA_OFFSET_X]] : i32 - // CHECK: %[[VAL_45:.*]] = llvm.add 
%[[OFFSET_Y]], %[[CTA_OFFSET_Y]] : i32 - // CHECK: %[[OFFSET_X_0:.*]] = llvm.add %[[VAL_44]], %[[CST_0]] : i32 - // CHECK: %[[OFFSET_Y_0:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 - // CHECK: %[[OFFSET_Y_1:.*]] = llvm.add %[[VAL_45]], %[[CST_1]] : i32 - // CHECK: %[[OFFSET_X_1:.*]] = llvm.add %[[VAL_44]], %[[CST_1]] : i32 - // CHECK: %[[OFFSET_X_2:.*]] = llvm.add %[[VAL_44]], %[[CST_2]] : i32 - // CHECK: %[[OFFSET_X_3:.*]] = llvm.add %[[VAL_44]], %[[CST_3]] : i32 - // CHECK: %[[OFFSET_X_4:.*]] = llvm.add %[[VAL_44]], %[[CST_4]] : i32 - // CHECK: %[[OFFSET_X_5:.*]] = llvm.add %[[VAL_44]], %[[CST_5]] : i32 - // CHECK: %[[OFFSET_X_6:.*]] = llvm.add %[[VAL_44]], %[[CST_6]] : i32 - // CHECK: %[[OFFSET_X_7:.*]] = llvm.add %[[VAL_44]], %[[CST_7]] : i32 - // CHECK: %[[OFFSET_X_8:.*]] = llvm.add %[[VAL_44]], %[[CST_16]] : i32 - // CHECK: %[[OFFSET_X_9:.*]] = llvm.add %[[VAL_44]], %[[CST_17]] : i32 - // CHECK: %[[OFFSET_X_10:.*]] = llvm.add %[[VAL_44]], %[[CST_18]] : i32 - // CHECK: %[[OFFSET_X_11:.*]] = llvm.add %[[VAL_44]], %[[CST_19]] : i32 - // CHECK: %[[OFFSET_X_12:.*]] = llvm.add %[[VAL_44]], %[[CST_20]] : i32 - // CHECK: %[[OFFSET_X_13:.*]] = llvm.add %[[VAL_44]], %[[CST_21]] : i32 - // CHECK: %[[OFFSET_X_14:.*]] = llvm.add %[[VAL_44]], %[[CST_22]] : i32 - // CHECK: %[[OFFSET_X_15:.*]] = llvm.add %[[VAL_44]], %[[CST_23]] : i32 + // CHECK: %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_32]], %[[CST_16]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.and %[[LANE_ID]], %[[CST_1]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.icmp "eq" %[[VAL_27]], %[[CST_0]] : i32 + // CHECK: %[[VAL_29:.*]] = llvm.select %[[VAL_28]], %[[CST_0]], %[[CST_2]] : i1, i32 + // CHECK: %[[VAL_30:.*]] = llvm.xor %[[CST_0]], %[[VAL_29]] : i32 + // CHECK: %[[VAL_31:.*]] = llvm.and %[[LANE_ID]], %[[CST_2]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.icmp "eq" %[[VAL_31]], %[[CST_0]] : i32 + // CHECK: %[[VAL_33:.*]] = llvm.select %[[VAL_32]], %[[CST_0]], %[[CST_4]] : i1, i32 + // CHECK: %[[VAL_34:.*]] = llvm.xor 
%[[VAL_30]], %[[VAL_33]] : i32 + // CHECK: %[[VAL_35:.*]] = llvm.and %[[LANE_ID]], %[[CST_4]] : i32 + // CHECK: %[[VAL_36:.*]] = llvm.icmp "eq" %[[VAL_35]], %[[CST_0]] : i32 + // CHECK: %[[VAL_37:.*]] = llvm.select %[[VAL_36]], %[[CST_0]], %[[CST_8]] : i1, i32 + // CHECK: %[[VAL_38:.*]] = llvm.xor %[[VAL_34]], %[[VAL_37]] : i32 + // CHECK: %[[VAL_39:.*]] = llvm.and %[[LANE_ID]], %[[CST_8]] : i32 + // CHECK: %[[VAL_40:.*]] = llvm.icmp "eq" %[[VAL_39]], %[[CST_0]] : i32 + // CHECK: %[[VAL_41:.*]] = llvm.select %[[VAL_40]], %[[CST_0]], %[[CST_16]] : i1, i32 + // CHECK: %[[VAL_42:.*]] = llvm.xor %[[VAL_38]], %[[VAL_41]] : i32 + // CHECK: %[[VAL_43:.*]] = llvm.and %[[WARP_ID]], %[[CST_2]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.icmp "eq" %[[VAL_43]], %[[CST_0]] : i32 + // CHECK: %[[VAL_45:.*]] = llvm.select %[[VAL_44]], %[[CST_0]], %[[CST_8]] : i1, i32 + // CHECK: %[[VAL_46:.*]] = llvm.xor %[[CST_0]], %[[VAL_45]] : i32 + // CHECK: %[[OFFSET_X_0:.*]] = llvm.xor %[[VAL_46]], %[[CST_0]] : i32 + // CHECK: %[[OFFSET_Y_0:.*]] = llvm.xor %[[VAL_42]], %[[CST_0]] : i32 + // CHECK: %[[OFFSET_Y_1:.*]] = llvm.xor %[[VAL_42]], %[[CST_1]] : i32 + // CHECK: %[[OFFSET_X_1:.*]] = llvm.xor %[[VAL_46]], %[[CST_1]] : i32 + // CHECK: %[[OFFSET_X_2:.*]] = llvm.xor %[[VAL_46]], %[[CST_2]] : i32 + // CHECK: %[[OFFSET_X_3:.*]] = llvm.xor %[[VAL_46]], %[[CST_3]] : i32 + // CHECK: %[[OFFSET_X_4:.*]] = llvm.xor %[[VAL_46]], %[[CST_4]] : i32 + // CHECK: %[[OFFSET_X_5:.*]] = llvm.xor %[[VAL_46]], %[[CST_5]] : i32 + // CHECK: %[[OFFSET_X_6:.*]] = llvm.xor %[[VAL_46]], %[[CST_6]] : i32 + // CHECK: %[[OFFSET_X_7:.*]] = llvm.xor %[[VAL_46]], %[[CST_7]] : i32 + // CHECK: %[[OFFSET_X_8:.*]] = llvm.xor %[[VAL_46]], %[[CST_16]] : i32 + // CHECK: %[[OFFSET_X_9:.*]] = llvm.xor %[[VAL_46]], %[[CST_17]] : i32 + // CHECK: %[[OFFSET_X_10:.*]] = llvm.xor %[[VAL_46]], %[[CST_18]] : i32 + // CHECK: %[[OFFSET_X_11:.*]] = llvm.xor %[[VAL_46]], %[[CST_19]] : i32 + // CHECK: %[[OFFSET_X_12:.*]] = llvm.xor %[[VAL_46]], 
%[[CST_20]] : i32 + // CHECK: %[[OFFSET_X_13:.*]] = llvm.xor %[[VAL_46]], %[[CST_21]] : i32 + // CHECK: %[[OFFSET_X_14:.*]] = llvm.xor %[[VAL_46]], %[[CST_22]] : i32 + // CHECK: %[[OFFSET_X_15:.*]] = llvm.xor %[[VAL_46]], %[[CST_23]] : i32 // CHECK: llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], %[[OFFSET_Y_0]], {{.*}}, {{.*}}) // CHECK: llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], %[[OFFSET_Y_1]], {{.*}}, {{.*}}) // CHECK: llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_1]], %[[OFFSET_Y_0]], {{.*}}, {{.*}}) @@ -172,14 +174,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 // CHECK-DAG: %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 // CHECK-DAG: %[[CST_5:.*]] = llvm.mlir.constant(5 : i32) : i32 // CHECK-DAG: %[[CST_6:.*]] = llvm.mlir.constant(6 : i32) : i32 // CHECK-DAG: %[[CST_7:.*]] = llvm.mlir.constant(7 : i32) : i32 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 // CHECK-DAG: %[[CST_17:.*]] = llvm.mlir.constant(17 : i32) : i32 // CHECK-DAG: %[[CST_18:.*]] = llvm.mlir.constant(18 : i32) : i32 // CHECK-DAG: %[[CST_19:.*]] = llvm.mlir.constant(19 : i32) : i32 @@ -190,34 +191,26 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[THREADS_ID:.*]] = llvm.call spir_funccc 
@_Z12get_local_idj(%[[CST_0]]) // CHECK: %[[THREADS_ID_32:.*]] = llvm.trunc %[[THREADS_ID]] : i64 to i32 // CHECK: %[[WARP_ID:.*]] = llvm.udiv %[[THREADS_ID_32]], %[[CST_16]] : i32 - // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREADS_ID_32]], %[[CST_16]] : i32 - // CHECK: %[[VAL_29:.*]] = llvm.udiv %[[WARP_ID]], %[[CST_2]] : i32 - // CHECK: %[[WARP_ID_X:.*]] = llvm.urem %[[VAL_29]], %[[CST_2]] : i32 - // CHECK: %[[ROUNDED_WARP_ID_X:.*]] = llvm.urem %[[WARP_ID_X]], %[[CST_4]] : i32 - // CHECK: %[[WARP_OFFSET_X:.*]] = llvm.mul %[[ROUNDED_WARP_ID_X]], %[[CST_8]] : i32 - // CHECK: %[[LANE_OFFSET_X:.*]] = llvm.udiv %[[LANE_ID]], %[[CST_16]] : i32 - // CHECK: %[[OFFSET_X:.*]] = llvm.add %[[LANE_OFFSET_X]], %[[WARP_OFFSET_X]] : i32 - // CHECK: %[[VAL_35:.*]] = llvm.udiv %[[CST_0]], %[[CST_1]] : i32 - // CHECK: %[[VAL_36:.*]] = llvm.urem %[[VAL_35]], %[[CST_1]] : i32 - // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_1]] : i32 - // CHECK: %[[CTA_OFFSET_X:.*]] = llvm.mul %[[VAL_37]], %[[CST_32]] : i32 - // CHECK: %[[VAL_39:.*]] = llvm.add %[[OFFSET_X]], %[[CTA_OFFSET_X]] : i32 - // CHECK: %[[OFFSET_X_0:.*]] = llvm.add %[[VAL_39]], %[[CST_0]] : i32 - // CHECK: %[[OFFSET_X_1:.*]] = llvm.add %[[VAL_39]], %[[CST_1]] : i32 - // CHECK: %[[OFFSET_X_2:.*]] = llvm.add %[[VAL_39]], %[[CST_2]] : i32 - // CHECK: %[[OFFSET_X_3:.*]] = llvm.add %[[VAL_39]], %[[CST_3]] : i32 - // CHECK: %[[OFFSET_X_4:.*]] = llvm.add %[[VAL_39]], %[[CST_4]] : i32 - // CHECK: %[[OFFSET_X_5:.*]] = llvm.add %[[VAL_39]], %[[CST_5]] : i32 - // CHECK: %[[OFFSET_X_6:.*]] = llvm.add %[[VAL_39]], %[[CST_6]] : i32 - // CHECK: %[[OFFSET_X_7:.*]] = llvm.add %[[VAL_39]], %[[CST_7]] : i32 - // CHECK: %[[OFFSET_X_8:.*]] = llvm.add %[[VAL_39]], %[[CST_16]] : i32 - // CHECK: %[[OFFSET_X_9:.*]] = llvm.add %[[VAL_39]], %[[CST_17]] : i32 - // CHECK: %[[OFFSET_X_10:.*]] = llvm.add %[[VAL_39]], %[[CST_18]] : i32 - // CHECK: %[[OFFSET_X_11:.*]] = llvm.add %[[VAL_39]], %[[CST_19]] : i32 - // CHECK: %[[OFFSET_X_12:.*]] = 
llvm.add %[[VAL_39]], %[[CST_20]] : i32 - // CHECK: %[[OFFSET_X_13:.*]] = llvm.add %[[VAL_39]], %[[CST_21]] : i32 - // CHECK: %[[OFFSET_X_14:.*]] = llvm.add %[[VAL_39]], %[[CST_22]] : i32 - // CHECK: %[[OFFSET_X_15:.*]] = llvm.add %[[VAL_39]], %[[CST_23]] : i32 + // CHECK: %[[VAL_26:.*]] = llvm.and %[[WARP_ID]], %[[CST_2]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.icmp "eq" %[[VAL_26]], %[[CST_0]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.select %[[VAL_27]], %[[CST_0]], %[[CST_8]] : i1, i32 + // CHECK: %[[VAL_29:.*]] = llvm.xor %[[CST_0]], %[[VAL_28]] : i32 + // CHECK: %[[OFFSET_X_0:.*]] = llvm.xor %[[VAL_29]], %[[CST_0]] : i32 + // CHECK: %[[OFFSET_X_1:.*]] = llvm.xor %[[VAL_29]], %[[CST_1]] : i32 + // CHECK: %[[OFFSET_X_2:.*]] = llvm.xor %[[VAL_29]], %[[CST_2]] : i32 + // CHECK: %[[OFFSET_X_3:.*]] = llvm.xor %[[VAL_29]], %[[CST_3]] : i32 + // CHECK: %[[OFFSET_X_4:.*]] = llvm.xor %[[VAL_29]], %[[CST_4]] : i32 + // CHECK: %[[OFFSET_X_5:.*]] = llvm.xor %[[VAL_29]], %[[CST_5]] : i32 + // CHECK: %[[OFFSET_X_6:.*]] = llvm.xor %[[VAL_29]], %[[CST_6]] : i32 + // CHECK: %[[OFFSET_X_7:.*]] = llvm.xor %[[VAL_29]], %[[CST_7]] : i32 + // CHECK: %[[OFFSET_X_8:.*]] = llvm.xor %[[VAL_29]], %[[CST_16]] : i32 + // CHECK: %[[OFFSET_X_9:.*]] = llvm.xor %[[VAL_29]], %[[CST_17]] : i32 + // CHECK: %[[OFFSET_X_10:.*]] = llvm.xor %[[VAL_29]], %[[CST_18]] : i32 + // CHECK: %[[OFFSET_X_11:.*]] = llvm.xor %[[VAL_29]], %[[CST_19]] : i32 + // CHECK: %[[OFFSET_X_12:.*]] = llvm.xor %[[VAL_29]], %[[CST_20]] : i32 + // CHECK: %[[OFFSET_X_13:.*]] = llvm.xor %[[VAL_29]], %[[CST_21]] : i32 + // CHECK: %[[OFFSET_X_14:.*]] = llvm.xor %[[VAL_29]], %[[CST_22]] : i32 + // CHECK: %[[OFFSET_X_15:.*]] = llvm.xor %[[VAL_29]], %[[CST_23]] : i32 // CHECK: %[[VAL_56:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], {{.*}}, {{.*}}) // CHECK: %[[VAL_57:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_1]], {{.*}}, {{.*}}) // CHECK: 
%[[VAL_58:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_2]], {{.*}}, {{.*}}) diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp index 4ee77e934d..6b902003fb 100644 --- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp +++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp @@ -341,38 +341,44 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout, } // anonymous namespace +// clang-format off // The layout example repeat_count=8, systolic_depth=8, // execution_size=16 and operands_per_chan=2 for warp size 32. // For A operand: -// systolic depth = 8 -//<-----------------------------------------------------> -// opsPerChan=2 -//<---------> -// t0 ... t0 t1 ... t1 ~ t6 ... t6 t7 ... t7 ^ -// t8 ... t8 t9 ... t9 ~ t14 ... t14 t15 ... t15 | -// t16 ... t16 t17 ... t17 ~ t22 ... t22 t23 ... t23 | -// t24 ... t24 t25 ... t25 ~ t30 ... t30 t31 ... t31 | repeat count <= 8 -// t0 ... t0 t1 ... t1 ~ t6 ... t6 t7 ... t7 | -// t8 ... t8 t9 ... t9 ~ t14 ... t14 t15 ... t15 | -// t16 ... t16 t17 ... t17 ~ t22 ... t22 t23 ... t23 | -// t24 ... t24 t25 ... t25 ~ t30 ... t30 t31 ... 
t31 v +// K = 16 (K = systolic depth * opsPerChan) +// <----------------------------------------------------------------------------> +// t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 ^ +// t16 t17 t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 | +// t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 | +// t16 t17 t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 | +// t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 | M = 8 (repeat count) +// t16 t17 t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 | +// t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 | +// t16 t17 t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 v // In this case, the LinearLayout bases are: -// Register: {{0,1}, {4,0}} -// Lane: {{0,2}, {0,4}, {0,8}, {1,0}, {2,0}} +// Register: {{2,0}, {4,0}} +// Lane: {{0,1}, {0,2}, {0,4}, {0,8}, {1,0}} +// clang-format on std::vector> DPASRegBasesA(int opsPerChannel, int repeatCount, int threadsPerWarp, int systolicDepth) { - int rowPerWarp = threadsPerWarp / systolicDepth; - int warpRepeats = repeatCount / rowPerWarp; std::vector> regBases; - for (int opc = 1; opc < opsPerChannel; opc *= 2) { + // pack the value to i16 for scalar bit width <=16. + assert((opsPerChannel == 4 || opsPerChannel == 2 || opsPerChannel == 1) && + "invalid opsPerChannel number."); + int packedOpsPerLane = opsPerChannel == 4 ? 
2 : 1; + int packedColNum = (systolicDepth * opsPerChannel) / packedOpsPerLane; + int rowsPerWarp = mlir::ceil(threadsPerWarp, packedColNum); + int warpRepeats = repeatCount / rowsPerWarp; + + for (int opc = 1; opc < packedOpsPerLane; opc *= 2) { regBases.push_back({0, opc}); } for (int warp = 1; warp < warpRepeats; warp *= 2) { - regBases.push_back({warp * rowPerWarp, 0}); + regBases.push_back({warp * rowsPerWarp, 0}); } return regBases; @@ -382,11 +388,17 @@ std::vector> DPASLaneBasesA(int opsPerChannel, int threadsPerWarp, int systolicDepth) { std::vector> laneBases; - for (int tid = 1; tid < systolicDepth; tid *= 2) { - laneBases.push_back({0, opsPerChannel * tid}); + // pack the value to i16 for scalar bit width <=16. + assert((opsPerChannel == 4 || opsPerChannel == 2 || opsPerChannel == 1) && + "invalid opsPerChannel number."); + int packedOpsPerLane = opsPerChannel == 4 ? 2 : 1; + int packedColNum = (systolicDepth * opsPerChannel) / packedOpsPerLane; + + for (int tid = 1; tid < packedColNum; tid *= 2) { + laneBases.push_back({0, packedOpsPerLane * tid}); } - for (int tid = systolicDepth; tid < threadsPerWarp; tid *= 2) { - laneBases.push_back({tid / systolicDepth, 0}); + for (int tid = packedColNum; tid < threadsPerWarp; tid *= 2) { + laneBases.push_back({tid / packedColNum, 0}); } return laneBases; @@ -602,8 +614,7 @@ std::optional dotOperandDpasToLinearLayout(DotOperandEncodingAttr dotDpasLayout, ArrayRef shape) { auto dpasLayout = cast(dotDpasLayout.getParent()); - if (dotDpasLayout.getOpIdx() == 0) - return std::nullopt; + return DPAStoLinearLayout(shape, dpasLayout, dotDpasLayout.getOpIdx()); } diff --git a/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp b/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp index 6d42c9948a..d4f6d0b821 100644 --- a/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp +++ b/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp @@ -59,17 
+59,47 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_perInst) { }, {S("dim0"), S("dim1")})); // Test Operand A (opIdx=0) + EXPECT_EQ( + DPAStoLinearLayout({8, 32}, dpas({1, 1}, 8, 8, 16, 4, {1, 1}, 32), 0), + LinearLayout( + { + {S("register"), {{0, 1}, {2, 0}, {4, 0}}}, + {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {0, 16}, {1, 0}}}, + {S("warp"), {}}, + {S("block"), {}}, + }, + {S("dim0"), S("dim1")})); EXPECT_EQ( DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 2, {1, 1}, 32), 0), LinearLayout( { - {S("register"), {{0, 1}, {4, 0}}}, - {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}}, + {S("register"), {{2, 0}, {4, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}}, + {S("warp"), {}}, + {S("block"), {}}, + }, + {S("dim0"), S("dim1")})); + EXPECT_EQ( + DPAStoLinearLayout({8, 8}, dpas({1, 1}, 8, 8, 16, 1, {1, 1}, 32), 0), + LinearLayout( + { + {S("register"), {{4, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {1, 0}, {2, 0}}}, {S("warp"), {}}, {S("block"), {}}, }, {S("dim0"), S("dim1")})); // Test Operand B (opIdx=1) + EXPECT_EQ( + DPAStoLinearLayout({32, 16}, dpas({1, 1}, 8, 8, 16, 4, {1, 1}, 32), 1), + LinearLayout( + { + {S("register"), {{1, 0}, {2, 0}, {8, 0}, {16, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {4, 0}}}, + {S("warp"), {}}, + {S("block"), {}}, + }, + {S("dim0"), S("dim1")})); EXPECT_EQ( DPAStoLinearLayout({16, 16}, dpas({1, 1}, 8, 8, 16, 2, {1, 1}, 32), 1), LinearLayout( @@ -80,6 +110,16 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_perInst) { {S("block"), {}}, }, {S("dim0"), S("dim1")})); + EXPECT_EQ( + DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 1, {1, 1}, 32), 1), + LinearLayout( + { + {S("register"), {{2, 0}, {4, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}}, + {S("warp"), {}}, + {S("block"), {}}, + }, + {S("dim0"), S("dim1")})); } TEST_F(DPAStoLinearLayoutTest, DPAS_withRepCluster) { @@ -98,8 +138,8 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_withRepCluster) { DPAStoLinearLayout({32, 16}, dpas({1, 1}, 8, 8, 16, 2, {4, 
2}, 32), 0), LinearLayout( { - {S("register"), {{0, 1}, {4, 0}, {8, 0}, {16, 0}}}, - {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}}, + {S("register"), {{2, 0}, {4, 0}, {8, 0}, {16, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}}, {S("warp"), {}}, {S("block"), {}}, }, @@ -154,8 +194,8 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_withWarpOperandA) { LinearLayout( { {S("register"), - {{0, 1}, {4, 0}, {8, 0}, {16, 0}, {0, 16}, {0, 32}}}, - {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}}, + {{2, 0}, {4, 0}, {8, 0}, {16, 0}, {0, 16}, {0, 32}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}}, {S("warp"), {{0, 0}, {32, 0}}}, {S("block"), {}}, },