diff --git a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
index ee4534180b34..fa11745645ad 100644
--- a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
@@ -169,6 +169,19 @@ struct JoinOpConversion : public ConvertOpToLLVMPattern<JoinOp> {
     assert(lhsVals.size() == rhsVals.size());
     SmallVector<Value> joinedVals;
     joinedVals.resize(lhsVals.size() * 2);
+
+    // Specifically for packed upcasting from 4b to 16b dtypes,
+    // numContiguousValues cannot be too large, since the two outputs of
+    // inline_asm contain interleaved values. OTOH, if numContiguousValues * 16b
+    // < 32b, then we'll need to rearrange 16b values in 32b registers. Hence we
+    // set numContiguousValues to 2.
+    auto inlineOp =
+        dyn_cast<ElementwiseInlineAsmOp>(op.getLhs().getDefiningOp());
+    if (inlineOp && inlineOp.getPackedElement() == 4 &&
+        dstTy.getElementTypeBitWidth() == 16) {
+      numContiguousValues = 2;
+    }
+
     for (int i = 0; i < lhsVals.size(); i += numContiguousValues) {
       for (int j = 0; j < numContiguousValues; j++) {
         joinedVals[2 * i + j] = lhsVals[i + j];
diff --git a/python/src/llvm.cc b/python/src/llvm.cc
index 222ff3f8f9fc..f1d976ed5425 100644
--- a/python/src/llvm.cc
+++ b/python/src/llvm.cc
@@ -59,7 +59,7 @@ createTargetMachine(llvm::Module *module, std::string proc,
   opt.MCOptions.AsmVerbose = true;
   opt.MCOptions.PreserveAsmComments = true;
   std::unique_ptr<llvm::TargetMachine> machine{target->createTargetMachine(
-      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
+      module->getTargetTriple().str(), proc, features, opt, llvm::Reloc::PIC_,
       std::nullopt,
       disableLLVMOpt ? llvm::CodeGenOptLevel::None
                      : llvm::CodeGenOptLevel::Aggressive)};