@@ -768,7 +768,11 @@ struct AtomicRMWOpConversion
768768 // tensor
769769 if (tensorTy) {
770770 auto valTy = cast<RankedTensorType>(val.getType ());
771- vec = std::min<unsigned >(vec, valTy.getElementType ().isF16 () ? 2 : 1 );
771+ Type elTy = valTy.getElementType ();
772+ vec = std::min<unsigned >(vec, llvm::isa<FloatType>(elTy) &&
773+ elTy.getIntOrFloatBitWidth () == 16
774+ ? 2
775+ : 1 );
772776 // mask
773777 numElems = tensorTy.getNumElements ();
774778 }
@@ -783,13 +787,22 @@ struct AtomicRMWOpConversion
783787 auto vecTy = vec_ty (valueElemTy, vec);
784788 auto retType = vec == 1 ? valueElemTy : vecTy;
785789 SmallVector<Value> resultVals (elemsPerThread);
786- const bool f16v2 = vec == 2 && valueElemTy.isF16 ();
787790 for (size_t i = 0 ; i < elemsPerThread; i += vec) {
788791 Value rmwPtr = ptrElements[i];
789792 // TODO: in case llMask is zero we can create only one branch for all
790793 // elemsPerThread.
791794 Value rmwMask = llMask ? and_ (mask, maskElements[i]) : mask;
792795
796+ Value operand;
797+ if (vec == 1 ) {
798+ operand = valElements[i];
799+ } else {
800+ operand = undef (vecTy);
801+ for (size_t ii = 0 ; ii < vec; ++ii)
802+ operand =
803+ insert_element (vecTy, operand, valElements[i + ii], i32_val (ii));
804+ }
805+
793806 Value undefVal = undef (retType);
794807 // Build blocks to bypass the atomic instruction for ~rmwMask.
795808 auto *curBlock = rewriter.getInsertionBlock ();
@@ -806,25 +819,11 @@ struct AtomicRMWOpConversion
806819 auto maybeKind = matchAtomicOp (atomicRmwAttr);
807820 // TODO: use rocdl.raw.buffer.atomic from ROCDL dialect to use efficient
808821 // atomics for MI-* series of AMD GPU.
809- Value atom = rewriter
810- .create <LLVM::AtomicRMWOp>(
811- loc, *maybeKind, rmwPtr, valElements[i],
812- atomicMemOrdering, StringRef (" agent" ))
813- .getResult ();
814-
815- // NV for the f16v2 case generates one packed instruction. We have to
816- // create two separate instructions since LLVM::AtomicRMWOp doesn't
817- // support this. Can be optimized out with rocdl.raw.buffer.atomic.
818- if (f16v2) {
819- Value atom2 =
820- rewriter
821- .create <LLVM::AtomicRMWOp>(
822- loc, *maybeKind, ptrElements[i + 1 ], valElements[i + 1 ],
823- atomicMemOrdering, StringRef (" agent" ))
824- .getResult ();
825- auto tmp = insert_element (vecTy, undef (vecTy), atom, i32_val (0 ));
826- atom = insert_element (vecTy, tmp, atom2, i32_val (1 )).getResult ();
827- }
822+ Value atom =
823+ rewriter
824+ .create <LLVM::AtomicRMWOp>(loc, *maybeKind, rmwPtr, operand,
825+ atomicMemOrdering, StringRef (" agent" ))
826+ .getResult ();
828827 if (!tensorTy) {
829828 if (atomicNeedsSharedMemory (op.getResult ())) {
830829 Value atomPtr =
0 commit comments