@@ -419,6 +419,52 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
419419 }
420420};
421421
422+ // TODO: AMDGPU backend already have all this bitpacking logic, we should move
423+ // it to some common place.
424+ static FailureOr<unsigned > encodeWaitcnt (Chipset chipset, unsigned vmcnt,
425+ unsigned expcnt, unsigned lgkmcnt) {
426+ if (chipset.majorVersion == 9 ) {
427+ vmcnt = std::min (63u , vmcnt);
428+ expcnt = std::min (7u , expcnt);
429+ lgkmcnt = std::min (15u , lgkmcnt);
430+ unsigned lowBits = vmcnt & 0xF ;
431+ unsigned highBits = (vmcnt >> 4 ) << 14 ;
432+ unsigned otherCnts = (expcnt << 4 ) | (lgkmcnt << 8 );
433+ return lowBits | highBits | otherCnts;
434+ }
435+ return failure ();
436+ }
437+
438+ struct WaitcntOpLowering : public ConvertOpToLLVMPattern <WaitcntOp> {
439+ WaitcntOpLowering (const LLVMTypeConverter &converter, Chipset chipset)
440+ : ConvertOpToLLVMPattern<WaitcntOp>(converter), chipset(chipset) {}
441+
442+ Chipset chipset;
443+
444+ LogicalResult
445+ matchAndRewrite (WaitcntOp op, OpAdaptor adaptor,
446+ ConversionPatternRewriter &rewriter) const override {
447+ auto getVal = [](Attribute attr) -> unsigned {
448+ if (attr)
449+ return cast<IntegerAttr>(attr).getInt ();
450+
451+ // This value will be clamped to the maximum value for the chipset.
452+ return 1024 * 1024 ;
453+ };
454+ unsigned vmcnt = getVal (adaptor.getVmcntAttr ());
455+ unsigned expcnt = getVal (adaptor.getExpcntAttr ());
456+ unsigned lgkmcnt = getVal (adaptor.getLgkmcntAttr ());
457+
458+ FailureOr<unsigned > waitcnt =
459+ encodeWaitcnt (chipset, vmcnt, expcnt, lgkmcnt);
460+ if (failed (waitcnt))
461+ return op.emitOpError (" unsupported chipset" );
462+
463+ rewriter.replaceOpWithNewOp <ROCDL::SWaitcntOp>(op, *waitcnt);
464+ return success ();
465+ }
466+ };
467+
422468struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern <LDSBarrierOp> {
423469 LDSBarrierOpLowering (const LLVMTypeConverter &converter, Chipset chipset)
424470 : ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
@@ -1825,9 +1871,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
18251871 ROCDL::RawPtrBufferAtomicUminOp>,
18261872 RawBufferOpLowering<RawBufferAtomicCmpswapOp,
18271873 ROCDL::RawPtrBufferAtomicCmpSwap>,
1828- AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering ,
1829- MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering ,
1830- ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
1874+ AMDGPUDPPLowering, WaitcntOpLowering, LDSBarrierOpLowering ,
1875+ SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering ,
1876+ WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
18311877 PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
18321878 PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
18331879 TransposeLoadOpLowering>(converter, chipset);
0 commit comments